diff options
-rw-r--r-- | CMakeLists.txt | 3 | ||||
-rw-r--r-- | alc/alu.cpp | 11 | ||||
-rw-r--r-- | core/cubic_defs.h | 13 | ||||
-rw-r--r-- | core/cubic_tables.cpp | 59 | ||||
-rw-r--r-- | core/cubic_tables.h | 16 | ||||
-rw-r--r-- | core/mixer/defs.h | 9 | ||||
-rw-r--r-- | core/mixer/mixer_c.cpp | 35 | ||||
-rw-r--r-- | core/mixer/mixer_neon.cpp | 52 | ||||
-rw-r--r-- | core/mixer/mixer_sse.cpp | 51 |
9 files changed, 227 insertions, 22 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 379544fb..c37fe0b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -654,6 +654,9 @@ set(CORE_OBJS core/converter.h core/cpu_caps.cpp core/cpu_caps.h + core/cubic_defs.h + core/cubic_tables.cpp + core/cubic_tables.h core/devformat.cpp core/devformat.h core/device.cpp diff --git a/alc/alu.cpp b/alc/alu.cpp index 6e243412..45f518ee 100644 --- a/alc/alu.cpp +++ b/alc/alu.cpp @@ -55,6 +55,7 @@ #include "core/buffer_storage.h" #include "core/context.h" #include "core/cpu_caps.h" +#include "core/cubic_tables.h" #include "core/devformat.h" #include "core/device.h" #include "core/effects/base.h" @@ -211,6 +212,14 @@ inline ResamplerFunc SelectResampler(Resampler resampler, uint increment) #endif return Resample_<LerpTag,CTag>; case Resampler::Cubic: +#ifdef HAVE_NEON + if((CPUCapFlags&CPU_CAP_NEON)) + return Resample_<CubicTag,NEONTag>; +#endif +#ifdef HAVE_SSE + if((CPUCapFlags&CPU_CAP_SSE)) + return Resample_<CubicTag,SSETag>; +#endif return Resample_<CubicTag,CTag>; case Resampler::BSinc12: case Resampler::BSinc24: @@ -262,7 +271,9 @@ ResamplerFunc PrepareResampler(Resampler resampler, uint increment, InterpState { case Resampler::Point: case Resampler::Linear: + break; case Resampler::Cubic: + state->cubic.filter = gCubicSpline.Tab; break; case Resampler::FastBSinc12: case Resampler::BSinc12: diff --git a/core/cubic_defs.h b/core/cubic_defs.h new file mode 100644 index 00000000..33751c97 --- /dev/null +++ b/core/cubic_defs.h @@ -0,0 +1,13 @@ +#ifndef CORE_CUBIC_DEFS_H +#define CORE_CUBIC_DEFS_H + +/* The number of distinct phase intervals within the cubic filter tables. */ +constexpr unsigned int CubicPhaseBits{5}; +constexpr unsigned int CubicPhaseCount{1 << CubicPhaseBits}; + +struct CubicCoefficients { + float mCoeffs[4]; + float mDeltas[4]; +}; + +#endif /* CORE_CUBIC_DEFS_H */ diff --git a/core/cubic_tables.cpp b/core/cubic_tables.cpp new file mode 100644 index 00000000..bdcc0bae --- /dev/null +++ b/core/cubic_tables.cpp @@ -0,0 +1,59 @@ + +#include "cubic_tables.h" + +#include <algorithm> +#include <array> +#include <cassert> +#include <cmath> +#include <limits> +#include <memory> +#include <stdexcept> + +#include "alnumbers.h" +#include "core/mixer/defs.h" + + +namespace { + +using uint = unsigned int; + +struct SplineFilterArray { + alignas(16) CubicCoefficients mTable[CubicPhaseCount]{}; + + constexpr SplineFilterArray() + { + /* Fill in the main coefficients. */ + for(size_t pi{0};pi < CubicPhaseCount;++pi) + { + const double mu{pi / double{CubicPhaseCount}}; + const double mu2{mu*mu}, mu3{mu2*mu}; + mTable[pi].mCoeffs[0] = static_cast<float>(-0.5*mu3 + mu2 + -0.5*mu); + mTable[pi].mCoeffs[1] = static_cast<float>( 1.5*mu3 + -2.5*mu2 + 1.0); + mTable[pi].mCoeffs[2] = static_cast<float>(-1.5*mu3 + 2.0*mu2 + 0.5*mu); + mTable[pi].mCoeffs[3] = static_cast<float>( 0.5*mu3 + -0.5*mu2); + } + + /* Fill in the coefficient deltas. */ + for(size_t pi{0};pi < CubicPhaseCount-1;++pi) + { + mTable[pi].mDeltas[0] = mTable[pi+1].mCoeffs[0] - mTable[pi].mCoeffs[0]; + mTable[pi].mDeltas[1] = mTable[pi+1].mCoeffs[1] - mTable[pi].mCoeffs[1]; + mTable[pi].mDeltas[2] = mTable[pi+1].mCoeffs[2] - mTable[pi].mCoeffs[2]; + mTable[pi].mDeltas[3] = mTable[pi+1].mCoeffs[3] - mTable[pi].mCoeffs[3]; + } + + const size_t pi{CubicPhaseCount - 1}; + mTable[pi].mDeltas[0] = -mTable[pi].mCoeffs[0]; + mTable[pi].mDeltas[1] = -mTable[pi].mCoeffs[1]; + mTable[pi].mDeltas[2] = 1.0f - mTable[pi].mCoeffs[2]; + mTable[pi].mDeltas[3] = -mTable[pi].mCoeffs[3]; + } + + constexpr const CubicCoefficients *getTable() const noexcept { return mTable; } +}; + +constexpr SplineFilterArray SplineFilter{}; + +} // namespace + +const CubicTable gCubicSpline{SplineFilter.getTable()}; diff --git a/core/cubic_tables.h b/core/cubic_tables.h new file mode 100644 index 00000000..297aa89e --- /dev/null +++ b/core/cubic_tables.h @@ -0,0 +1,16 @@ +#ifndef CORE_CUBIC_TABLES_H +#define CORE_CUBIC_TABLES_H + +#include "cubic_defs.h" + + +struct CubicTable { + const CubicCoefficients *Tab; +}; + +/* A Catmull-Rom spline. The spline passes through the center two samples, + * ensuring no discontinuity while moving through a series of samples. + */ +extern const CubicTable gCubicSpline; + +#endif /* CORE_CUBIC_TABLES_H */ diff --git a/core/mixer/defs.h b/core/mixer/defs.h index 80d9fc7f..74a474fe 100644 --- a/core/mixer/defs.h +++ b/core/mixer/defs.h @@ -8,6 +8,7 @@ #include "core/bufferline.h" #include "core/resampler_limits.h" +struct CubicCoefficients; struct HrtfChannelState; struct HrtfFilter; struct MixHrtfFilter; @@ -51,7 +52,15 @@ struct BsincState { const float *filter; }; +struct CubicState { + /* Filter coefficients, and coefficient deltas. Starting at phase index 0, + * each subsequent phase index follows contiguously. + */ + const CubicCoefficients *filter; +}; + union InterpState { + CubicState cubic; BsincState bsinc; }; diff --git a/core/mixer/mixer_c.cpp b/core/mixer/mixer_c.cpp index 9ac2a9c4..24b9f0b7 100644 --- a/core/mixer/mixer_c.cpp +++ b/core/mixer/mixer_c.cpp @@ -5,7 +5,8 @@ #include <limits> #include "alnumeric.h" -#include "core/bsinc_tables.h" +#include "core/bsinc_defs.h" +#include "core/cubic_defs.h" #include "defs.h" #include "hrtfbase.h" @@ -20,23 +21,39 @@ struct FastBSincTag; namespace { -constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits}; -constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff}; +constexpr uint BsincPhaseBitDiff{MixerFracBits - BSincPhaseBits}; +constexpr uint BsincPhaseDiffOne{1 << BsincPhaseBitDiff}; + +constexpr uint CubicPhaseBitDiff{MixerFracBits - CubicPhaseBits}; +constexpr uint CubicPhaseDiffOne{1 << CubicPhaseBitDiff}; +constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u}; inline float do_point(const InterpState&, const float *RESTRICT vals, const uint) { return vals[0]; } inline float do_lerp(const InterpState&, const float *RESTRICT vals, const uint frac) { return lerpf(vals[0], vals[1], static_cast<float>(frac)*(1.0f/MixerFracOne)); } -inline float do_cubic(const InterpState&, const float *RESTRICT vals, const uint frac) -{ return cubic(vals[0], vals[1], vals[2], vals[3], static_cast<float>(frac)*(1.0f/MixerFracOne)); } +inline float do_cubic(const InterpState &istate, const float *RESTRICT vals, const uint frac) +{ + /* Calculate the phase index and factor. */ + const uint pi{frac >> CubicPhaseBitDiff}; + const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)}; + + const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(istate.cubic.filter + pi); + + // Apply the phase interpolated filter. + return (filter->mCoeffs[0] + pf*filter->mDeltas[0]) * vals[0] + + (filter->mCoeffs[1] + pf*filter->mDeltas[1]) * vals[1] + + (filter->mCoeffs[2] + pf*filter->mDeltas[2]) * vals[2] + + (filter->mCoeffs[3] + pf*filter->mDeltas[3]) * vals[3]; +} inline float do_bsinc(const InterpState &istate, const float *RESTRICT vals, const uint frac) { const size_t m{istate.bsinc.m}; ASSUME(m > 0); // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BsincPhaseBitDiff}; + const float pf{static_cast<float>(frac & (BsincPhaseDiffOne-1)) * (1.0f/BsincPhaseDiffOne)}; const float *RESTRICT fil{istate.bsinc.filter + m*pi*2}; const float *RESTRICT phd{fil + m}; @@ -55,8 +72,8 @@ inline float do_fastbsinc(const InterpState &istate, const float *RESTRICT vals, ASSUME(m > 0); // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BsincPhaseBitDiff}; + const float pf{static_cast<float>(frac & (BsincPhaseDiffOne-1)) * (1.0f/BsincPhaseDiffOne)}; const float *RESTRICT fil{istate.bsinc.filter + m*pi*2}; const float *RESTRICT phd{fil + m}; diff --git a/core/mixer/mixer_neon.cpp b/core/mixer/mixer_neon.cpp index 5a91da60..e636ac74 100644 --- a/core/mixer/mixer_neon.cpp +++ b/core/mixer/mixer_neon.cpp @@ -7,11 +7,13 @@ #include "alnumeric.h" #include "core/bsinc_defs.h" +#include "core/cubic_defs.h" #include "defs.h" #include "hrtfbase.h" struct NEONTag; struct LerpTag; +struct CubicTag; struct BSincTag; struct FastBSincTag; @@ -22,6 +24,14 @@ struct FastBSincTag; namespace { +constexpr uint BSincPhaseBitDiff{MixerFracBits - BSincPhaseBits}; +constexpr uint BSincPhaseDiffOne{1 << BSincPhaseBitDiff}; +constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u}; + +constexpr uint CubicPhaseBitDiff{MixerFracBits - CubicPhaseBits}; +constexpr uint CubicPhaseDiffOne{1 << CubicPhaseBitDiff}; +constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u}; + inline float32x4_t set_f4(float l0, float l1, float l2, float l3) { float32x4_t ret{vmovq_n_f32(l0)}; @@ -31,9 +41,6 @@ inline float32x4_t set_f4(float l0, float l1, float l2, float l3) return ret; } -constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits}; -constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff}; - inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs, const float left, const float right) { @@ -184,6 +191,37 @@ float *Resample_<LerpTag,NEONTag>(const InterpState*, float *RESTRICT src, uint } template<> +float *Resample_<CubicTag,NEONTag>(const InterpState *state, float *RESTRICT src, uint frac, + uint increment, const al::span<float> dst) +{ + const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter); + + src -= 1; + for(float &out_sample : dst) + { + const uint pi{frac >> CubicPhaseBitDiff}; + const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)}; + const float32x4_t pf4{vdupq_n_f32(pf)}; + + /* Apply the phase interpolated filter. */ + + /* f = fil + pf*phd */ + const float32x4_t f4 = vmlaq_f32(vld1q_f32(filter[pi].mCoeffs), pf4, + vld1q_f32(filter[pi].mDeltas)); + /* r = f*src */ + float32x4_t r4{vmulq_f32(f4, vld1q_f32(src))}; + + r4 = vaddq_f32(r4, vrev64q_f32(r4)); + out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0); + + frac += increment; + src += frac>>MixerFracBits; + frac &= MixerFracMask; + } + return dst.data(); +} + +template<> float *Resample_<BSincTag,NEONTag>(const InterpState *state, float *RESTRICT src, uint frac, uint increment, const al::span<float> dst) { @@ -196,8 +234,8 @@ float *Resample_<BSincTag,NEONTag>(const InterpState *state, float *RESTRICT src for(float &out_sample : dst) { // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BSincPhaseBitDiff}; + const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)}; // Apply the scale and phase interpolated filter. float32x4_t r4{vdupq_n_f32(0.0f)}; @@ -242,8 +280,8 @@ float *Resample_<FastBSincTag,NEONTag>(const InterpState *state, float *RESTRICT for(float &out_sample : dst) { // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BSincPhaseBitDiff}; + const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)}; // Apply the phase interpolated filter. float32x4_t r4{vdupq_n_f32(0.0f)}; diff --git a/core/mixer/mixer_sse.cpp b/core/mixer/mixer_sse.cpp index 1b0d1386..4a31a0f1 100644 --- a/core/mixer/mixer_sse.cpp +++ b/core/mixer/mixer_sse.cpp @@ -7,10 +7,12 @@ #include "alnumeric.h" #include "core/bsinc_defs.h" +#include "core/cubic_defs.h" #include "defs.h" #include "hrtfbase.h" struct SSETag; +struct CubicTag; struct BSincTag; struct FastBSincTag; @@ -21,8 +23,13 @@ struct FastBSincTag; namespace { -constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits}; -constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff}; +constexpr uint BSincPhaseBitDiff{MixerFracBits - BSincPhaseBits}; +constexpr uint BSincPhaseDiffOne{1 << BSincPhaseBitDiff}; +constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u}; + +constexpr uint CubicPhaseBitDiff{MixerFracBits - CubicPhaseBits}; +constexpr uint CubicPhaseDiffOne{1 << CubicPhaseBitDiff}; +constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u}; #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z)) @@ -147,6 +154,38 @@ force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT } // namespace template<> +float *Resample_<CubicTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac, + uint increment, const al::span<float> dst) +{ + const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter); + + src -= 1; + for(float &out_sample : dst) + { + const uint pi{frac >> CubicPhaseBitDiff}; + const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)}; + const __m128 pf4{_mm_set1_ps(pf)}; + + /* Apply the phase interpolated filter. */ + + /* f = fil + pf*phd */ + const __m128 f4 = MLA4(_mm_load_ps(filter[pi].mCoeffs), pf4, + _mm_load_ps(filter[pi].mDeltas)); + /* r = f*src */ + __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))}; + + r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); + r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + out_sample = _mm_cvtss_f32(r4); + + frac += increment; + src += frac>>MixerFracBits; + frac &= MixerFracMask; + } + return dst.data(); +} + +template<> float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac, uint increment, const al::span<float> dst) { @@ -159,8 +198,8 @@ float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, for(float &out_sample : dst) { // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BSincPhaseBitDiff}; + const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)}; // Apply the scale and phase interpolated filter. __m128 r4{_mm_setzero_ps()}; @@ -206,8 +245,8 @@ float *Resample_<FastBSincTag,SSETag>(const InterpState *state, float *RESTRICT for(float &out_sample : dst) { // Calculate the phase index and factor. - const uint pi{frac >> FracPhaseBitDiff}; - const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; + const uint pi{frac >> BSincPhaseBitDiff}; + const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)}; // Apply the phase interpolated filter. __m128 r4{_mm_setzero_ps()}; |