-rw-r--r--  alc/effects/convolution.cpp  15
-rw-r--r--  common/pffft.cpp             79
-rw-r--r--  common/pffft.h               16
3 files changed, 96 insertions, 14 deletions
diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp
index c7a342dc..a98a0616 100644
--- a/alc/effects/convolution.cpp
+++ b/alc/effects/convolution.cpp
@@ -655,11 +655,6 @@ void ConvolutionState::process(const size_t samplesToDo,
     const float *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*ConvolveUpdateSize};
     for(size_t c{0};c < chans.size();++c)
     {
-        /* The iFFT'd response is scaled up by the number of bins, so apply
-         * the inverse to normalize the output.
-         */
-        static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}};
-
         /* Convolve each input segment with its IR filter counterpart
          * (aligned in time).
          */
@@ -667,14 +662,14 @@ void ConvolutionState::process(const size_t samplesToDo,
         const float *RESTRICT input{&mComplexData[curseg*ConvolveUpdateSize]};
         for(size_t s{curseg};s < mNumConvolveSegs;++s)
         {
-            pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale);
+            pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data());
             input += ConvolveUpdateSize;
             filter += ConvolveUpdateSize;
         }
         input = mComplexData.get();
         for(size_t s{0};s < curseg;++s)
         {
-            pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale);
+            pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data());
             input += ConvolveUpdateSize;
             filter += ConvolveUpdateSize;
         }
@@ -687,8 +682,12 @@ void ConvolutionState::process(const size_t samplesToDo,
         pffft_transform(mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
             mFftWorkBuffer.data(), PFFFT_BACKWARD);
 
+        /* The iFFT'd response is scaled up by the number of bins, so apply
+         * the inverse to normalize the output.
+         */
+        static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}};
         for(size_t i{0};i < ConvolveUpdateSamples;++i)
-            mOutput[c][i] = mFftBuffer[i] + mOutput[c][ConvolveUpdateSamples+i];
+            mOutput[c][i] = (mFftBuffer[i]+mOutput[c][ConvolveUpdateSamples+i]) * fftscale;
         for(size_t i{0};i < ConvolveUpdateSamples;++i)
             mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i];
     }
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 7e5ba5c3..f8568acf 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -1904,7 +1904,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
     }
 }
 
-void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
     float scaling)
 {
     const size_t Ncvec{s->Ncvec};
@@ -2006,6 +2006,59 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
     }
 }
 
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+    const size_t Ncvec{s->Ncvec};
+    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
+    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
+    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
+
+#ifdef __arm__
+    __builtin_prefetch(va);
+    __builtin_prefetch(vb);
+    __builtin_prefetch(vab);
+    __builtin_prefetch(va+2);
+    __builtin_prefetch(vb+2);
+    __builtin_prefetch(vab+2);
+    __builtin_prefetch(va+4);
+    __builtin_prefetch(vb+4);
+    __builtin_prefetch(vab+4);
+    __builtin_prefetch(va+6);
+    __builtin_prefetch(vb+6);
+    __builtin_prefetch(vab+6);
+#endif
+
+    const float ar1{VEXTRACT0(va[0])};
+    const float ai1{VEXTRACT0(va[1])};
+    const float br1{VEXTRACT0(vb[0])};
+    const float bi1{VEXTRACT0(vb[1])};
+    const float abr1{VEXTRACT0(vab[0])};
+    const float abi1{VEXTRACT0(vab[1])};
+
+    /* No inline assembly for this version. I'm not familiar enough with NEON
+     * assembly, and I don't know that it's needed with today's optimizers.
+     */
+    for(size_t i{0};i < Ncvec;i += 2)
+    {
+        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
+        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
+        VCPLXMUL(ar4, ai4, br4, bi4);
+        vab[2*i+0] = VADD(ar4, vab[2*i+0]);
+        vab[2*i+1] = VADD(ai4, vab[2*i+1]);
+        ar4 = va[2*i+2]; ai4 = va[2*i+3];
+        br4 = vb[2*i+2]; bi4 = vb[2*i+3];
+        VCPLXMUL(ar4, ai4, br4, bi4);
+        vab[2*i+2] = VADD(ar4, vab[2*i+2]);
+        vab[2*i+3] = VADD(ai4, vab[2*i+3]);
+    }
+
+    if(s->transform == PFFFT_REAL)
+    {
+        vab[0] = VINSERT0(vab[0], abr1 + ar1*br1);
+        vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1);
+    }
+}
+
 void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work,
     pffft_direction_t direction)
 {
@@ -2115,8 +2168,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
     }
 }
 
-#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
     float scaling)
 {
     size_t Ncvec{s->Ncvec};
@@ -2138,6 +2190,27 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo
     }
 }
 
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+    size_t Ncvec{s->Ncvec};
+
+    if(s->transform == PFFFT_REAL)
+    {
+        // take care of the fftpack ordering
+        ab[0] += a[0]*b[0];
+        ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
+        ++ab; ++a; ++b; --Ncvec;
+    }
+    for(size_t i{0};i < Ncvec;++i)
+    {
+        float ar{a[2*i+0]}, ai{a[2*i+1]};
+        const float br{b[2*i+0]}, bi{b[2*i+1]};
+        VCPLXMUL(ar, ai, br, bi);
+        ab[2*i+0] += ar;
+        ab[2*i+1] += ai;
+    }
+}
+
 void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work,
     pffft_direction_t direction)
 {
diff --git a/common/pffft.h b/common/pffft.h
index 3b402ca4..4bc3ebb6 100644
--- a/common/pffft.h
+++ b/common/pffft.h
@@ -152,8 +152,8 @@ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *outp
 void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
 
 /**
- * Perform a multiplication of the z-domain data in dft_a and dft_b and
- * accumulate them into dft_ab. The arrays should have been obtained with
+ * Perform a multiplication of the z-domain data in dft_a and dft_b, and scale
+ * and accumulate into dft_ab. The arrays should have been obtained with
  * pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD)
  * and should *not* be in the usual order (otherwise just perform the operation
  * yourself as the dft coeffs are stored as interleaved complex numbers).
@@ -162,7 +162,17 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft
  *
  * The dft_a, dft_b, and dft_ab parameters may alias.
  */
-void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+/**
+ * Perform a multiplication of the z-domain data in dft_a and dft_b, and
+ * accumulate into dft_ab.
+ *
+ * The operation performed is: dft_ab += dft_a * dft_b
+ *
+ * The dft_a, dft_b, and dft_ab parameters may alias.
+ */
+void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab);
 
 /**
  * The float buffers must have the correct alignment (16-byte boundary on intel
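
For illustration only, not part of the commit: a minimal sketch of how a caller might use the new non-scaling pffft_zconvolve_accumulate together with the deferred 1/N normalization, the same pattern the convolution effect switches to above. The helper name convolve_block and its parameters are hypothetical; the pffft calls are the ones declared in common/pffft.h.

#include <cstddef>
#include "pffft.h"

/* Hypothetical helper: accumulate one frequency-domain product and bring the
 * result back to the time domain. sig_dft and ir_dft are forward transforms
 * in pffft's internal order; acc_dft, time_out, and work are fftsize-float
 * buffers with pffft's required alignment.
 */
void convolve_block(PFFFT_Setup *setup, const float *sig_dft, const float *ir_dft,
    float *acc_dft, float *time_out, float *work, const size_t fftsize)
{
    /* dft_ab += dft_a * dft_b, with no per-product scaling applied. */
    pffft_zconvolve_accumulate(setup, sig_dft, ir_dft, acc_dft);

    /* The backward transform is unnormalized (scaled up by the bin count). */
    pffft_transform(setup, acc_dft, time_out, work, PFFFT_BACKWARD);

    /* Apply the 1/N normalization once on the output, as the convolution
     * effect now does, instead of scaling every accumulated product.
     */
    const float scale{1.0f / static_cast<float>(fftsize)};
    for(size_t i{0};i < fftsize;++i)
        time_out[i] *= scale;
}

Deferring the scale to the output keeps the per-segment accumulation loop to a pure multiply-add, which is why the scaling parameter could be dropped from the hot path.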