about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--alc/effects/convolution.cpp15
-rw-r--r--common/pffft.cpp79
-rw-r--r--common/pffft.h16
3 files changed, 96 insertions, 14 deletions
diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp
index c7a342dc..a98a0616 100644
--- a/alc/effects/convolution.cpp
+++ b/alc/effects/convolution.cpp
@@ -655,11 +655,6 @@ void ConvolutionState::process(const size_t samplesToDo,
const float *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*ConvolveUpdateSize};
for(size_t c{0};c < chans.size();++c)
{
- /* The iFFT'd response is scaled up by the number of bins, so apply
- * the inverse to normalize the output.
- */
- static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}};
-
/* Convolve each input segment with its IR filter counterpart
* (aligned in time).
*/
@@ -667,14 +662,14 @@ void ConvolutionState::process(const size_t samplesToDo,
const float *RESTRICT input{&mComplexData[curseg*ConvolveUpdateSize]};
for(size_t s{curseg};s < mNumConvolveSegs;++s)
{
- pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale);
+ pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data());
input += ConvolveUpdateSize;
filter += ConvolveUpdateSize;
}
input = mComplexData.get();
for(size_t s{0};s < curseg;++s)
{
- pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale);
+ pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data());
input += ConvolveUpdateSize;
filter += ConvolveUpdateSize;
}
@@ -687,8 +682,12 @@ void ConvolutionState::process(const size_t samplesToDo,
pffft_transform(mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
mFftWorkBuffer.data(), PFFFT_BACKWARD);
+ /* The iFFT'd response is scaled up by the number of bins, so apply
+ * the inverse to normalize the output.
+ */
+ static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}};
for(size_t i{0};i < ConvolveUpdateSamples;++i)
- mOutput[c][i] = mFftBuffer[i] + mOutput[c][ConvolveUpdateSamples+i];
+ mOutput[c][i] = (mFftBuffer[i]+mOutput[c][ConvolveUpdateSamples+i]) * fftscale;
for(size_t i{0};i < ConvolveUpdateSamples;++i)
mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i];
}
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 7e5ba5c3..f8568acf 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -1904,7 +1904,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
}
}
-void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
float scaling)
{
const size_t Ncvec{s->Ncvec};
@@ -2006,6 +2006,59 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
}
}
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+ const size_t Ncvec{s->Ncvec};
+ const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
+ const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
+ v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
+
+#ifdef __arm__
+ __builtin_prefetch(va);
+ __builtin_prefetch(vb);
+ __builtin_prefetch(vab);
+ __builtin_prefetch(va+2);
+ __builtin_prefetch(vb+2);
+ __builtin_prefetch(vab+2);
+ __builtin_prefetch(va+4);
+ __builtin_prefetch(vb+4);
+ __builtin_prefetch(vab+4);
+ __builtin_prefetch(va+6);
+ __builtin_prefetch(vb+6);
+ __builtin_prefetch(vab+6);
+#endif
+
+ const float ar1{VEXTRACT0(va[0])};
+ const float ai1{VEXTRACT0(va[1])};
+ const float br1{VEXTRACT0(vb[0])};
+ const float bi1{VEXTRACT0(vb[1])};
+ const float abr1{VEXTRACT0(vab[0])};
+ const float abi1{VEXTRACT0(vab[1])};
+
+ /* No inline assembly for this version. I'm not familiar enough with NEON
+ * assembly, and I don't know that it's needed with today's optimizers.
+ */
+ for(size_t i{0};i < Ncvec;i += 2)
+ {
+ v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
+ v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
+ VCPLXMUL(ar4, ai4, br4, bi4);
+ vab[2*i+0] = VADD(ar4, vab[2*i+0]);
+ vab[2*i+1] = VADD(ai4, vab[2*i+1]);
+ ar4 = va[2*i+2]; ai4 = va[2*i+3];
+ br4 = vb[2*i+2]; bi4 = vb[2*i+3];
+ VCPLXMUL(ar4, ai4, br4, bi4);
+ vab[2*i+2] = VADD(ar4, vab[2*i+2]);
+ vab[2*i+3] = VADD(ai4, vab[2*i+3]);
+ }
+
+ if(s->transform == PFFFT_REAL)
+ {
+ vab[0] = VINSERT0(vab[0], abr1 + ar1*br1);
+ vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1);
+ }
+}
+
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
@@ -2115,8 +2168,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
}
}
-#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
float scaling)
{
size_t Ncvec{s->Ncvec};
@@ -2138,6 +2190,27 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo
}
}
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+ size_t Ncvec{s->Ncvec};
+
+ if(s->transform == PFFFT_REAL)
+ {
+ // take care of the fftpack ordering
+ ab[0] += a[0]*b[0];
+ ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
+ ++ab; ++a; ++b; --Ncvec;
+ }
+ for(size_t i{0};i < Ncvec;++i)
+ {
+ float ar{a[2*i+0]}, ai{a[2*i+1]};
+ const float br{b[2*i+0]}, bi{b[2*i+1]};
+ VCPLXMUL(ar, ai, br, bi);
+ ab[2*i+0] += ar;
+ ab[2*i+1] += ai;
+ }
+}
+
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
diff --git a/common/pffft.h b/common/pffft.h
index 3b402ca4..4bc3ebb6 100644
--- a/common/pffft.h
+++ b/common/pffft.h
@@ -152,8 +152,8 @@ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *outp
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/**
- * Perform a multiplication of the z-domain data in dft_a and dft_b and
- * accumulate them into dft_ab. The arrays should have been obtained with
+ * Perform a multiplication of the z-domain data in dft_a and dft_b, and scale
+ * and accumulate into dft_ab. The arrays should have been obtained with
* pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD)
* and should *not* be in the usual order (otherwise just perform the operation
* yourself as the dft coeffs are stored as interleaved complex numbers).
@@ -162,7 +162,17 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft
*
* The dft_a, dft_b, and dft_ab parameters may alias.
*/
-void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+/**
+ * Perform a multiplication of the z-domain data in dft_a and dft_b, and
+ * accumulate into dft_ab.
+ *
+ * The operation performed is: dft_ab += dft_a * dft_b
+ *
+ * The dft_a, dft_b, and dft_ab parameters may alias.
+ */
+void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab);
/**
* The float buffers must have the correct alignment (16-byte boundary on intel