aboutsummaryrefslogtreecommitdiffstats
path: root/alc
diff options
context:
space:
mode:
authorChris Robinson <[email protected]>2020-09-09 02:23:38 -0700
committerChris Robinson <[email protected]>2020-09-09 02:23:38 -0700
commit68a099ba35b9cf1fe0dd18e240abd48662ce75f3 (patch)
tree7333d3710ddc43542798081172fe475d5f6967d4 /alc
parentf4a55cc8c2e8093e1156686292a38ae10a738a08 (diff)
Apply the first convolution segment in the time domain
This avoids an inherent delay from the effect, at the cost of higher CPU use. Having a customizable user-specified delay (with said user ensuring a properly trimmed impulse response) could help alleviate the cost since once the delay exceeds the segment size, the initial FIR filter could be skipped.
Diffstat (limited to 'alc')
-rw-r--r--alc/effects/convolution.cpp108
1 files changed, 86 insertions, 22 deletions
diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp
index bbf8cb3f..bb8e31d1 100644
--- a/alc/effects/convolution.cpp
+++ b/alc/effects/convolution.cpp
@@ -1,6 +1,10 @@
#include "config.h"
+#ifdef HAVE_SSE_INTRINSICS
+#include <xmmintrin.h>
+#endif
+
#include "AL/al.h"
#include "AL/alc.h"
@@ -46,12 +50,10 @@ namespace {
* convolution being applied in the frequency domain, so these "overflow"
* samples need to be accounted for.
*
- * Limitations:
- * There is currently a 512-sample delay on the output, as a result of needing
- * to collect that many input samples to do an FFT with. This can be fixed by
- * excluding the first impulse response segment from being FFT'd, and applying
- * it directly in the time domain. This will have higher CPU consumption, but
- * it won't have to wait before generating output.
+ * To avoid a delay with gathering enough input samples to apply an FFT with,
+ * the first segment is applied directly in the time-domain as the samples come
+ * in. Once enough have been retrieved, the FFT is applied on the input and
+ * it's paired with the remaining (FFT'd) filter segments for processing.
*/
@@ -98,6 +100,39 @@ constexpr size_t ConvolveUpdateSize{1024};
constexpr size_t ConvolveUpdateSamples{ConvolveUpdateSize / 2};
+void apply_fir(al::span<float> dst, const float *RESTRICT src, const float *RESTRICT filter)
+{
+#ifdef HAVE_SSE_INTRINSICS
+ for(float &output : dst)
+ {
+ __m128 r4{_mm_setzero_ps()};
+ for(size_t j{0};j < ConvolveUpdateSamples;j+=4)
+ {
+ const __m128 coeffs{_mm_load_ps(&filter[j])};
+ const __m128 s{_mm_loadu_ps(&src[j])};
+
+ r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
+ }
+ r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+ r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+ output = _mm_cvtss_f32(r4);
+
+ ++src;
+ }
+
+#else
+
+ for(float &output : dst)
+ {
+ float ret{0.0f};
+ for(size_t j{0};j < ConvolveUpdateSamples;++j)
+ ret += src[j] * filter[j];
+ output = ret;
+ ++src;
+ }
+#endif
+}
+
struct ConvolutionState final : public EffectState {
FmtChannels mChannels{};
AmbiLayout mAmbiLayout{};
@@ -105,7 +140,10 @@ struct ConvolutionState final : public EffectState {
ALuint mAmbiOrder{};
size_t mFifoPos{0};
- al::vector<std::array<double,ConvolveUpdateSamples*2>,16> mOutput;
+ std::array<float,ConvolveUpdateSamples*2> mInput{};
+ al::vector<std::array<float,ConvolveUpdateSamples>,16> mFilter;
+ al::vector<std::array<float,ConvolveUpdateSamples*2>,16> mOutput;
+
alignas(16) std::array<complex_d,ConvolveUpdateSize> mFftBuffer{};
size_t mCurrentSegment{0};
@@ -166,6 +204,8 @@ void ConvolutionState::deviceUpdate(const ALCdevice* /*device*/)
void ConvolutionState::setBuffer(const ALCdevice *device, const BufferStorage *buffer)
{
mFifoPos = 0;
+ mInput.fill(0.0f);
+ decltype(mFilter){}.swap(mFilter);
decltype(mOutput){}.swap(mOutput);
mFftBuffer.fill(complex_d{});
@@ -210,12 +250,16 @@ void ConvolutionState::setBuffer(const ALCdevice *device, const BufferStorage *b
for(auto &e : *mChans)
e.mFilter = splitter;
+ mFilter.resize(numChannels, {});
mOutput.resize(numChannels, {});
/* Calculate the number of segments needed to hold the impulse response and
- * the input history (rounded up), and allocate them.
+ * the input history (rounded up), and allocate them. Exclude one segment
+ * which gets applied as a time-domain FIR filter. Make sure at least one
+ * segment is allocated to simplify handling.
*/
mNumConvolveSegs = (resampledCount+(ConvolveUpdateSamples-1)) / ConvolveUpdateSamples;
+ mNumConvolveSegs = maxz(mNumConvolveSegs, 2) - 1;
const size_t complex_length{mNumConvolveSegs * m * (numChannels+1)};
mComplexData = std::make_unique<complex_d[]>(complex_length);
@@ -238,7 +282,14 @@ void ConvolutionState::setBuffer(const ALCdevice *device, const BufferStorage *b
resampler.process(buffer->mSampleLen, srcsamples.get(), resampledCount,
srcsamples.get());
- size_t done{0};
+ /* Store the first segment's samples in reverse in the time-domain, to
+ * apply as a FIR filter.
+ */
+ const size_t first_size{minz(resampledCount, ConvolveUpdateSamples)};
+ std::transform(srcsamples.get(), srcsamples.get()+first_size, mFilter[c].rbegin(),
+ [](const double d) noexcept -> float { return static_cast<float>(d); });
+
+ size_t done{first_size};
for(size_t s{0};s < mNumConvolveSegs;++s)
{
const size_t todo{minz(resampledCount-done, ConvolveUpdateSamples)};
@@ -263,10 +314,7 @@ void ConvolutionState::update(const ALCcontext *context, const ALeffectslot *slo
ALCdevice *device{context->mDevice.get()};
mMix = &ConvolutionState::NormalMix;
- /* The iFFT'd response is scaled up by the number of bins, so apply the
- * inverse to the output mixing gain.
- */
- const float gain{slot->Params.Gain * (1.0f/float{ConvolveUpdateSize})};
+ const float gain{slot->Params.Gain};
auto &chans = *mChans;
if(mChannels == FmtBFormat3D || mChannels == FmtBFormat2D)
{
@@ -340,27 +388,36 @@ void ConvolutionState::process(const size_t samplesToDo,
{
const size_t todo{minz(ConvolveUpdateSamples-mFifoPos, samplesToDo-base)};
- /* Retrieve the output samples from the FIFO and fill in the new input
- * samples.
+ std::copy_n(samplesIn[0].begin() + base, todo,
+ mInput.begin()+ConvolveUpdateSamples+mFifoPos);
+
+ /* Apply the FIR for the newly retrieved input samples, and combine it
+ * with the inverse FFT'd output samples.
*/
for(size_t c{0};c < chans.size();++c)
{
+ auto buf_iter = chans[c].mBuffer.begin() + base;
+ apply_fir({std::addressof(*buf_iter), todo}, mInput.cbegin()+1 + mFifoPos,
+ mFilter[c].cbegin());
+
auto fifo_iter = mOutput[c].begin() + mFifoPos;
- std::transform(fifo_iter, fifo_iter+todo, chans[c].mBuffer.begin()+base,
- [](double d) noexcept -> float { return static_cast<float>(d); });
+ std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{});
}
- std::copy_n(samplesIn[0].begin()+base, todo, mFftBuffer.begin()+mFifoPos);
mFifoPos += todo;
base += todo;
- /* Check whether FIFO buffer is filled with new samples. */
+ /* Check whether the input buffer is filled with new samples. */
if(mFifoPos < ConvolveUpdateSamples) break;
mFifoPos = 0;
+ /* Move the newest input to the front for the next iteration's history. */
+ std::copy(mInput.cbegin()+ConvolveUpdateSamples, mInput.cend(), mInput.begin());
+
/* Calculate the frequency domain response and add the relevant
- * frequency bins to the input history.
+ * frequency bins to the FFT history.
*/
+ std::copy_n(mInput.cbegin(), ConvolveUpdateSamples, mFftBuffer.begin());
complex_fft(mFftBuffer, -1.0);
std::copy_n(mFftBuffer.begin(), m, &mComplexData[curseg*m]);
@@ -398,10 +455,17 @@ void ConvolutionState::process(const size_t samplesToDo,
*/
complex_fft(mFftBuffer, 1.0);
+ /* The iFFT'd response is scaled up by the number of bins, so apply
+ * the inverse to normalize the output.
+ */
for(size_t i{0};i < ConvolveUpdateSamples;++i)
- mOutput[c][i] = mFftBuffer[i].real() + mOutput[c][ConvolveUpdateSamples+i];
+ mOutput[c][i] =
+ static_cast<float>(mFftBuffer[i].real() * (1.0/double{ConvolveUpdateSize})) +
+ mOutput[c][ConvolveUpdateSamples+i];
for(size_t i{0};i < ConvolveUpdateSamples;++i)
- mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i].real();
+ mOutput[c][ConvolveUpdateSamples+i] =
+ static_cast<float>(mFftBuffer[ConvolveUpdateSamples+i].real() *
+ (1.0/double{ConvolveUpdateSize}));
mFftBuffer.fill(complex_d{});
}