aboutsummaryrefslogtreecommitdiffstats
path: root/common/pffft.cpp
diff options
context:
space:
mode:
authorChris Robinson <[email protected]>2023-10-11 15:56:11 -0700
committerChris Robinson <[email protected]>2023-10-11 15:56:11 -0700
commit5149cb8c357630dba5253e2568b68d2ed069bcea (patch)
tree7fbf96b5e694e71029441d9db89df083cfb1c8f5 /common/pffft.cpp
parentce25165944913c12b9b782e40691f3be1d18dadd (diff)
Make and use a separate zconvolve method without scaling
When you're doing hundreds or thousands of separate zconvolve calls into the same buffer, it's more efficient to do the multiply once at the end instead of in each call.
Diffstat (limited to 'common/pffft.cpp')
-rw-r--r--common/pffft.cpp79
1 files changed, 76 insertions, 3 deletions
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 7e5ba5c3..f8568acf 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -1904,7 +1904,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
}
}
-void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
float scaling)
{
const size_t Ncvec{s->Ncvec};
@@ -2006,6 +2006,59 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
}
}
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+ const size_t Ncvec{s->Ncvec};
+ const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
+ const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
+ v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
+
+#ifdef __arm__
+ __builtin_prefetch(va);
+ __builtin_prefetch(vb);
+ __builtin_prefetch(vab);
+ __builtin_prefetch(va+2);
+ __builtin_prefetch(vb+2);
+ __builtin_prefetch(vab+2);
+ __builtin_prefetch(va+4);
+ __builtin_prefetch(vb+4);
+ __builtin_prefetch(vab+4);
+ __builtin_prefetch(va+6);
+ __builtin_prefetch(vb+6);
+ __builtin_prefetch(vab+6);
+#endif
+
+ const float ar1{VEXTRACT0(va[0])};
+ const float ai1{VEXTRACT0(va[1])};
+ const float br1{VEXTRACT0(vb[0])};
+ const float bi1{VEXTRACT0(vb[1])};
+ const float abr1{VEXTRACT0(vab[0])};
+ const float abi1{VEXTRACT0(vab[1])};
+
+ /* No inline assembly for this version. I'm not familiar enough with NEON
+ * assembly, and I don't know that it's needed with today's optimizers.
+ */
+ for(size_t i{0};i < Ncvec;i += 2)
+ {
+ v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
+ v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
+ VCPLXMUL(ar4, ai4, br4, bi4);
+ vab[2*i+0] = VADD(ar4, vab[2*i+0]);
+ vab[2*i+1] = VADD(ai4, vab[2*i+1]);
+ ar4 = va[2*i+2]; ai4 = va[2*i+3];
+ br4 = vb[2*i+2]; bi4 = vb[2*i+3];
+ VCPLXMUL(ar4, ai4, br4, bi4);
+ vab[2*i+2] = VADD(ar4, vab[2*i+2]);
+ vab[2*i+3] = VADD(ai4, vab[2*i+3]);
+ }
+
+ if(s->transform == PFFFT_REAL)
+ {
+ vab[0] = VINSERT0(vab[0], abr1 + ar1*br1);
+ vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1);
+ }
+}
+
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
@@ -2115,8 +2168,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
}
}
-#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
float scaling)
{
size_t Ncvec{s->Ncvec};
@@ -2138,6 +2190,27 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo
}
}
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+ size_t Ncvec{s->Ncvec};
+
+ if(s->transform == PFFFT_REAL)
+ {
+ // take care of the fftpack ordering
+ ab[0] += a[0]*b[0];
+ ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
+ ++ab; ++a; ++b; --Ncvec;
+ }
+ for(size_t i{0};i < Ncvec;++i)
+ {
+ float ar{a[2*i+0]}, ai{a[2*i+1]};
+ const float br{b[2*i+0]}, bi{b[2*i+1]};
+ VCPLXMUL(ar, ai, br, bi);
+ ab[2*i+0] += ar;
+ ab[2*i+1] += ai;
+ }
+}
+
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{