Diffstat (limited to 'common')
-rw-r--r-- | common/phase_shifter.h | 115 |
1 file changed, 41 insertions, 74 deletions
diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index 18ab34c7..ace92c9a 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -70,6 +70,47 @@ struct PhaseShifterT {
 
     void process(al::span<float> dst, const float *RESTRICT src) const;
     void processAccum(al::span<float> dst, const float *RESTRICT src) const;
+
+private:
+#if defined(HAVE_NEON)
+    /* There doesn't seem to be NEON intrinsics to do this kind of stipple
+     * shuffling, so there's two custom methods for it.
+     */
+    static auto shuffle_2020(float32x4_t a, float32x4_t b)
+    {
+        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
+        ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
+        ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
+        ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
+        return ret;
+    }
+    static auto shuffle_3131(float32x4_t a, float32x4_t b)
+    {
+        float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
+        ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
+        ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
+        ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
+        return ret;
+    }
+    static auto unpacklo(float32x4_t a, float32x4_t b)
+    {
+        float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
+        return vcombine_f32(result.val[0], result.val[1]);
+    }
+    static auto unpackhi(float32x4_t a, float32x4_t b)
+    {
+        float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
+        return vcombine_f32(result.val[0], result.val[1]);
+    }
+    static auto load4(float32_t a, float32_t b, float32_t c, float32_t d)
+    {
+        float32x4_t ret{vmovq_n_f32(a)};
+        ret = vsetq_lane_f32(b, ret, 1);
+        ret = vsetq_lane_f32(c, ret, 2);
+        ret = vsetq_lane_f32(d, ret, 3);
+        return ret;
+    }
+#endif
 };
 
 template<size_t S>
@@ -123,35 +164,6 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
     size_t pos{0};
     if(size_t todo{dst.size()>>1})
     {
-        /* There doesn't seem to be NEON intrinsics to do this kind of stipple
-         * shuffling, so there's two custom methods for it.
-         */
-        auto shuffle_2020 = [](float32x4_t a, float32x4_t b)
-        {
-            float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
-            ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
-            return ret;
-        };
-        auto shuffle_3131 = [](float32x4_t a, float32x4_t b)
-        {
-            float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
-            ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
-            return ret;
-        };
-        auto unpacklo = [](float32x4_t a, float32x4_t b)
-        {
-            float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
-            return vcombine_f32(result.val[0], result.val[1]);
-        };
-        auto unpackhi = [](float32x4_t a, float32x4_t b)
-        {
-            float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
-            return vcombine_f32(result.val[0], result.val[1]);
-        };
         do {
             float32x4_t r04{vdupq_n_f32(0.0f)};
             float32x4_t r14{vdupq_n_f32(0.0f)};
@@ -175,14 +187,6 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
     }
     if((dst.size()&1))
     {
-        auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d)
-        {
-            float32x4_t ret{vmovq_n_f32(a)};
-            ret = vsetq_lane_f32(b, ret, 1);
-            ret = vsetq_lane_f32(c, ret, 2);
-            ret = vsetq_lane_f32(d, ret, 3);
-            return ret;
-        };
         float32x4_t r4{vdupq_n_f32(0.0f)};
         for(size_t j{0};j < mCoeffs.size();j+=4)
         {
@@ -245,9 +249,6 @@ inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RES
         for(size_t j{0};j < mCoeffs.size();j+=4)
         {
             const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
-            /* NOTE: This could alternatively be done with two unaligned loads
-             * and a shuffle. Which would be better?
-             */
             const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
             r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
         }
@@ -262,32 +263,6 @@ inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RES
     size_t pos{0};
     if(size_t todo{dst.size()>>1})
     {
-        auto shuffle_2020 = [](float32x4_t a, float32x4_t b)
-        {
-            float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
-            ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
-            return ret;
-        };
-        auto shuffle_3131 = [](float32x4_t a, float32x4_t b)
-        {
-            float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
-            ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
-            ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
-            return ret;
-        };
-        auto unpacklo = [](float32x4_t a, float32x4_t b)
-        {
-            float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
-            return vcombine_f32(result.val[0], result.val[1]);
-        };
-        auto unpackhi = [](float32x4_t a, float32x4_t b)
-        {
-            float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
-            return vcombine_f32(result.val[0], result.val[1]);
-        };
         do {
             float32x4_t r04{vdupq_n_f32(0.0f)};
             float32x4_t r14{vdupq_n_f32(0.0f)};
@@ -311,14 +286,6 @@ inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RES
     }
     if((dst.size()&1))
     {
-        auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d)
-        {
-            float32x4_t ret{vmovq_n_f32(a)};
-            ret = vsetq_lane_f32(b, ret, 1);
-            ret = vsetq_lane_f32(c, ret, 2);
-            ret = vsetq_lane_f32(d, ret, 3);
-            return ret;
-        };
         float32x4_t r4{vdupq_n_f32(0.0f)};
         for(size_t j{0};j < mCoeffs.size();j+=4)
         {
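For reference, below is a scalar sketch of what the relocated NEON helpers compute. It is not part of the patch; the vec4 alias and the main() driver are illustrative only. The helper names mirror SSE intrinsics, which is presumably why only the NEON build needs custom methods for this stipple shuffling.

#include <array>
#include <cstdio>

using vec4 = std::array<float,4>;

/* Scalar equivalents of the NEON helpers above. Each one produces the same
 * lane pattern as an SSE intrinsic on the x86 side:
 *   shuffle_2020(a,b) == _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0,2,0))
 *   shuffle_3131(a,b) == _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1,3,1))
 *   unpacklo(a,b)     == _mm_unpacklo_ps(a, b)
 *   unpackhi(a,b)     == _mm_unpackhi_ps(a, b)
 */
constexpr vec4 shuffle_2020(vec4 a, vec4 b) { return {a[0], a[2], b[0], b[2]}; }
constexpr vec4 shuffle_3131(vec4 a, vec4 b) { return {a[1], a[3], b[1], b[3]}; }
constexpr vec4 unpacklo(vec4 a, vec4 b) { return {a[0], b[0], a[1], b[1]}; }
constexpr vec4 unpackhi(vec4 a, vec4 b) { return {a[2], b[2], a[3], b[3]}; }

int main()
{
    /* Two registers holding eight consecutive samples. */
    const vec4 a{0.0f, 1.0f, 2.0f, 3.0f}, b{4.0f, 5.0f, 6.0f, 7.0f};
    const vec4 ev{shuffle_2020(a, b)}; /* {0, 2, 4, 6}: even-indexed lanes */
    const vec4 od{shuffle_3131(a, b)}; /* {1, 3, 5, 7}: odd-indexed lanes */
    std::printf("evens: %g %g %g %g\n", ev[0], ev[1], ev[2], ev[3]);
    std::printf("odds:  %g %g %g %g\n", od[0], od[1], od[2], od[3]);
    return 0;
}

In the process/processAccum loops, the shuffles appear to gather every other input sample so one aligned coefficient vector can multiply four of them at once, with unpacklo/unpackhi recombining the two accumulators; moving the helpers into the class body simply deduplicates the lambdas that were previously defined inside each function.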