diff options
author | bradleys <[email protected]> | 2015-01-29 03:05:57 +0000 |
---|---|---|
committer | bradleys <[email protected]> | 2015-01-29 03:05:57 +0000 |
commit | 399c64860c4ec5d8a29f34e3b130ff7018337290 (patch) | |
tree | 6f87f8fb84d2db573518b363501d19d24fd9aee1 | |
parent | 1088f1d85df3d987e295974c44a98f7162ce8fca (diff) |
libhb: NLMeans x86 SSE acceleration/optimizations.
Speed improvements of 1-6% seem typical. Most benefit seems to be for older hardware and/or hardware with fewer threads.
git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6826 b64f7644-9d1e-0410-96f1-a4d463321fa5
-rw-r--r-- | libhb/nlmeans.c | 116 | ||||
-rw-r--r-- | libhb/nlmeans.h | 27 | ||||
-rw-r--r-- | libhb/nlmeans_x86.c | 155 | ||||
-rw-r--r-- | libhb/ports.h | 6 |
4 files changed, 268 insertions, 36 deletions
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c index f5cfd56ac..78422fa1f 100644 --- a/libhb/nlmeans.c +++ b/libhb/nlmeans.c @@ -1,7 +1,7 @@ /* nlmeans.c Copyright (c) 2013 Dirk Farin - Copyright (c) 2003-2014 HandBrake Team + Copyright (c) 2003-2015 HandBrake Team This file is part of the HandBrake source code Homepage: <http://handbrake.fr/>. It may be used under the terms of the GNU General Public License v2. @@ -44,7 +44,7 @@ * 1026: Mean 5x5 plus edge boost * 1281: Mean 3x3 reduced by 25% plus edge boost * etc... - * 2049: Mean 3x3 passthru (NL-means off, prefilter is the output) + * 2049: Mean 3x3 passthru (NLMeans off, prefilter is the output) * etc... * 3329: Mean 3x3 reduced by 25% plus edge boost, passthru * etc... @@ -53,6 +53,7 @@ #include "hb.h" #include "hbffmpeg.h" #include "taskset.h" +#include "nlmeans.h" #define NLMEANS_STRENGTH_LUMA_DEFAULT 8 #define NLMEANS_STRENGTH_CHROMA_DEFAULT 8 @@ -130,6 +131,8 @@ struct hb_filter_private_s int nframes[3]; // temporal search depth in frames int prefilter[3]; // prefilter mode, can improve weight analysis + NLMeansFunctions functions; + Frame *frame; int next_frame; int max_frames; @@ -568,7 +571,50 @@ static void nlmeans_prefilter(BorderedPlane *src, hb_unlock(src->mutex); } -static void nlmeans_plane(Frame *frame, +static void build_integral_scalar(uint32_t *integral, + int integral_stride, + const uint8_t *src, + const uint8_t *src_pre, + int src_w, + const uint8_t *compare, + const uint8_t *compare_pre, + int compare_w, + int w, + int h, + int dx, + int dy) +{ + memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); + for (int y = 0; y < h; y++) + { + const uint8_t *p1 = src_pre + y*src_w; + const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; + uint32_t *out = integral + (y*integral_stride) - 1; + + *out++ = 0; + + for (int x = 0; x < w; x++) + { + int diff = *p1++ - *p2++; + *out = *(out-1) + diff * diff; + out++; + } + + if (y > 0) + { + out = integral + y*integral_stride; + + for (int x = 0; x < w; x++) + { + *out += *(out - integral_stride); + out++; + } + } + } +} + +static void nlmeans_plane(NLMeansFunctions *functions, + Frame *frame, int prefilter, int plane, int nframes, @@ -644,33 +690,18 @@ static void nlmeans_plane(Frame *frame, } // Build integral - memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); - for (int y = 0; y < h; y++) - { - const uint8_t *p1 = src_pre + y*src_w; - const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; - uint32_t *out = integral + (y*integral_stride) - 1; - - *out++ = 0; - - for (int x = 0; x < w; x++) - { - int diff = *p1++ - *p2++; - *out = *(out-1) + diff * diff; - out++; - } - - if (y > 0) - { - out = integral + y*integral_stride; - - for (int x = 0; x < w; x++) - { - *out += *(out - integral_stride); - out++; - } - } - } + functions->build_integral(integral, + integral_stride, + src, + src_pre, + src_w, + compare, + compare_pre, + compare_w, + w, + h, + dx, + dy); // Average displacement // TODO: Parallelize this @@ -743,6 +774,13 @@ static int nlmeans_init(hb_filter_object_t *filter, { filter->private_data = calloc(sizeof(struct hb_filter_private_s), 1); hb_filter_private_t *pv = filter->private_data; + NLMeansFunctions *functions = &pv->functions; + + functions->build_integral = build_integral_scalar; + if (ARCH_X86 == 1) + { + nlmeans_init_x86(functions); + } // Mark parameters unset for (int c = 0; c < 3; c++) @@ -815,7 +853,7 @@ static int nlmeans_init(hb_filter_object_t *filter, if (taskset_init(&pv->taskset, pv->thread_count, sizeof(nlmeans_thread_arg_t)) == 0) { - hb_error("nlmeans could not initialize taskset"); + hb_error("NLMeans could not initialize taskset"); goto fail; } @@ -824,7 +862,7 @@ static int nlmeans_init(hb_filter_object_t *filter, pv->thread_data[ii] = taskset_thread_args(&pv->taskset, ii); if (pv->thread_data[ii] == NULL) { - hb_error("nlmeans could not create thread args"); + hb_error("NLMeans could not create thread args"); goto fail; } pv->thread_data[ii]->pv = pv; @@ -832,7 +870,7 @@ static int nlmeans_init(hb_filter_object_t *filter, if (taskset_thread_spawn(&pv->taskset, ii, "nlmeans_filter", nlmeans_filter_thread, HB_NORMAL_PRIORITY) == 0) { - hb_error("nlmeans could not spawn thread"); + hb_error("NLMeans could not spawn thread"); goto fail; } } @@ -894,7 +932,7 @@ static void nlmeans_filter_thread(void *thread_args_v) hb_filter_private_t *pv = thread_data->pv; int segment = thread_data->segment; - hb_log("NLMeans Denoise thread started for segment %d", segment); + hb_log("NLMeans thread started for segment %d", segment); while (1) { @@ -910,6 +948,8 @@ static void nlmeans_filter_thread(void *thread_args_v) hb_buffer_t *buf; buf = hb_frame_buffer_init(frame->fmt, frame->width, frame->height); + NLMeansFunctions *functions = &pv->functions; + for (int c = 0; c < 3; c++) { if (pv->strength[c] == 0) @@ -929,7 +969,8 @@ static void nlmeans_filter_thread(void *thread_args_v) } // Process current plane - nlmeans_plane(frame, + nlmeans_plane(functions, + frame, pv->prefilter[c], c, pv->nframes[c], @@ -1039,6 +1080,8 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv) hb_buffer_t *buf; buf = hb_frame_buffer_init(frame->fmt, frame->width, frame->height); + NLMeansFunctions *functions = &pv->functions; + for (int c = 0; c < 3; c++) { if (pv->strength[c] == 0) @@ -1061,7 +1104,8 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv) if (pv->nframes[c] < nframes) nframes = pv->nframes[c]; // Process current plane - nlmeans_plane(frame, + nlmeans_plane(functions, + frame, pv->prefilter[c], c, nframes, diff --git a/libhb/nlmeans.h b/libhb/nlmeans.h new file mode 100644 index 000000000..2af72059f --- /dev/null +++ b/libhb/nlmeans.h @@ -0,0 +1,27 @@ +/* nlmeans.h + + Copyright (c) 2013 Dirk Farin + Copyright (c) 2003-2015 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + */ + +typedef struct +{ + void (*build_integral)(uint32_t *integral, + int integral_stride, + const uint8_t *src, + const uint8_t *src_pre, + int src_w, + const uint8_t *compare, + const uint8_t *compare_pre, + int compare_w, + int w, + int h, + int dx, + int dy); +} NLMeansFunctions; + +void nlmeans_init_x86(NLMeansFunctions *functions); diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c new file mode 100644 index 000000000..9acba22d4 --- /dev/null +++ b/libhb/nlmeans_x86.c @@ -0,0 +1,155 @@ +/* nlmeans_x86.c + + Copyright (c) 2013 Dirk Farin + Copyright (c) 2003-2015 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + */ + +#include <emmintrin.h> + +#include "hb.h" +#include "libavutil/cpu.h" +#include "nlmeans.h" + +static void build_integral_sse2(uint32_t *integral, + int integral_stride, + const uint8_t *src, + const uint8_t *src_pre, + int src_w, + const uint8_t *compare, + const uint8_t *compare_pre, + int compare_w, + int w, + int h, + int dx, + int dy) +{ + const __m128i zero = _mm_set1_epi8(0); + + memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); + + for (int y = 0; y < h; y++) + { + __m128i prevadd = _mm_set1_epi32(0); + + const uint8_t *p1 = src_pre + y*src_w; + const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; + uint32_t *out = integral + (y*integral_stride) - 1; + + *out++ = 0; + + const int pixels_step = 16; + + for (int x = 0; x < w; x += pixels_step) + { + __m128i pa, pb; + __m128i pla, plb; + __m128i ldiff, lldiff, lhdiff; + __m128i ltmp,htmp; + __m128i ladd,hadd; + __m128i pha,phb; + __m128i hdiff,hldiff,hhdiff; + __m128i l2tmp,h2tmp; + + pa = _mm_loadu_si128((__m128i*)p1); // Load source pixels into register 1 + pb = _mm_loadu_si128((__m128i*)p2); // Load compare pixels into register 2 + + // Low + pla = _mm_unpacklo_epi8(pa,zero); // Unpack and interleave source low with zeros + plb = _mm_unpacklo_epi8(pb,zero); // Unpack and interleave compare low with zeros + + ldiff = _mm_sub_epi16(pla,plb); // Diff source and compare lows (subtract) + ldiff = _mm_mullo_epi16(ldiff,ldiff); // Square low diff (multiply at 32-bit precision) + + lldiff = _mm_unpacklo_epi16(ldiff,zero); // Unpack and interleave diff low with zeros + lhdiff = _mm_unpackhi_epi16(ldiff,zero); // Unpack and interleave diff high with zeros + + ltmp = _mm_slli_si128(lldiff, 4); // Temp shift diff low left 4 bytes + lldiff = _mm_add_epi32(lldiff, ltmp); // Add above to diff low + ltmp = _mm_slli_si128(lldiff, 8); // Temp shift diff low left 8 bytes + lldiff = _mm_add_epi32(lldiff, ltmp); // Add above to diff low + lldiff = _mm_add_epi32(lldiff, prevadd); // Add previous total to diff low + + ladd = _mm_shuffle_epi32(lldiff, 0xff); // Shuffle diff low + + htmp = _mm_slli_si128(lhdiff, 4); // Temp shift diff high left 4 bytes + lhdiff = _mm_add_epi32(lhdiff, htmp); // Add above to diff high + htmp = _mm_slli_si128(lhdiff, 8); // Temp shift diff high left 8 bytes + lhdiff = _mm_add_epi32(lhdiff, htmp); // Add above to diff high + lhdiff = _mm_add_epi32(lhdiff, ladd); // Add shuffled diff low to diff high + + prevadd = _mm_shuffle_epi32(lhdiff, 0xff); // Shuffle diff high + + // High + pha = _mm_unpackhi_epi8(pa,zero); // Unpack and interleave source high with zeros + phb = _mm_unpackhi_epi8(pb,zero); // Unpack and interleave compare high with zeros + + hdiff = _mm_sub_epi16(pha,phb); // Diff source and compare highs (subtract) + hdiff = _mm_mullo_epi16(hdiff,hdiff); // Square high diff (multiply at 32-bit precision) + + hldiff = _mm_unpacklo_epi16(hdiff,zero); // Unpack and interleave diff low with zeros + hhdiff = _mm_unpackhi_epi16(hdiff,zero); // Unpack and interleave diff high with zeros + + l2tmp = _mm_slli_si128(hldiff, 4); // Temp shift diff low 4 bytes + hldiff = _mm_add_epi32(hldiff, l2tmp); // Add above to diff low + l2tmp = _mm_slli_si128(hldiff, 8); // Temp shift diff low left 8 bytes + hldiff = _mm_add_epi32(hldiff, l2tmp); // Add above to diff low + hldiff = _mm_add_epi32(hldiff, prevadd); // Add previous total to diff low + + hadd = _mm_shuffle_epi32(hldiff, 0xff); // Shuffle diff low + + h2tmp = _mm_slli_si128(hhdiff, 4); // Temp shift diff high left 4 bytes + hhdiff = _mm_add_epi32(hhdiff, h2tmp); // Add above to diff high + h2tmp = _mm_slli_si128(hhdiff, 8); // Temp shift diff high left 8 bytes + hhdiff = _mm_add_epi32(hhdiff, h2tmp); // Add above to diff high + hhdiff = _mm_add_epi32(hhdiff, hadd); // Add shuffled diff low to diff high + + prevadd = _mm_shuffle_epi32(hhdiff, 0xff); // Shuffle diff high + + // Store + _mm_store_si128((__m128i*)(out), lldiff); // Store low diff low in memory + _mm_store_si128((__m128i*)(out+4), lhdiff); // Store low diff high in memory + _mm_store_si128((__m128i*)(out+8), hldiff); // Store high diff low in memory + _mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory + + // Increment + out += pixels_step; + p1 += pixels_step; + p2 += pixels_step; + } + + if (y > 0) + { + out = integral + y*integral_stride; + + for (int x = 0; x < w; x += pixels_step) + { + *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride), + *(__m128i*)(out)); + + *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride), + *(__m128i*)(out+4)); + + *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride), + *(__m128i*)(out+8)); + + *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride), + *(__m128i*)(out+12)); + + out += 4*4; + } + } + } +} + +void nlmeans_init_x86(NLMeansFunctions *functions) +{ + if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) + { + functions->build_integral = build_integral_sse2; + hb_log("NLMeans using SSE2 optimizations"); + } +} diff --git a/libhb/ports.h b/libhb/ports.h index c9ce48d16..bd5cad189 100644 --- a/libhb/ports.h +++ b/libhb/ports.h @@ -10,6 +10,12 @@ #ifndef HB_PORTS_H #define HB_PORTS_H +#if ARCH_X86_64 || ARCH_X86_32 +#define ARCH_X86 1 +#else +#define ARCH_X86 0 +#endif + #if defined(_WIN32) #define DIR_SEP_STR "\\" #define DIR_SEP_CHAR '\\' |