summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author bradleys <[email protected]> 2015-01-29 03:05:57 +0000
committer bradleys <[email protected]> 2015-01-29 03:05:57 +0000
commit 399c64860c4ec5d8a29f34e3b130ff7018337290 (patch)
tree 6f87f8fb84d2db573518b363501d19d24fd9aee1
parent 1088f1d85df3d987e295974c44a98f7162ce8fca (diff)
libhb: NLMeans x86 SSE acceleration/optimizations.
Speed improvements of 1-6% seem typical. Most benefit seems to be for older hardware and/or hardware with fewer threads.

git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6826 b64f7644-9d1e-0410-96f1-a4d463321fa5
-rw-r--r-- libhb/nlmeans.c 116
-rw-r--r-- libhb/nlmeans.h 27
-rw-r--r-- libhb/nlmeans_x86.c 155
-rw-r--r-- libhb/ports.h 6
4 files changed, 268 insertions, 36 deletions
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c
index f5cfd56ac..78422fa1f 100644
--- a/libhb/nlmeans.c
+++ b/libhb/nlmeans.c
@@ -1,7 +1,7 @@
/* nlmeans.c
Copyright (c) 2013 Dirk Farin
- Copyright (c) 2003-2014 HandBrake Team
+ Copyright (c) 2003-2015 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
@@ -44,7 +44,7 @@
* 1026: Mean 5x5 plus edge boost
* 1281: Mean 3x3 reduced by 25% plus edge boost
* etc...
- * 2049: Mean 3x3 passthru (NL-means off, prefilter is the output)
+ * 2049: Mean 3x3 passthru (NLMeans off, prefilter is the output)
* etc...
* 3329: Mean 3x3 reduced by 25% plus edge boost, passthru
* etc...
@@ -53,6 +53,7 @@
#include "hb.h"
#include "hbffmpeg.h"
#include "taskset.h"
+#include "nlmeans.h"
#define NLMEANS_STRENGTH_LUMA_DEFAULT 8
#define NLMEANS_STRENGTH_CHROMA_DEFAULT 8
@@ -130,6 +131,8 @@ struct hb_filter_private_s
int nframes[3]; // temporal search depth in frames
int prefilter[3]; // prefilter mode, can improve weight analysis
+ NLMeansFunctions functions;
+
Frame *frame;
int next_frame;
int max_frames;
@@ -568,7 +571,50 @@ static void nlmeans_prefilter(BorderedPlane *src,
hb_unlock(src->mutex);
}
-static void nlmeans_plane(Frame *frame,
+static void build_integral_scalar(uint32_t *integral, // out: integral image, rows integral_stride elements apart
+ int integral_stride, // distance between integral rows, in uint32_t elements
+ const uint8_t *src, // not referenced by this function
+ const uint8_t *src_pre, // first plane of the comparison; rows are src_w bytes apart
+ int src_w, // row stride used for src_pre
+ const uint8_t *compare, // not referenced by this function
+ const uint8_t *compare_pre, // second plane of the comparison; rows are compare_w bytes apart
+ int compare_w, // row stride used for compare_pre
+ int w, // number of columns processed per row
+ int h, // number of rows processed
+ int dx, // column offset applied when reading compare_pre
+ int dy) // row offset applied when reading compare_pre
+{ // Fills integral with the 2-D running sum of squared differences between src_pre and the (dx, dy)-displaced compare_pre
+ memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); // Zero w+1 entries of the row above row 0, starting one column to its left
+ for (int y = 0; y < h; y++)
+ {
+ const uint8_t *p1 = src_pre + y*src_w;
+ const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
+ uint32_t *out = integral + (y*integral_stride) - 1;
+ // Column -1 of the row is written as 0; it seeds the running sum below
+ *out++ = 0;
+ // Horizontal pass: out[x] = out[x-1] + (p1[x] - p2[x])^2
+ for (int x = 0; x < w; x++)
+ {
+ int diff = *p1++ - *p2++;
+ *out = *(out-1) + diff * diff;
+ out++;
+ }
+ // Vertical pass: add the already-accumulated row above (row 0 has no such row to add)
+ if (y > 0)
+ {
+ out = integral + y*integral_stride;
+
+ for (int x = 0; x < w; x++)
+ {
+ *out += *(out - integral_stride);
+ out++;
+ }
+ }
+ }
+}
+
+static void nlmeans_plane(NLMeansFunctions *functions,
+ Frame *frame,
int prefilter,
int plane,
int nframes,
@@ -644,33 +690,18 @@ static void nlmeans_plane(Frame *frame,
}
// Build integral
- memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
- for (int y = 0; y < h; y++)
- {
- const uint8_t *p1 = src_pre + y*src_w;
- const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
- uint32_t *out = integral + (y*integral_stride) - 1;
-
- *out++ = 0;
-
- for (int x = 0; x < w; x++)
- {
- int diff = *p1++ - *p2++;
- *out = *(out-1) + diff * diff;
- out++;
- }
-
- if (y > 0)
- {
- out = integral + y*integral_stride;
-
- for (int x = 0; x < w; x++)
- {
- *out += *(out - integral_stride);
- out++;
- }
- }
- }
+ functions->build_integral(integral,
+ integral_stride,
+ src,
+ src_pre,
+ src_w,
+ compare,
+ compare_pre,
+ compare_w,
+ w,
+ h,
+ dx,
+ dy);
// Average displacement
// TODO: Parallelize this
@@ -743,6 +774,13 @@ static int nlmeans_init(hb_filter_object_t *filter,
{
filter->private_data = calloc(sizeof(struct hb_filter_private_s), 1);
hb_filter_private_t *pv = filter->private_data;
+ NLMeansFunctions *functions = &pv->functions;
+
+ functions->build_integral = build_integral_scalar;
+ if (ARCH_X86 == 1)
+ {
+ nlmeans_init_x86(functions);
+ }
// Mark parameters unset
for (int c = 0; c < 3; c++)
@@ -815,7 +853,7 @@ static int nlmeans_init(hb_filter_object_t *filter,
if (taskset_init(&pv->taskset, pv->thread_count,
sizeof(nlmeans_thread_arg_t)) == 0)
{
- hb_error("nlmeans could not initialize taskset");
+ hb_error("NLMeans could not initialize taskset");
goto fail;
}
@@ -824,7 +862,7 @@ static int nlmeans_init(hb_filter_object_t *filter,
pv->thread_data[ii] = taskset_thread_args(&pv->taskset, ii);
if (pv->thread_data[ii] == NULL)
{
- hb_error("nlmeans could not create thread args");
+ hb_error("NLMeans could not create thread args");
goto fail;
}
pv->thread_data[ii]->pv = pv;
@@ -832,7 +870,7 @@ static int nlmeans_init(hb_filter_object_t *filter,
if (taskset_thread_spawn(&pv->taskset, ii, "nlmeans_filter",
nlmeans_filter_thread, HB_NORMAL_PRIORITY) == 0)
{
- hb_error("nlmeans could not spawn thread");
+ hb_error("NLMeans could not spawn thread");
goto fail;
}
}
@@ -894,7 +932,7 @@ static void nlmeans_filter_thread(void *thread_args_v)
hb_filter_private_t *pv = thread_data->pv;
int segment = thread_data->segment;
- hb_log("NLMeans Denoise thread started for segment %d", segment);
+ hb_log("NLMeans thread started for segment %d", segment);
while (1)
{
@@ -910,6 +948,8 @@ static void nlmeans_filter_thread(void *thread_args_v)
hb_buffer_t *buf;
buf = hb_frame_buffer_init(frame->fmt, frame->width, frame->height);
+ NLMeansFunctions *functions = &pv->functions;
+
for (int c = 0; c < 3; c++)
{
if (pv->strength[c] == 0)
@@ -929,7 +969,8 @@ static void nlmeans_filter_thread(void *thread_args_v)
}
// Process current plane
- nlmeans_plane(frame,
+ nlmeans_plane(functions,
+ frame,
pv->prefilter[c],
c,
pv->nframes[c],
@@ -1039,6 +1080,8 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
hb_buffer_t *buf;
buf = hb_frame_buffer_init(frame->fmt, frame->width, frame->height);
+ NLMeansFunctions *functions = &pv->functions;
+
for (int c = 0; c < 3; c++)
{
if (pv->strength[c] == 0)
@@ -1061,7 +1104,8 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
if (pv->nframes[c] < nframes)
nframes = pv->nframes[c];
// Process current plane
- nlmeans_plane(frame,
+ nlmeans_plane(functions,
+ frame,
pv->prefilter[c],
c,
nframes,
diff --git a/libhb/nlmeans.h b/libhb/nlmeans.h
new file mode 100644
index 000000000..2af72059f
--- /dev/null
+++ b/libhb/nlmeans.h
@@ -0,0 +1,27 @@
+/* nlmeans.h
+
+ Copyright (c) 2013 Dirk Farin
+ Copyright (c) 2003-2015 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+ */
+
+typedef struct // Table of replaceable NLMeans routines; currently a single entry
+{
+ void (*build_integral)(uint32_t *integral, // Routine that fills the integral image; parameters are listed below
+ int integral_stride, // distance between integral rows, in uint32_t elements
+ const uint8_t *src,
+ const uint8_t *src_pre,
+ int src_w,
+ const uint8_t *compare,
+ const uint8_t *compare_pre,
+ int compare_w,
+ int w,
+ int h,
+ int dx,
+ int dy);
+} NLMeansFunctions; // NOTE(review): this header has no include guard and uses uint32_t/uint8_t without including <stdint.h>; it relies on the including file
+ // Installs x86-specific implementations into *functions
+void nlmeans_init_x86(NLMeansFunctions *functions);
diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c
new file mode 100644
index 000000000..9acba22d4
--- /dev/null
+++ b/libhb/nlmeans_x86.c
@@ -0,0 +1,155 @@
+/* nlmeans_x86.c
+
+ Copyright (c) 2013 Dirk Farin
+ Copyright (c) 2003-2015 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+ */
+
+#include <emmintrin.h>
+
+#include "hb.h"
+#include "libavutil/cpu.h"
+#include "nlmeans.h"
+
+static void build_integral_sse2(uint32_t *integral, // out: integral image, rows integral_stride elements apart
+ int integral_stride, // distance between integral rows, in uint32_t elements
+ const uint8_t *src, // not referenced by this function
+ const uint8_t *src_pre, // first plane of the comparison; rows are src_w bytes apart
+ int src_w, // row stride used for src_pre
+ const uint8_t *compare, // not referenced by this function
+ const uint8_t *compare_pre, // second plane of the comparison; rows are compare_w bytes apart
+ int compare_w, // row stride used for compare_pre
+ int w, // number of columns requested per row (processed in groups of 16, see below)
+ int h, // number of rows processed
+ int dx, // column offset applied when reading compare_pre
+ int dy) // row offset applied when reading compare_pre
+{
+ const __m128i zero = _mm_set1_epi8(0);
+ // SSE2 build of the 2-D running sum of squared differences between src_pre and the (dx, dy)-displaced compare_pre
+ memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); // Zero w+1 entries of the row above row 0, starting one column to its left
+
+ for (int y = 0; y < h; y++)
+ {
+ __m128i prevadd = _mm_set1_epi32(0); // Running row total, replicated in all four lanes; restarts at 0 for every row
+
+ const uint8_t *p1 = src_pre + y*src_w;
+ const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
+ uint32_t *out = integral + (y*integral_stride) - 1;
+
+ *out++ = 0; // Column -1 of the row is written as 0
+
+ const int pixels_step = 16;
+ // NOTE(review): no tail handling - when w is not a multiple of 16 the last iteration loads and stores past column w-1; confirm the caller pads the buffers
+ for (int x = 0; x < w; x += pixels_step)
+ {
+ __m128i pa, pb;
+ __m128i pla, plb;
+ __m128i ldiff, lldiff, lhdiff;
+ __m128i ltmp,htmp;
+ __m128i ladd,hadd;
+ __m128i pha,phb;
+ __m128i hdiff,hldiff,hhdiff;
+ __m128i l2tmp,h2tmp;
+
+ pa = _mm_loadu_si128((__m128i*)p1); // Load 16 source pixels (unaligned load)
+ pb = _mm_loadu_si128((__m128i*)p2); // Load 16 compare pixels (unaligned load)
+
+ // Low: pixels 0-7 of this group
+ pla = _mm_unpacklo_epi8(pa,zero); // Zero-extend source pixels 0-7 to 16 bits
+ plb = _mm_unpacklo_epi8(pb,zero); // Zero-extend compare pixels 0-7 to 16 bits
+
+ ldiff = _mm_sub_epi16(pla,plb); // Signed 16-bit differences, each within -255..255
+ ldiff = _mm_mullo_epi16(ldiff,ldiff); // Square (16-bit multiply, low half kept; max 255*255 = 65025 fits in 16 unsigned bits)
+
+ lldiff = _mm_unpacklo_epi16(ldiff,zero); // Zero-extend squares 0-3 to 32 bits
+ lhdiff = _mm_unpackhi_epi16(ldiff,zero); // Zero-extend squares 4-7 to 32 bits
+
+ ltmp = _mm_slli_si128(lldiff, 4); // Shift up by one 32-bit lane (4 bytes)
+ lldiff = _mm_add_epi32(lldiff, ltmp); // Each lane now holds itself plus the lane below
+ ltmp = _mm_slli_si128(lldiff, 8); // Shift up by two lanes (8 bytes)
+ lldiff = _mm_add_epi32(lldiff, ltmp); // Lanes now hold the inclusive prefix sum of squares 0-3
+ lldiff = _mm_add_epi32(lldiff, prevadd); // Add the row total carried in from earlier pixels
+
+ ladd = _mm_shuffle_epi32(lldiff, 0xff); // Broadcast lane 3 (row total through pixel 3) to all lanes
+
+ htmp = _mm_slli_si128(lhdiff, 4); // Shift up by one 32-bit lane (4 bytes)
+ lhdiff = _mm_add_epi32(lhdiff, htmp); // Each lane now holds itself plus the lane below
+ htmp = _mm_slli_si128(lhdiff, 8); // Shift up by two lanes (8 bytes)
+ lhdiff = _mm_add_epi32(lhdiff, htmp); // Lanes now hold the inclusive prefix sum of squares 4-7
+ lhdiff = _mm_add_epi32(lhdiff, ladd); // Add the row total through pixel 3
+
+ prevadd = _mm_shuffle_epi32(lhdiff, 0xff); // Broadcast lane 3 (row total through pixel 7) to all lanes
+
+ // High: pixels 8-15 of this group
+ pha = _mm_unpackhi_epi8(pa,zero); // Zero-extend source pixels 8-15 to 16 bits
+ phb = _mm_unpackhi_epi8(pb,zero); // Zero-extend compare pixels 8-15 to 16 bits
+
+ hdiff = _mm_sub_epi16(pha,phb); // Signed 16-bit differences, each within -255..255
+ hdiff = _mm_mullo_epi16(hdiff,hdiff); // Square (16-bit multiply, low half kept; max 255*255 = 65025 fits in 16 unsigned bits)
+
+ hldiff = _mm_unpacklo_epi16(hdiff,zero); // Zero-extend squares 8-11 to 32 bits
+ hhdiff = _mm_unpackhi_epi16(hdiff,zero); // Zero-extend squares 12-15 to 32 bits
+
+ l2tmp = _mm_slli_si128(hldiff, 4); // Shift up by one 32-bit lane (4 bytes)
+ hldiff = _mm_add_epi32(hldiff, l2tmp); // Each lane now holds itself plus the lane below
+ l2tmp = _mm_slli_si128(hldiff, 8); // Shift up by two lanes (8 bytes)
+ hldiff = _mm_add_epi32(hldiff, l2tmp); // Lanes now hold the inclusive prefix sum of squares 8-11
+ hldiff = _mm_add_epi32(hldiff, prevadd); // Add the row total through pixel 7
+
+ hadd = _mm_shuffle_epi32(hldiff, 0xff); // Broadcast lane 3 (row total through pixel 11) to all lanes
+
+ h2tmp = _mm_slli_si128(hhdiff, 4); // Shift up by one 32-bit lane (4 bytes)
+ hhdiff = _mm_add_epi32(hhdiff, h2tmp); // Each lane now holds itself plus the lane below
+ h2tmp = _mm_slli_si128(hhdiff, 8); // Shift up by two lanes (8 bytes)
+ hhdiff = _mm_add_epi32(hhdiff, h2tmp); // Lanes now hold the inclusive prefix sum of squares 12-15
+ hhdiff = _mm_add_epi32(hhdiff, hadd); // Add the row total through pixel 11
+
+ prevadd = _mm_shuffle_epi32(hhdiff, 0xff); // Broadcast lane 3 (row total through pixel 15); carried into the next group
+
+ // Store (aligned stores) - NOTE(review): needs out 16-byte aligned, i.e. integral aligned and integral_stride a multiple of 4; confirm against the caller's allocation
+ _mm_store_si128((__m128i*)(out), lldiff); // Row sums for pixels 0-3
+ _mm_store_si128((__m128i*)(out+4), lhdiff); // Row sums for pixels 4-7
+ _mm_store_si128((__m128i*)(out+8), hldiff); // Row sums for pixels 8-11
+ _mm_store_si128((__m128i*)(out+12), hhdiff); // Row sums for pixels 12-15
+
+ // Advance all three cursors by one 16-pixel group
+ out += pixels_step;
+ p1 += pixels_step;
+ p2 += pixels_step;
+ }
+ // Vertical pass: add the already-accumulated row above, four lanes at a time (row 0 has no such row to add)
+ if (y > 0)
+ {
+ out = integral + y*integral_stride;
+
+ for (int x = 0; x < w; x += pixels_step)
+ {
+ *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
+ *(__m128i*)(out));
+
+ *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride),
+ *(__m128i*)(out+4));
+
+ *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride),
+ *(__m128i*)(out+8));
+
+ *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),
+ *(__m128i*)(out+12));
+
+ out += 4*4; // 16 elements, the same distance as pixels_step
+ }
+ }
+ }
+}
+
+void nlmeans_init_x86(NLMeansFunctions *functions) // functions: table to update in place
+{ // Switches build_integral to the SSE2 routine when the CPU flags include SSE2; otherwise the table is left as passed in
+ if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) // Runtime CPU check via libavutil
+ {
+ functions->build_integral = build_integral_sse2;
+ hb_log("NLMeans using SSE2 optimizations");
+ }
+}
diff --git a/libhb/ports.h b/libhb/ports.h
index c9ce48d16..bd5cad189 100644
--- a/libhb/ports.h
+++ b/libhb/ports.h
@@ -10,6 +10,12 @@
#ifndef HB_PORTS_H
#define HB_PORTS_H
+#if ARCH_X86_64 || ARCH_X86_32 // An undefined name evaluates as 0 inside #if; presumably these come from the build configuration - TODO confirm
+#define ARCH_X86 1 // Either x86 variant is set
+#else
+#define ARCH_X86 0 // Neither x86 variant is set
+#endif
+
#if defined(_WIN32)
#define DIR_SEP_STR "\\"
#define DIR_SEP_CHAR '\\'