summary | refs | log | tree | commit | diff | stats
path: root/libhb/nlmeans_x86.c
diff options
context:
space:
mode:
author: bradleys <[email protected]> 2015-02-11 20:58:05 +0000
committer: bradleys <[email protected]> 2015-02-11 20:58:05 +0000
commit: b02dbbdba62bffe7e3224da17c5f2b0585dd24d1 (patch)
tree: 3341ae7be03e76913920bee4b557b060c025f4ba /libhb/nlmeans_x86.c
parent: 90bb32c1fc211087736ca52e267c19bf0239bdfe (diff)
libhb: Additional minor optimizations to nlmeans.
Assume buffered planes are equal size in nlmeans. Make nlmeans scalar counters read like accelerated counters (more readable and saves ~2 cycles). Yet more const correctness. Clarify some variable names for readability.

git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6896 b64f7644-9d1e-0410-96f1-a4d463321fa5
Diffstat (limited to 'libhb/nlmeans_x86.c')
-rw-r--r-- libhb/nlmeans_x86.c | 17
1 files changed, 9 insertions, 8 deletions
diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c
index 685ac857e..aa727d96a 100644
--- a/libhb/nlmeans_x86.c
+++ b/libhb/nlmeans_x86.c
@@ -18,26 +18,27 @@ static void build_integral_sse2(uint32_t *integral,
int integral_stride,
const uint8_t *src,
const uint8_t *src_pre,
- int src_w,
const uint8_t *compare,
const uint8_t *compare_pre,
- int compare_w,
int w,
- int h,
+ int border,
+ int dst_w,
+ int dst_h,
int dx,
int dy)
{
const __m128i zero = _mm_set1_epi8(0);
+ const int bw = w + 2 * border;
- for (int y = 0; y < h; y++)
+ for (int y = 0; y < dst_h; y++)
{
__m128i prevadd = _mm_set1_epi32(0);
- const uint8_t *p1 = src_pre + y*src_w;
- const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
+ const uint8_t *p1 = src_pre + y*bw;
+ const uint8_t *p2 = compare_pre + (y+dy)*bw + dx;
uint32_t *out = integral + (y*integral_stride);
- for (int x = 0; x < w; x += 16)
+ for (int x = 0; x < dst_w; x += 16)
{
__m128i pa, pb;
__m128i pla, plb;
@@ -119,7 +120,7 @@ static void build_integral_sse2(uint32_t *integral,
{
out = integral + y*integral_stride;
- for (int x = 0; x < w; x += 16)
+ for (int x = 0; x < dst_w; x += 16)
{
*((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
*(__m128i*)(out));