summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbradleys <[email protected]>2015-02-11 20:58:05 +0000
committerbradleys <[email protected]>2015-02-11 20:58:05 +0000
commitb02dbbdba62bffe7e3224da17c5f2b0585dd24d1 (patch)
tree3341ae7be03e76913920bee4b557b060c025f4ba
parent90bb32c1fc211087736ca52e267c19bf0239bdfe (diff)
libhb: Additional minor optimizations to nlmeans.
Assume buffered planes are equal size in nlmeans. Make nlmeans scalar counters read like accelerated counters (more readable and saves ~2 cycles). Yet more const correctness. Clarify some variable names for readability. git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6896 b64f7644-9d1e-0410-96f1-a4d463321fa5
-rw-r--r--libhb/nlmeans.c92
-rw-r--r--libhb/nlmeans.h6
-rw-r--r--libhb/nlmeans_x86.c17
3 files changed, 59 insertions, 56 deletions
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c
index 25389ba77..3ab7ab448 100644
--- a/libhb/nlmeans.c
+++ b/libhb/nlmeans.c
@@ -492,8 +492,8 @@ static void nlmeans_prefilter(BorderedPlane *src,
{
// Source image
- uint8_t *mem = src->mem;
- uint8_t *image = src->image;
+ const uint8_t *mem = src->mem;
+ const uint8_t *image = src->image;
const int border = src->border;
const int w = src->w;
const int h = src->h;
@@ -582,33 +582,36 @@ static void build_integral_scalar(uint32_t *integral,
int integral_stride,
const uint8_t *src,
const uint8_t *src_pre,
- int src_w,
const uint8_t *compare,
const uint8_t *compare_pre,
- int compare_w,
int w,
- int h,
+ int border,
+ int dst_w,
+ int dst_h,
int dx,
int dy)
{
- for (int y = 0; y < h; y++)
+ const int bw = w + 2 * border;
+ for (int y = 0; y < dst_h; y++)
{
- const uint8_t *p1 = src_pre + y*src_w;
- const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
+ const uint8_t *p1 = src_pre + y*bw;
+ const uint8_t *p2 = compare_pre + (y+dy)*bw + dx;
uint32_t *out = integral + (y*integral_stride);
- for (int x = 0; x < w; x++)
+ for (int x = 0; x < dst_w; x++)
{
- int diff = *p1++ - *p2++;
+ int diff = *p1 - *p2;
*out = *(out-1) + diff * diff;
out++;
+ p1++;
+ p2++;
}
if (y > 0)
{
out = integral + y*integral_stride;
- for (int x = 0; x < w; x++)
+ for (int x = 0; x < dst_w; x++)
{
*out += *(out - integral_stride);
out++;
@@ -623,9 +626,9 @@ static void nlmeans_plane(NLMeansFunctions *functions,
int plane,
int nframes,
uint8_t *dst,
- int w,
- int s,
- int h,
+ int dst_w,
+ int dst_s,
+ int dst_h,
double h_param,
double origin_tune,
int n,
@@ -638,18 +641,19 @@ static void nlmeans_plane(NLMeansFunctions *functions,
const int r_half = (r-1) /2;
// Source image
- uint8_t *src = frame[0].plane[plane].image;
- uint8_t *src_pre = frame[0].plane[plane].image_pre;
+ const uint8_t *src = frame[0].plane[plane].image;
+ const uint8_t *src_pre = frame[0].plane[plane].image_pre;
+ const int w = frame[0].plane[plane].w;
const int border = frame[0].plane[plane].border;
- const int src_w = frame[0].plane[plane].w + 2 * border;
+ const int bw = w + 2 * border;
// Allocate temporary pixel sums
- struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
+ struct PixelSum *tmp_data = calloc(dst_w * dst_h, sizeof(struct PixelSum));
// Allocate integral image
- const int integral_stride = w + 2 * 16;
- uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t));
- uint32_t *integral = integral_mem + integral_stride + 16;
+ const int integral_stride = dst_w + 2 * 16;
+ uint32_t* const integral_mem = calloc(integral_stride * (dst_h+1), sizeof(uint32_t));
+ uint32_t* const integral = integral_mem + integral_stride + 16;
// Iterate through available frames
for (int f = 0; f < nframes; f++)
@@ -657,10 +661,8 @@ static void nlmeans_plane(NLMeansFunctions *functions,
nlmeans_prefilter(&frame[f].plane[plane], prefilter);
// Compare image
- uint8_t *compare = frame[f].plane[plane].image;
- uint8_t *compare_pre = frame[f].plane[plane].image_pre;
- const int border = frame[f].plane[plane].border;
- const int compare_w = frame[f].plane[plane].w + 2 * border;
+ const uint8_t *compare = frame[f].plane[plane].image;
+ const uint8_t *compare_pre = frame[f].plane[plane].image_pre;
// Iterate through all displacements
for (int dy = -r_half; dy <= r_half; dy++)
@@ -672,12 +674,12 @@ static void nlmeans_plane(NLMeansFunctions *functions,
if (dx == 0 && dy == 0 && f == 0)
{
// TODO: Parallelize this
- for (int y = n_half; y < h-n + n_half; y++)
+ for (int y = n_half; y < dst_h-n + n_half; y++)
{
- for (int x = n_half; x < w-n + n_half; x++)
+ for (int x = n_half; x < dst_w-n + n_half; x++)
{
- tmp_data[y*w + x].weight_sum += origin_tune;
- tmp_data[y*w + x].pixel_sum += origin_tune * src[y*src_w + x];
+ tmp_data[y*dst_w + x].weight_sum += origin_tune;
+ tmp_data[y*dst_w + x].pixel_sum += origin_tune * src[y*bw + x];
}
}
continue;
@@ -688,23 +690,23 @@ static void nlmeans_plane(NLMeansFunctions *functions,
integral_stride,
src,
src_pre,
- src_w,
compare,
compare_pre,
- compare_w,
w,
- h,
+ border,
+ dst_w,
+ dst_h,
dx,
dy);
// Average displacement
// TODO: Parallelize this
- for (int y = 0; y <= h-n; y++)
+ for (int y = 0; y <= dst_h-n; y++)
{
const uint32_t *integral_ptr1 = integral + (y -1)*integral_stride - 1;
const uint32_t *integral_ptr2 = integral + (y+n-1)*integral_stride - 1;
- for (int x = 0; x <= w-n; x++)
+ for (int x = 0; x <= dst_w-n; x++)
{
const int xc = x + n_half;
const int yc = y + n_half;
@@ -720,8 +722,8 @@ static void nlmeans_plane(NLMeansFunctions *functions,
//float weight = exp(-diff*weightFact);
const float weight = exptable[diffidx];
- tmp_data[yc*w + xc].weight_sum += weight;
- tmp_data[yc*w + xc].pixel_sum += weight * compare[(yc+dy)*compare_w + xc + dx];
+ tmp_data[yc*dst_w + xc].weight_sum += weight;
+ tmp_data[yc*dst_w + xc].pixel_sum += weight * compare[(yc+dy)*bw + xc + dx];
}
integral_ptr1++;
@@ -733,28 +735,28 @@ static void nlmeans_plane(NLMeansFunctions *functions,
}
// Copy edges
- for (int y = 0; y < h; y++)
+ for (int y = 0; y < dst_h; y++)
{
for (int x = 0; x < n_half; x++)
{
- *(dst + y * s + x) = *(src + y * src_w - x - 1);
- *(dst + y * s - x + (w - 1)) = *(src + y * src_w + x + w);
+ *(dst + y * dst_s + x) = *(src + y * bw - x - 1);
+ *(dst + y * dst_s - x + (dst_w - 1)) = *(src + y * bw + x + dst_w);
}
}
for (int y = 0; y < n_half; y++)
{
- memcpy(dst + y*s, src - (y+1)*src_w, w);
- memcpy(dst + (h-y-1)*s, src + (y+h)*src_w, w);
+ memcpy(dst + y*dst_s, src - (y+1)*bw, dst_w);
+ memcpy(dst + (dst_h-y-1)*dst_s, src + (y+dst_h)*bw, dst_w);
}
// Copy main image
uint8_t result;
- for (int y = n_half; y < h-n_half; y++)
+ for (int y = n_half; y < dst_h-n_half; y++)
{
- for (int x = n_half; x < w-n_half; x++)
+ for (int x = n_half; x < dst_w-n_half; x++)
{
- result = (uint8_t)(tmp_data[y*w + x].pixel_sum / tmp_data[y*w + x].weight_sum);
- *(dst + y*s + x) = result ? result : *(src + y*src_w + x);
+ result = (uint8_t)(tmp_data[y*dst_w + x].pixel_sum / tmp_data[y*dst_w + x].weight_sum);
+ *(dst + y*dst_s + x) = result ? result : *(src + y*bw + x);
}
}
diff --git a/libhb/nlmeans.h b/libhb/nlmeans.h
index 2af72059f..9f6e90845 100644
--- a/libhb/nlmeans.h
+++ b/libhb/nlmeans.h
@@ -14,12 +14,12 @@ typedef struct
int integral_stride,
const uint8_t *src,
const uint8_t *src_pre,
- int src_w,
const uint8_t *compare,
const uint8_t *compare_pre,
- int compare_w,
int w,
- int h,
+ int border,
+ int dst_w,
+ int dst_h,
int dx,
int dy);
} NLMeansFunctions;
diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c
index 685ac857e..aa727d96a 100644
--- a/libhb/nlmeans_x86.c
+++ b/libhb/nlmeans_x86.c
@@ -18,26 +18,27 @@ static void build_integral_sse2(uint32_t *integral,
int integral_stride,
const uint8_t *src,
const uint8_t *src_pre,
- int src_w,
const uint8_t *compare,
const uint8_t *compare_pre,
- int compare_w,
int w,
- int h,
+ int border,
+ int dst_w,
+ int dst_h,
int dx,
int dy)
{
const __m128i zero = _mm_set1_epi8(0);
+ const int bw = w + 2 * border;
- for (int y = 0; y < h; y++)
+ for (int y = 0; y < dst_h; y++)
{
__m128i prevadd = _mm_set1_epi32(0);
- const uint8_t *p1 = src_pre + y*src_w;
- const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
+ const uint8_t *p1 = src_pre + y*bw;
+ const uint8_t *p2 = compare_pre + (y+dy)*bw + dx;
uint32_t *out = integral + (y*integral_stride);
- for (int x = 0; x < w; x += 16)
+ for (int x = 0; x < dst_w; x += 16)
{
__m128i pa, pb;
__m128i pla, plb;
@@ -119,7 +120,7 @@ static void build_integral_sse2(uint32_t *integral,
{
out = integral + y*integral_stride;
- for (int x = 0; x < w; x += 16)
+ for (int x = 0; x < dst_w; x += 16)
{
*((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
*(__m128i*)(out));