From b02dbbdba62bffe7e3224da17c5f2b0585dd24d1 Mon Sep 17 00:00:00 2001 From: bradleys Date: Wed, 11 Feb 2015 20:58:05 +0000 Subject: libhb: Additional minor optimizations to nlmeans. Assume buffered planes are equal size in nlmeans. Make nlmeans scalar counters read like accelerated counters (more readable and saves ~2 cycles). Yet more const correctness. Clarify some variable names for readability. git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6896 b64f7644-9d1e-0410-96f1-a4d463321fa5 --- libhb/nlmeans.c | 92 +++++++++++++++++++++++++++-------------------------- libhb/nlmeans.h | 6 ++-- libhb/nlmeans_x86.c | 17 +++++----- 3 files changed, 59 insertions(+), 56 deletions(-) diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c index 25389ba77..3ab7ab448 100644 --- a/libhb/nlmeans.c +++ b/libhb/nlmeans.c @@ -492,8 +492,8 @@ static void nlmeans_prefilter(BorderedPlane *src, { // Source image - uint8_t *mem = src->mem; - uint8_t *image = src->image; + const uint8_t *mem = src->mem; + const uint8_t *image = src->image; const int border = src->border; const int w = src->w; const int h = src->h; @@ -582,33 +582,36 @@ static void build_integral_scalar(uint32_t *integral, int integral_stride, const uint8_t *src, const uint8_t *src_pre, - int src_w, const uint8_t *compare, const uint8_t *compare_pre, - int compare_w, int w, - int h, + int border, + int dst_w, + int dst_h, int dx, int dy) { - for (int y = 0; y < h; y++) + const int bw = w + 2 * border; + for (int y = 0; y < dst_h; y++) { - const uint8_t *p1 = src_pre + y*src_w; - const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; + const uint8_t *p1 = src_pre + y*bw; + const uint8_t *p2 = compare_pre + (y+dy)*bw + dx; uint32_t *out = integral + (y*integral_stride); - for (int x = 0; x < w; x++) + for (int x = 0; x < dst_w; x++) { - int diff = *p1++ - *p2++; + int diff = *p1 - *p2; *out = *(out-1) + diff * diff; out++; + p1++; + p2++; } if (y > 0) { out = integral + y*integral_stride; - for (int x = 0; x < w; x++) + for (int x = 0; x < dst_w; x++) { *out += *(out - integral_stride); out++; @@ -623,9 +626,9 @@ static void nlmeans_plane(NLMeansFunctions *functions, int plane, int nframes, uint8_t *dst, - int w, - int s, - int h, + int dst_w, + int dst_s, + int dst_h, double h_param, double origin_tune, int n, @@ -638,18 +641,19 @@ static void nlmeans_plane(NLMeansFunctions *functions, const int r_half = (r-1) /2; // Source image - uint8_t *src = frame[0].plane[plane].image; - uint8_t *src_pre = frame[0].plane[plane].image_pre; + const uint8_t *src = frame[0].plane[plane].image; + const uint8_t *src_pre = frame[0].plane[plane].image_pre; + const int w = frame[0].plane[plane].w; const int border = frame[0].plane[plane].border; - const int src_w = frame[0].plane[plane].w + 2 * border; + const int bw = w + 2 * border; // Allocate temporary pixel sums - struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum)); + struct PixelSum *tmp_data = calloc(dst_w * dst_h, sizeof(struct PixelSum)); // Allocate integral image - const int integral_stride = w + 2 * 16; - uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t)); - uint32_t *integral = integral_mem + integral_stride + 16; + const int integral_stride = dst_w + 2 * 16; + uint32_t* const integral_mem = calloc(integral_stride * (dst_h+1), sizeof(uint32_t)); + uint32_t* const integral = integral_mem + integral_stride + 16; // Iterate through available frames for (int f = 0; f < nframes; f++) @@ -657,10 +661,8 @@ static void nlmeans_plane(NLMeansFunctions *functions, nlmeans_prefilter(&frame[f].plane[plane], prefilter); // Compare image - uint8_t *compare = frame[f].plane[plane].image; - uint8_t *compare_pre = frame[f].plane[plane].image_pre; - const int border = frame[f].plane[plane].border; - const int compare_w = frame[f].plane[plane].w + 2 * border; + const uint8_t *compare = frame[f].plane[plane].image; + const uint8_t *compare_pre = frame[f].plane[plane].image_pre; // Iterate through all displacements for (int dy = -r_half; dy <= r_half; dy++) @@ -672,12 +674,12 @@ static void nlmeans_plane(NLMeansFunctions *functions, if (dx == 0 && dy == 0 && f == 0) { // TODO: Parallelize this - for (int y = n_half; y < h-n + n_half; y++) + for (int y = n_half; y < dst_h-n + n_half; y++) { - for (int x = n_half; x < w-n + n_half; x++) + for (int x = n_half; x < dst_w-n + n_half; x++) { - tmp_data[y*w + x].weight_sum += origin_tune; - tmp_data[y*w + x].pixel_sum += origin_tune * src[y*src_w + x]; + tmp_data[y*dst_w + x].weight_sum += origin_tune; + tmp_data[y*dst_w + x].pixel_sum += origin_tune * src[y*bw + x]; } } continue; @@ -688,23 +690,23 @@ static void nlmeans_plane(NLMeansFunctions *functions, integral_stride, src, src_pre, - src_w, compare, compare_pre, - compare_w, w, - h, + border, + dst_w, + dst_h, dx, dy); // Average displacement // TODO: Parallelize this - for (int y = 0; y <= h-n; y++) + for (int y = 0; y <= dst_h-n; y++) { const uint32_t *integral_ptr1 = integral + (y -1)*integral_stride - 1; const uint32_t *integral_ptr2 = integral + (y+n-1)*integral_stride - 1; - for (int x = 0; x <= w-n; x++) + for (int x = 0; x <= dst_w-n; x++) { const int xc = x + n_half; const int yc = y + n_half; @@ -720,8 +722,8 @@ static void nlmeans_plane(NLMeansFunctions *functions, //float weight = exp(-diff*weightFact); const float weight = exptable[diffidx]; - tmp_data[yc*w + xc].weight_sum += weight; - tmp_data[yc*w + xc].pixel_sum += weight * compare[(yc+dy)*compare_w + xc + dx]; + tmp_data[yc*dst_w + xc].weight_sum += weight; + tmp_data[yc*dst_w + xc].pixel_sum += weight * compare[(yc+dy)*bw + xc + dx]; } integral_ptr1++; @@ -733,28 +735,28 @@ static void nlmeans_plane(NLMeansFunctions *functions, } // Copy edges - for (int y = 0; y < h; y++) + for (int y = 0; y < dst_h; y++) { for (int x = 0; x < n_half; x++) { - *(dst + y * s + x) = *(src + y * src_w - x - 1); - *(dst + y * s - x + (w - 1)) = *(src + y * src_w + x + w); + *(dst + y * dst_s + x) = *(src + y * bw - x - 1); + *(dst + y * dst_s - x + (dst_w - 1)) = *(src + y * bw + x + dst_w); } } for (int y = 0; y < n_half; y++) { - memcpy(dst + y*s, src - (y+1)*src_w, w); - memcpy(dst + (h-y-1)*s, src + (y+h)*src_w, w); + memcpy(dst + y*dst_s, src - (y+1)*bw, dst_w); + memcpy(dst + (dst_h-y-1)*dst_s, src + (y+dst_h)*bw, dst_w); } // Copy main image uint8_t result; - for (int y = n_half; y < h-n_half; y++) + for (int y = n_half; y < dst_h-n_half; y++) { - for (int x = n_half; x < w-n_half; x++) + for (int x = n_half; x < dst_w-n_half; x++) { - result = (uint8_t)(tmp_data[y*w + x].pixel_sum / tmp_data[y*w + x].weight_sum); - *(dst + y*s + x) = result ? result : *(src + y*src_w + x); + result = (uint8_t)(tmp_data[y*dst_w + x].pixel_sum / tmp_data[y*dst_w + x].weight_sum); + *(dst + y*dst_s + x) = result ? result : *(src + y*bw + x); } } diff --git a/libhb/nlmeans.h b/libhb/nlmeans.h index 2af72059f..9f6e90845 100644 --- a/libhb/nlmeans.h +++ b/libhb/nlmeans.h @@ -14,12 +14,12 @@ typedef struct int integral_stride, const uint8_t *src, const uint8_t *src_pre, - int src_w, const uint8_t *compare, const uint8_t *compare_pre, - int compare_w, int w, - int h, + int border, + int dst_w, + int dst_h, int dx, int dy); } NLMeansFunctions; diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c index 685ac857e..aa727d96a 100644 --- a/libhb/nlmeans_x86.c +++ b/libhb/nlmeans_x86.c @@ -18,26 +18,27 @@ static void build_integral_sse2(uint32_t *integral, int integral_stride, const uint8_t *src, const uint8_t *src_pre, - int src_w, const uint8_t *compare, const uint8_t *compare_pre, - int compare_w, int w, - int h, + int border, + int dst_w, + int dst_h, int dx, int dy) { const __m128i zero = _mm_set1_epi8(0); + const int bw = w + 2 * border; - for (int y = 0; y < h; y++) + for (int y = 0; y < dst_h; y++) { __m128i prevadd = _mm_set1_epi32(0); - const uint8_t *p1 = src_pre + y*src_w; - const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; + const uint8_t *p1 = src_pre + y*bw; + const uint8_t *p2 = compare_pre + (y+dy)*bw + dx; uint32_t *out = integral + (y*integral_stride); - for (int x = 0; x < w; x += 16) + for (int x = 0; x < dst_w; x += 16) { __m128i pa, pb; __m128i pla, plb; @@ -119,7 +120,7 @@ static void build_integral_sse2(uint32_t *integral, { out = integral + y*integral_stride; - for (int x = 0; x < w; x += 16) + for (int x = 0; x < dst_w; x += 16) { *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride), *(__m128i*)(out)); -- cgit v1.2.3