diff options
author | bradleys <[email protected]> | 2015-02-06 11:05:33 +0000 |
---|---|---|
committer | bradleys <[email protected]> | 2015-02-06 11:05:33 +0000 |
commit | 487ea244e6ebe777773712d5763870f688a2e804 (patch) | |
tree | b902d8da09da66904dc56a36f44c2ae74a7adf01 | |
parent | 1141a5f323d6ce712881a2de311b750dc641119e (diff) |
libhb: Minor nlmeans optimizations.
Use calloc for the nlmeans integral instead of memset and in-loop zeroing.
Replace a superfluous const with a literal in the SSE2 implementation.
Move exponential table calculation out of the main loop.
More const correctness.
Add some braces.
Overall, slightly more readable/maintainable and (very) slightly faster.
git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6874 b64f7644-9d1e-0410-96f1-a4d463321fa5
-rw-r--r-- | libhb/nlmeans.c | 135 | ||||
-rw-r--r-- | libhb/nlmeans_x86.c | 20 |
2 files changed, 84 insertions, 71 deletions
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c index 78422fa1f..25389ba77 100644 --- a/libhb/nlmeans.c +++ b/libhb/nlmeans.c @@ -131,6 +131,10 @@ struct hb_filter_private_s int nframes[3]; // temporal search depth in frames int prefilter[3]; // prefilter mode, can improve weight analysis + float exptable[3][NLMEANS_EXPSIZE]; + float weight_fact_table[3]; + int diff_max[3]; + NLMeansFunctions functions; Frame *frame; @@ -166,7 +170,7 @@ static void nlmeans_border(uint8_t *src, int h, int border) { - int bw = w + 2 * border; + const int bw = w + 2 * border; uint8_t *image = src + border + bw * border; // Create faux borders using edge pixels @@ -192,11 +196,14 @@ static void nlmeans_deborder(BorderedPlane *src, int s, int h) { - int bw = src->w + 2 * src->border; + const int bw = src->w + 2 * src->border; uint8_t *image = src->mem + src->border + bw * src->border; + int width = w; if (src->w < width) + { width = src->w; + } // Copy main image for (int y = 0; y < h; y++) @@ -213,8 +220,8 @@ static void nlmeans_alloc(uint8_t *src, BorderedPlane *dst, int border) { - int bw = src_w + 2 * border; - int bh = src_h + 2 * border; + const int bw = src_w + 2 * border; + const int bh = src_h + 2 * border; uint8_t *mem = malloc(bw * bh * sizeof(uint8_t)); uint8_t *image = mem + border + bw * border; @@ -246,11 +253,11 @@ static void nlmeans_filter_mean(uint8_t *src, { // Mean filter - int bw = w + 2 * border; - int offset_min = -((size - 1) /2); - int offset_max = (size + 1) /2; + const int bw = w + 2 * border; + const int offset_min = -((size - 1) /2); + const int offset_max = (size + 1) /2; + const double pixel_weight = 1.0 / (size * size); uint16_t pixel_sum; - double pixel_weight = 1.0 / (size * size); for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) @@ -341,9 +348,9 @@ static void nlmeans_filter_median(uint8_t *src, int size) { // Median filter - int bw = w + 2 * border; - int offset_min = -((size - 1) /2); - int offset_max = (size + 1) /2; + const int bw = w + 2 
* border; + const int offset_min = -((size - 1) /2); + const int offset_max = (size + 1) /2; int index; uint8_t pixels[size * size]; for (int y = 0; y < h; y++) @@ -371,19 +378,19 @@ static void nlmeans_filter_edgeboost(uint8_t *src, int h, int border) { - int bw = w + 2 * border; - int bh = h + 2 * border; + const int bw = w + 2 * border; + const int bh = h + 2 * border; // Custom kernel - int kernel_size = 3; - int kernel[3][3] = {{-31, 0, 31}, - {-44, 0, 44}, - {-31, 0, 31}}; - double kernel_coef = 1.0 / 126.42; + const int kernel_size = 3; + const int kernel[3][3] = {{-31, 0, 31}, + {-44, 0, 44}, + {-31, 0, 31}}; + const double kernel_coef = 1.0 / 126.42; // Detect edges - int offset_min = -((kernel_size - 1) /2); - int offset_max = (kernel_size + 1) /2; + const int offset_min = -((kernel_size - 1) /2); + const int offset_max = (kernel_size + 1) /2; uint16_t pixel1; uint16_t pixel2; uint8_t *mask_mem = calloc(bw * bh, sizeof(uint8_t)); @@ -487,11 +494,11 @@ static void nlmeans_prefilter(BorderedPlane *src, // Source image uint8_t *mem = src->mem; uint8_t *image = src->image; - int border = src->border; - int w = src->w; - int h = src->h; - int bw = w + 2 * border; - int bh = h + 2 * border; + const int border = src->border; + const int w = src->w; + const int h = src->h; + const int bw = w + 2 * border; + const int bh = h + 2 * border; // Duplicate plane uint8_t *mem_pre = malloc(bw * bh * sizeof(uint8_t)); @@ -584,14 +591,11 @@ static void build_integral_scalar(uint32_t *integral, int dx, int dy) { - memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); for (int y = 0; y < h; y++) { const uint8_t *p1 = src_pre + y*src_w; const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; - uint32_t *out = integral + (y*integral_stride) - 1; - - *out++ = 0; + uint32_t *out = integral + (y*integral_stride); for (int x = 0; x < w; x++) { @@ -625,38 +629,28 @@ static void nlmeans_plane(NLMeansFunctions *functions, double h_param, double origin_tune, int n, - int 
r) + int r, + const float *exptable, + const float weight_fact_table, + const int diff_max) { - int n_half = (n-1) /2; - int r_half = (r-1) /2; + const int n_half = (n-1) /2; + const int r_half = (r-1) /2; // Source image uint8_t *src = frame[0].plane[plane].image; uint8_t *src_pre = frame[0].plane[plane].image_pre; - int border = frame[0].plane[plane].border; - int src_w = frame[0].plane[plane].w + 2 * border; + const int border = frame[0].plane[plane].border; + const int src_w = frame[0].plane[plane].w + 2 * border; // Allocate temporary pixel sums struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum)); // Allocate integral image - int integral_stride = w + 2 * 16; - uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t)); + const int integral_stride = w + 2 * 16; + uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t)); uint32_t *integral = integral_mem + integral_stride + 16; - // Precompute exponential table - float exptable[NLMEANS_EXPSIZE]; - const float weight_factor = 1.0/n/n / (h_param * h_param); - const float min_weight_in_table = 0.0005; - const float stretch = NLMEANS_EXPSIZE / (-log(min_weight_in_table)); - const float weight_fact_table = weight_factor * stretch; - const int diff_max = NLMEANS_EXPSIZE / weight_fact_table; - for (int i = 0; i < NLMEANS_EXPSIZE; i++) - { - exptable[i] = exp(-i/stretch); - } - exptable[NLMEANS_EXPSIZE-1] = 0; - // Iterate through available frames for (int f = 0; f < nframes; f++) { @@ -665,8 +659,8 @@ static void nlmeans_plane(NLMeansFunctions *functions, // Compare image uint8_t *compare = frame[f].plane[plane].image; uint8_t *compare_pre = frame[f].plane[plane].image_pre; - int border = frame[f].plane[plane].border; - int compare_w = frame[f].plane[plane].w + 2 * border; + const int border = frame[f].plane[plane].border; + const int compare_w = frame[f].plane[plane].w + 2 * border; // Iterate through all displacements for (int dy = -r_half; dy <= r_half; dy++) @@ 
-712,19 +706,19 @@ static void nlmeans_plane(NLMeansFunctions *functions, for (int x = 0; x <= w-n; x++) { - int xc = x + n_half; - int yc = y + n_half; + const int xc = x + n_half; + const int yc = y + n_half; // Difference between patches - int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]); + const int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]); // Sum pixel with weight if (diff < diff_max) { - int diffidx = diff * weight_fact_table; + const int diffidx = diff * weight_fact_table; //float weight = exp(-diff*weightFact); - float weight = exptable[diffidx]; + const float weight = exptable[diffidx]; tmp_data[yc*w + xc].weight_sum += weight; tmp_data[yc*w + xc].pixel_sum += weight * compare[(yc+dy)*compare_w + xc + dx]; @@ -837,6 +831,21 @@ static int nlmeans_init(hb_filter_object_t *filter, if (pv->prefilter[c] < 0) { pv->prefilter[c] = 0; } if (pv->max_frames < pv->nframes[c]) pv->max_frames = pv->nframes[c]; + + // Precompute exponential table + float *exptable = &pv->exptable[c][0]; + float *weight_fact_table = &pv->weight_fact_table[c]; + int *diff_max = &pv->diff_max[c]; + const float weight_factor = 1.0/pv->patch_size[c]/pv->patch_size[c] / (pv->strength[c] * pv->strength[c]); + const float min_weight_in_table = 0.0005; + const float stretch = NLMEANS_EXPSIZE / (-log(min_weight_in_table)); + *(weight_fact_table) = weight_factor * stretch; + *(diff_max) = NLMEANS_EXPSIZE / *(weight_fact_table); + for (int i = 0; i < NLMEANS_EXPSIZE; i++) + { + exptable[i] = exp(-i/stretch); + } + exptable[NLMEANS_EXPSIZE-1] = 0; } pv->thread_count = hb_get_cpu_count(); @@ -981,7 +990,10 @@ static void nlmeans_filter_thread(void *thread_args_v) pv->strength[c], pv->origin_tune[c], pv->patch_size[c], - pv->range[c]); + pv->range[c], + pv->exptable[c], + pv->weight_fact_table[c], + pv->diff_max[c]); } buf->s = pv->frame[segment].s; thread_data->out = buf; @@ -1015,7 +1027,9 @@ static 
void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf) static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv) { if (pv->next_frame < pv->max_frames + pv->thread_count) + { return NULL; + } taskset_cycle(&pv->taskset); @@ -1102,7 +1116,9 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv) int nframes = pv->next_frame - f; if (pv->nframes[c] < nframes) + { nframes = pv->nframes[c]; + } // Process current plane nlmeans_plane(functions, frame, @@ -1116,7 +1132,10 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv) pv->strength[c], pv->origin_tune[c], pv->patch_size[c], - pv->range[c]); + pv->range[c], + pv->exptable[c], + pv->weight_fact_table[c], + pv->diff_max[c]); } buf->s = frame->s; if (out == NULL) diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c index 9acba22d4..685ac857e 100644 --- a/libhb/nlmeans_x86.c +++ b/libhb/nlmeans_x86.c @@ -29,21 +29,15 @@ static void build_integral_sse2(uint32_t *integral, { const __m128i zero = _mm_set1_epi8(0); - memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t)); - for (int y = 0; y < h; y++) { __m128i prevadd = _mm_set1_epi32(0); const uint8_t *p1 = src_pre + y*src_w; const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx; - uint32_t *out = integral + (y*integral_stride) - 1; - - *out++ = 0; - - const int pixels_step = 16; + uint32_t *out = integral + (y*integral_stride); - for (int x = 0; x < w; x += pixels_step) + for (int x = 0; x < w; x += 16) { __m128i pa, pb; __m128i pla, plb; @@ -116,16 +110,16 @@ static void build_integral_sse2(uint32_t *integral, _mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory // Increment - out += pixels_step; - p1 += pixels_step; - p2 += pixels_step; + out += 16; + p1 += 16; + p2 += 16; } if (y > 0) { out = integral + y*integral_stride; - for (int x = 0; x < w; x += pixels_step) + for (int x = 0; x < w; x += 16) { *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride), *(__m128i*)(out)); 
@@ -139,7 +133,7 @@ static void build_integral_sse2(uint32_t *integral, *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride), *(__m128i*)(out+12)); - out += 4*4; + out += 16; } } } |