summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbradleys <[email protected]>2015-02-06 11:05:33 +0000
committerbradleys <[email protected]>2015-02-06 11:05:33 +0000
commit487ea244e6ebe777773712d5763870f688a2e804 (patch)
treeb902d8da09da66904dc56a36f44c2ae74a7adf01
parent1141a5f323d6ce712881a2de311b750dc641119e (diff)
libhb: Minor nlmeans optimizations.
Use calloc for nlmeans integral instead of memsets in-loop zeroing. Replace superfluous const with literal in SSE implementation. Move exponential table calculation out of the main loop. More const correctness. Add some braces. Overall, slightly more readable/maintainable and (very) slightly faster. git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6874 b64f7644-9d1e-0410-96f1-a4d463321fa5
-rw-r--r--libhb/nlmeans.c135
-rw-r--r--libhb/nlmeans_x86.c20
2 files changed, 84 insertions, 71 deletions
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c
index 78422fa1f..25389ba77 100644
--- a/libhb/nlmeans.c
+++ b/libhb/nlmeans.c
@@ -131,6 +131,10 @@ struct hb_filter_private_s
int nframes[3]; // temporal search depth in frames
int prefilter[3]; // prefilter mode, can improve weight analysis
+ float exptable[3][NLMEANS_EXPSIZE];
+ float weight_fact_table[3];
+ int diff_max[3];
+
NLMeansFunctions functions;
Frame *frame;
@@ -166,7 +170,7 @@ static void nlmeans_border(uint8_t *src,
int h,
int border)
{
- int bw = w + 2 * border;
+ const int bw = w + 2 * border;
uint8_t *image = src + border + bw * border;
// Create faux borders using edge pixels
@@ -192,11 +196,14 @@ static void nlmeans_deborder(BorderedPlane *src,
int s,
int h)
{
- int bw = src->w + 2 * src->border;
+ const int bw = src->w + 2 * src->border;
uint8_t *image = src->mem + src->border + bw * src->border;
+
int width = w;
if (src->w < width)
+ {
width = src->w;
+ }
// Copy main image
for (int y = 0; y < h; y++)
@@ -213,8 +220,8 @@ static void nlmeans_alloc(uint8_t *src,
BorderedPlane *dst,
int border)
{
- int bw = src_w + 2 * border;
- int bh = src_h + 2 * border;
+ const int bw = src_w + 2 * border;
+ const int bh = src_h + 2 * border;
uint8_t *mem = malloc(bw * bh * sizeof(uint8_t));
uint8_t *image = mem + border + bw * border;
@@ -246,11 +253,11 @@ static void nlmeans_filter_mean(uint8_t *src,
{
// Mean filter
- int bw = w + 2 * border;
- int offset_min = -((size - 1) /2);
- int offset_max = (size + 1) /2;
+ const int bw = w + 2 * border;
+ const int offset_min = -((size - 1) /2);
+ const int offset_max = (size + 1) /2;
+ const double pixel_weight = 1.0 / (size * size);
uint16_t pixel_sum;
- double pixel_weight = 1.0 / (size * size);
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
@@ -341,9 +348,9 @@ static void nlmeans_filter_median(uint8_t *src,
int size)
{
// Median filter
- int bw = w + 2 * border;
- int offset_min = -((size - 1) /2);
- int offset_max = (size + 1) /2;
+ const int bw = w + 2 * border;
+ const int offset_min = -((size - 1) /2);
+ const int offset_max = (size + 1) /2;
int index;
uint8_t pixels[size * size];
for (int y = 0; y < h; y++)
@@ -371,19 +378,19 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
int h,
int border)
{
- int bw = w + 2 * border;
- int bh = h + 2 * border;
+ const int bw = w + 2 * border;
+ const int bh = h + 2 * border;
// Custom kernel
- int kernel_size = 3;
- int kernel[3][3] = {{-31, 0, 31},
- {-44, 0, 44},
- {-31, 0, 31}};
- double kernel_coef = 1.0 / 126.42;
+ const int kernel_size = 3;
+ const int kernel[3][3] = {{-31, 0, 31},
+ {-44, 0, 44},
+ {-31, 0, 31}};
+ const double kernel_coef = 1.0 / 126.42;
// Detect edges
- int offset_min = -((kernel_size - 1) /2);
- int offset_max = (kernel_size + 1) /2;
+ const int offset_min = -((kernel_size - 1) /2);
+ const int offset_max = (kernel_size + 1) /2;
uint16_t pixel1;
uint16_t pixel2;
uint8_t *mask_mem = calloc(bw * bh, sizeof(uint8_t));
@@ -487,11 +494,11 @@ static void nlmeans_prefilter(BorderedPlane *src,
// Source image
uint8_t *mem = src->mem;
uint8_t *image = src->image;
- int border = src->border;
- int w = src->w;
- int h = src->h;
- int bw = w + 2 * border;
- int bh = h + 2 * border;
+ const int border = src->border;
+ const int w = src->w;
+ const int h = src->h;
+ const int bw = w + 2 * border;
+ const int bh = h + 2 * border;
// Duplicate plane
uint8_t *mem_pre = malloc(bw * bh * sizeof(uint8_t));
@@ -584,14 +591,11 @@ static void build_integral_scalar(uint32_t *integral,
int dx,
int dy)
{
- memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
for (int y = 0; y < h; y++)
{
const uint8_t *p1 = src_pre + y*src_w;
const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
- uint32_t *out = integral + (y*integral_stride) - 1;
-
- *out++ = 0;
+ uint32_t *out = integral + (y*integral_stride);
for (int x = 0; x < w; x++)
{
@@ -625,38 +629,28 @@ static void nlmeans_plane(NLMeansFunctions *functions,
double h_param,
double origin_tune,
int n,
- int r)
+ int r,
+ const float *exptable,
+ const float weight_fact_table,
+ const int diff_max)
{
- int n_half = (n-1) /2;
- int r_half = (r-1) /2;
+ const int n_half = (n-1) /2;
+ const int r_half = (r-1) /2;
// Source image
uint8_t *src = frame[0].plane[plane].image;
uint8_t *src_pre = frame[0].plane[plane].image_pre;
- int border = frame[0].plane[plane].border;
- int src_w = frame[0].plane[plane].w + 2 * border;
+ const int border = frame[0].plane[plane].border;
+ const int src_w = frame[0].plane[plane].w + 2 * border;
// Allocate temporary pixel sums
struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
// Allocate integral image
- int integral_stride = w + 2 * 16;
- uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t));
+ const int integral_stride = w + 2 * 16;
+ uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t));
uint32_t *integral = integral_mem + integral_stride + 16;
- // Precompute exponential table
- float exptable[NLMEANS_EXPSIZE];
- const float weight_factor = 1.0/n/n / (h_param * h_param);
- const float min_weight_in_table = 0.0005;
- const float stretch = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
- const float weight_fact_table = weight_factor * stretch;
- const int diff_max = NLMEANS_EXPSIZE / weight_fact_table;
- for (int i = 0; i < NLMEANS_EXPSIZE; i++)
- {
- exptable[i] = exp(-i/stretch);
- }
- exptable[NLMEANS_EXPSIZE-1] = 0;
-
// Iterate through available frames
for (int f = 0; f < nframes; f++)
{
@@ -665,8 +659,8 @@ static void nlmeans_plane(NLMeansFunctions *functions,
// Compare image
uint8_t *compare = frame[f].plane[plane].image;
uint8_t *compare_pre = frame[f].plane[plane].image_pre;
- int border = frame[f].plane[plane].border;
- int compare_w = frame[f].plane[plane].w + 2 * border;
+ const int border = frame[f].plane[plane].border;
+ const int compare_w = frame[f].plane[plane].w + 2 * border;
// Iterate through all displacements
for (int dy = -r_half; dy <= r_half; dy++)
@@ -712,19 +706,19 @@ static void nlmeans_plane(NLMeansFunctions *functions,
for (int x = 0; x <= w-n; x++)
{
- int xc = x + n_half;
- int yc = y + n_half;
+ const int xc = x + n_half;
+ const int yc = y + n_half;
// Difference between patches
- int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
+ const int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
// Sum pixel with weight
if (diff < diff_max)
{
- int diffidx = diff * weight_fact_table;
+ const int diffidx = diff * weight_fact_table;
//float weight = exp(-diff*weightFact);
- float weight = exptable[diffidx];
+ const float weight = exptable[diffidx];
tmp_data[yc*w + xc].weight_sum += weight;
tmp_data[yc*w + xc].pixel_sum += weight * compare[(yc+dy)*compare_w + xc + dx];
@@ -837,6 +831,21 @@ static int nlmeans_init(hb_filter_object_t *filter,
if (pv->prefilter[c] < 0) { pv->prefilter[c] = 0; }
if (pv->max_frames < pv->nframes[c]) pv->max_frames = pv->nframes[c];
+
+ // Precompute exponential table
+ float *exptable = &pv->exptable[c][0];
+ float *weight_fact_table = &pv->weight_fact_table[c];
+ int *diff_max = &pv->diff_max[c];
+ const float weight_factor = 1.0/pv->patch_size[c]/pv->patch_size[c] / (pv->strength[c] * pv->strength[c]);
+ const float min_weight_in_table = 0.0005;
+ const float stretch = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
+ *(weight_fact_table) = weight_factor * stretch;
+ *(diff_max) = NLMEANS_EXPSIZE / *(weight_fact_table);
+ for (int i = 0; i < NLMEANS_EXPSIZE; i++)
+ {
+ exptable[i] = exp(-i/stretch);
+ }
+ exptable[NLMEANS_EXPSIZE-1] = 0;
}
pv->thread_count = hb_get_cpu_count();
@@ -981,7 +990,10 @@ static void nlmeans_filter_thread(void *thread_args_v)
pv->strength[c],
pv->origin_tune[c],
pv->patch_size[c],
- pv->range[c]);
+ pv->range[c],
+ pv->exptable[c],
+ pv->weight_fact_table[c],
+ pv->diff_max[c]);
}
buf->s = pv->frame[segment].s;
thread_data->out = buf;
@@ -1015,7 +1027,9 @@ static void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf)
static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv)
{
if (pv->next_frame < pv->max_frames + pv->thread_count)
+ {
return NULL;
+ }
taskset_cycle(&pv->taskset);
@@ -1102,7 +1116,9 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
int nframes = pv->next_frame - f;
if (pv->nframes[c] < nframes)
+ {
nframes = pv->nframes[c];
+ }
// Process current plane
nlmeans_plane(functions,
frame,
@@ -1116,7 +1132,10 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
pv->strength[c],
pv->origin_tune[c],
pv->patch_size[c],
- pv->range[c]);
+ pv->range[c],
+ pv->exptable[c],
+ pv->weight_fact_table[c],
+ pv->diff_max[c]);
}
buf->s = frame->s;
if (out == NULL)
diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c
index 9acba22d4..685ac857e 100644
--- a/libhb/nlmeans_x86.c
+++ b/libhb/nlmeans_x86.c
@@ -29,21 +29,15 @@ static void build_integral_sse2(uint32_t *integral,
{
const __m128i zero = _mm_set1_epi8(0);
- memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
-
for (int y = 0; y < h; y++)
{
__m128i prevadd = _mm_set1_epi32(0);
const uint8_t *p1 = src_pre + y*src_w;
const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
- uint32_t *out = integral + (y*integral_stride) - 1;
-
- *out++ = 0;
-
- const int pixels_step = 16;
+ uint32_t *out = integral + (y*integral_stride);
- for (int x = 0; x < w; x += pixels_step)
+ for (int x = 0; x < w; x += 16)
{
__m128i pa, pb;
__m128i pla, plb;
@@ -116,16 +110,16 @@ static void build_integral_sse2(uint32_t *integral,
_mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory
// Increment
- out += pixels_step;
- p1 += pixels_step;
- p2 += pixels_step;
+ out += 16;
+ p1 += 16;
+ p2 += 16;
}
if (y > 0)
{
out = integral + y*integral_stride;
- for (int x = 0; x < w; x += pixels_step)
+ for (int x = 0; x < w; x += 16)
{
*((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
*(__m128i*)(out));
@@ -139,7 +133,7 @@ static void build_integral_sse2(uint32_t *integral,
*((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),
*(__m128i*)(out+12));
- out += 4*4;
+ out += 16;
}
}
}