libhb: Minor nlmeans optimizations.

Use calloc for nlmeans integral instead of memsets in-loop zeroing. Replace superfluous const with literal in SSE implementation. Move exponential table calculation out of the main loop. More const correctness. Add some braces. Overall, slightly more readable/maintainable and (very) slightly faster. git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6874 b64f7644-9d1e-0410-96f1-a4d463321fa5
author: bradleys <[email protected]> 2015-02-06 11:05:33 +0000
committer: bradleys <[email protected]> 2015-02-06 11:05:33 +0000
commit: 487ea244e6ebe777773712d5763870f688a2e804 (patch)
tree: b902d8da09da66904dc56a36f44c2ae74a7adf01 /libhb/nlmeans.c
parent: 1141a5f323d6ce712881a2de311b750dc641119e (diff)
1 files changed, 77 insertions, 58 deletions
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c
index 78422fa1f..25389ba77 100644
--- a/libhb/nlmeans.c
+++ b/libhb/nlmeans.c
@@ -131,6 +131,10 @@ struct hb_filter_private_s
     int    nframes[3];     // temporal search depth in frames
     int    prefilter[3];   // prefilter mode, can improve weight analysis
 
+    float  exptable[3][NLMEANS_EXPSIZE];
+    float  weight_fact_table[3];
+    int    diff_max[3];
+
     NLMeansFunctions functions;
 
     Frame      *frame;
@@ -166,7 +170,7 @@ static void nlmeans_border(uint8_t *src,
                            int h,
                            int border)
 {
-    int bw = w + 2 * border;
+    const int bw = w + 2 * border;
     uint8_t *image = src + border + bw * border;
 
     // Create faux borders using edge pixels
@@ -192,11 +196,14 @@ static void nlmeans_deborder(BorderedPlane *src,
                              int s,
                              int h)
 {
-    int bw = src->w + 2 * src->border;
+    const int bw = src->w + 2 * src->border;
     uint8_t *image = src->mem + src->border + bw * src->border;
+
     int width = w;
     if (src->w < width)
+    {
         width = src->w;
+    }
 
     // Copy main image
     for (int y = 0; y < h; y++)
@@ -213,8 +220,8 @@ static void nlmeans_alloc(uint8_t *src,
                           BorderedPlane *dst,
                           int border)
 {
-    int bw = src_w + 2 * border;
-    int bh = src_h + 2 * border;
+    const int bw = src_w + 2 * border;
+    const int bh = src_h + 2 * border;
 
     uint8_t *mem   = malloc(bw * bh * sizeof(uint8_t));
     uint8_t *image = mem + border + bw * border;
@@ -246,11 +253,11 @@ static void nlmeans_filter_mean(uint8_t *src,
 {
 
     // Mean filter
-    int bw = w + 2 * border;
-    int offset_min = -((size - 1) /2);
-    int offset_max =   (size + 1) /2;
+    const int bw = w + 2 * border;
+    const int offset_min = -((size - 1) /2);
+    const int offset_max =   (size + 1) /2;
+    const double pixel_weight = 1.0 / (size * size);
     uint16_t pixel_sum;
-    double pixel_weight = 1.0 / (size * size);
     for (int y = 0; y < h; y++)
     {
         for (int x = 0; x < w; x++)
@@ -341,9 +348,9 @@ static void nlmeans_filter_median(uint8_t *src,
                                   int size)
 {
     // Median filter
-    int bw = w + 2 * border;
-    int offset_min = -((size - 1) /2);
-    int offset_max =   (size + 1) /2;
+    const int bw = w + 2 * border;
+    const int offset_min = -((size - 1) /2);
+    const int offset_max =   (size + 1) /2;
     int index;
     uint8_t pixels[size * size];
     for (int y = 0; y < h; y++)
@@ -371,19 +378,19 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
                                      int h,
                                      int border)
 {
-    int bw = w + 2 * border;
-    int bh = h + 2 * border;
+    const int bw = w + 2 * border;
+    const int bh = h + 2 * border;
 
     // Custom kernel
-    int kernel_size = 3;
-    int kernel[3][3] = {{-31, 0, 31},
-                        {-44, 0, 44},
-                        {-31, 0, 31}};
-    double kernel_coef = 1.0 / 126.42;
+    const int kernel_size = 3;
+    const int kernel[3][3] = {{-31, 0, 31},
+                              {-44, 0, 44},
+                              {-31, 0, 31}};
+    const double kernel_coef = 1.0 / 126.42;
 
     // Detect edges
-    int offset_min = -((kernel_size - 1) /2);
-    int offset_max =   (kernel_size + 1) /2;
+    const int offset_min = -((kernel_size - 1) /2);
+    const int offset_max =   (kernel_size + 1) /2;
     uint16_t pixel1;
     uint16_t pixel2;
     uint8_t *mask_mem = calloc(bw * bh, sizeof(uint8_t));
@@ -487,11 +494,11 @@ static void nlmeans_prefilter(BorderedPlane *src,
         // Source image
         uint8_t *mem   = src->mem;
         uint8_t *image = src->image;
-        int border     = src->border;
-        int w          = src->w;
-        int h          = src->h;
-        int bw         = w + 2 * border;
-        int bh         = h + 2 * border;
+        const int border     = src->border;
+        const int w          = src->w;
+        const int h          = src->h;
+        const int bw         = w + 2 * border;
+        const int bh         = h + 2 * border;
 
         // Duplicate plane
         uint8_t *mem_pre = malloc(bw * bh * sizeof(uint8_t));
@@ -584,14 +591,11 @@ static void build_integral_scalar(uint32_t *integral,
                                   int       dx,
                                   int       dy)
 {
-    memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
     for (int y = 0; y < h; y++)
     {
         const uint8_t *p1 = src_pre + y*src_w;
         const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
-        uint32_t *out = integral + (y*integral_stride) - 1;
-
-        *out++ = 0;
+        uint32_t *out = integral + (y*integral_stride);
 
         for (int x = 0; x < w; x++)
         {
@@ -625,38 +629,28 @@ static void nlmeans_plane(NLMeansFunctions *functions,
                           double h_param,
                           double origin_tune,
                           int n,
-                          int r)
+                          int r,
+                    const float *exptable,
+                    const float  weight_fact_table,
+                    const int    diff_max)
 {
-    int n_half = (n-1) /2;
-    int r_half = (r-1) /2;
+    const int n_half = (n-1) /2;
+    const int r_half = (r-1) /2;
 
     // Source image
     uint8_t *src     = frame[0].plane[plane].image;
     uint8_t *src_pre = frame[0].plane[plane].image_pre;
-    int border       = frame[0].plane[plane].border;
-    int src_w        = frame[0].plane[plane].w + 2 * border;
+    const int border = frame[0].plane[plane].border;
+    const int src_w  = frame[0].plane[plane].w + 2 * border;
 
     // Allocate temporary pixel sums
     struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
 
     // Allocate integral image
-    int integral_stride = w + 2 * 16;
-    uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t));
+    const int integral_stride = w + 2 * 16;
+    uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t));
     uint32_t *integral     = integral_mem + integral_stride + 16;
 
-    // Precompute exponential table
-    float exptable[NLMEANS_EXPSIZE];
-    const float weight_factor       = 1.0/n/n / (h_param * h_param);
-    const float min_weight_in_table = 0.0005;
-    const float stretch             = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
-    const float weight_fact_table   = weight_factor * stretch;
-    const int   diff_max            = NLMEANS_EXPSIZE / weight_fact_table;
-    for (int i = 0; i < NLMEANS_EXPSIZE; i++)
-    {
-        exptable[i] = exp(-i/stretch);
-    }
-    exptable[NLMEANS_EXPSIZE-1] = 0;
-
     // Iterate through available frames
     for (int f = 0; f < nframes; f++)
     {
@@ -665,8 +659,8 @@ static void nlmeans_plane(NLMeansFunctions *functions,
         // Compare image
         uint8_t *compare     = frame[f].plane[plane].image;
         uint8_t *compare_pre = frame[f].plane[plane].image_pre;
-        int border           = frame[f].plane[plane].border;
-        int compare_w        = frame[f].plane[plane].w + 2 * border;
+        const int border     = frame[f].plane[plane].border;
+        const int compare_w  = frame[f].plane[plane].w + 2 * border;
 
         // Iterate through all displacements
         for (int dy = -r_half; dy <= r_half; dy++)
@@ -712,19 +706,19 @@ static void nlmeans_plane(NLMeansFunctions *functions,
 
                     for (int x = 0; x <= w-n; x++)
                     {
-                        int xc = x + n_half;
-                        int yc = y + n_half;
+                        const int xc = x + n_half;
+                        const int yc = y + n_half;
 
                         // Difference between patches
-                        int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
+                        const int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
 
                         // Sum pixel with weight
                         if (diff < diff_max)
                         {
-                            int diffidx = diff * weight_fact_table;
+                            const int diffidx = diff * weight_fact_table;
 
                             //float weight = exp(-diff*weightFact);
-                            float weight = exptable[diffidx];
+                            const float weight = exptable[diffidx];
 
                             tmp_data[yc*w + xc].weight_sum += weight;
                             tmp_data[yc*w + xc].pixel_sum  += weight * compare[(yc+dy)*compare_w + xc + dx];
@@ -837,6 +831,21 @@ static int nlmeans_init(hb_filter_object_t *filter,
         if (pv->prefilter[c] < 0)       { pv->prefilter[c] = 0; }
 
         if (pv->max_frames < pv->nframes[c]) pv->max_frames = pv->nframes[c];
+
+        // Precompute exponential table
+        float *exptable = &pv->exptable[c][0];
+        float *weight_fact_table = &pv->weight_fact_table[c];
+        int   *diff_max = &pv->diff_max[c];
+        const float weight_factor        = 1.0/pv->patch_size[c]/pv->patch_size[c] / (pv->strength[c] * pv->strength[c]);
+        const float min_weight_in_table  = 0.0005;
+        const float stretch              = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
+        *(weight_fact_table)             = weight_factor * stretch;
+        *(diff_max)                      = NLMEANS_EXPSIZE / *(weight_fact_table);
+        for (int i = 0; i < NLMEANS_EXPSIZE; i++)
+        {
+            exptable[i] = exp(-i/stretch);
+        }
+        exptable[NLMEANS_EXPSIZE-1] = 0;
     }
 
     pv->thread_count = hb_get_cpu_count();
@@ -981,7 +990,10 @@ static void nlmeans_filter_thread(void *thread_args_v)
                           pv->strength[c],
                           pv->origin_tune[c],
                           pv->patch_size[c],
-                          pv->range[c]);
+                          pv->range[c],
+                          pv->exptable[c],
+                          pv->weight_fact_table[c],
+                          pv->diff_max[c]);
         }
         buf->s = pv->frame[segment].s;
         thread_data->out = buf;
@@ -1015,7 +1027,9 @@ static void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf)
 static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv)
 {
     if (pv->next_frame < pv->max_frames + pv->thread_count)
+    {
         return NULL;
+    }
 
     taskset_cycle(&pv->taskset);
 
@@ -1102,7 +1116,9 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
 
             int nframes = pv->next_frame - f;
             if (pv->nframes[c] < nframes)
+            {
                 nframes = pv->nframes[c];
+            }
             // Process current plane
             nlmeans_plane(functions,
                           frame,
@@ -1116,7 +1132,10 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
                           pv->strength[c],
                           pv->origin_tune[c],
                           pv->patch_size[c],
-                          pv->range[c]);
+                          pv->range[c],
+                          pv->exptable[c],
+                          pv->weight_fact_table[c],
+                          pv->diff_max[c]);
         }
         buf->s = frame->s;
         if (out == NULL)
author	bradleys <[email protected]>	2015-02-06 11:05:33 +0000
committer	bradleys <[email protected]>	2015-02-06 11:05:33 +0000
commit	487ea244e6ebe777773712d5763870f688a2e804 (patch)
tree	b902d8da09da66904dc56a36f44c2ae74a7adf01 /libhb/nlmeans.c
parent	1141a5f323d6ce712881a2de311b750dc641119e (diff)