5 files changed, 953 insertions, 0 deletions
diff --git a/libhb/common.c b/libhb/common.c
index 60b926d94..6ba81f762 100644
--- a/libhb/common.c
+++ b/libhb/common.c
@@ -3247,6 +3247,10 @@ hb_filter_object_t * hb_filter_init( int filter_id )
             filter = &hb_filter_denoise;
             break;
 
+        case HB_FILTER_NLMEANS:
+            filter = &hb_filter_nlmeans;
+            break;
+
         case HB_FILTER_RENDER_SUB:
             filter = &hb_filter_render_sub;
             break;
diff --git a/libhb/common.h b/libhb/common.h
index 778918db2..2a57b0534 100644
--- a/libhb/common.h
+++ b/libhb/common.h
@@ -1210,6 +1210,7 @@ enum
     // Filters that must operate on the original source image are next
     HB_FILTER_DEBLOCK,
     HB_FILTER_DENOISE,
+    HB_FILTER_NLMEANS,
     HB_FILTER_RENDER_SUB,
     HB_FILTER_CROP_SCALE,
 
diff --git a/libhb/internal.h b/libhb/internal.h
index e3b81cd0c..2f7cbacc1 100644
--- a/libhb/internal.h
+++ b/libhb/internal.h
@@ -433,6 +433,7 @@ extern hb_filter_object_t hb_filter_detelecine;
 extern hb_filter_object_t hb_filter_deinterlace;
 extern hb_filter_object_t hb_filter_deblock;
 extern hb_filter_object_t hb_filter_denoise;
+extern hb_filter_object_t hb_filter_nlmeans;
 extern hb_filter_object_t hb_filter_decomb;
 extern hb_filter_object_t hb_filter_rotate;
 extern hb_filter_object_t hb_filter_crop_scale;
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c
new file mode 100644
index 000000000..ad3f1d393
--- /dev/null
+++ b/libhb/nlmeans.c
@@ -0,0 +1,742 @@
+/* nlmeans.c
+
+   Copyright (c) 2013 Dirk Farin
+   Copyright (c) 2003-2014 HandBrake Team
+   This file is part of the HandBrake source code
+   Homepage: <http://handbrake.fr/>.
+   It may be used under the terms of the GNU General Public License v2.
+   For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+ */
+
+/* Usage
+ *
+ * Parameters:
+ *     lumaY_strength   : lumaY_origin_tune   : lumaY_patch_size   : lumaY_range   : lumaY_frames   : lumaY_prefilter   :
+ *     chromaB_strength : chromaB_origin_tune : chromaB_patch_size : chromaB_range : chromaB_frames : chromaB_prefilter :
+ *     chromaR_strength : chromaR_origin_tune : chromaR_patch_size : chromaR_range : chromaR_frames : chromaR_prefilter
+ *
+ * Defaults:
+ *     8:1:7:3:2:0 for each channel (equivalent to 8:1:7:3:2:0:8:1:7:3:2:0:8:1:7:3:2:0)
+ *
+ * Parameters cascade, e.g. 6:0.8:7:3:3:0:4:1 sets:
+ *     strength 6, origin tune 0.8 for luma
+ *     patch size 7, range 3, frames 3, prefilter 0 for all channels
+ *     strength 4, origin tune 1 for both chroma channels
+ *
+ * Strength is relative and must be adjusted; ALL parameters affect overall strength.
+ * Lower origin tune improves results for noisier input or animation (film 0.5-1, animation 0.15-0.5).
+ * Large patch size (>9) may greatly reduce quality by clobbering detail.
+ * Larger search range increases quality; however, computation time increases exponentially.
+ * Large number of frames (film >3, animation >6) may cause temporal smearing.
+ * Prefiltering can potentially improve weight decisions, yielding better results for difficult sources.
+ *
+ * Prefilter enum combos:
+ *     1: Mean 3x3
+ *     2: Mean 5x5
+ *     3: Mean 5x5 (overrides Mean 3x3)
+ *   257: Mean 3x3 reduced by 25%
+ *   258: Mean 5x5 reduced by 25%
+ *   513: Mean 3x3 reduced by 50%
+ *   514: Mean 5x5 reduced by 50%
+ *   769: Mean 3x3 reduced by 75%
+ *   770: Mean 5x5 reduced by 75%
+ *  1025: Mean 3x3 plus edge boost (restores lost edge detail)
+ *  1026: Mean 5x5 plus edge boost
+ *  1281: Mean 3x3 reduced by 25% plus edge boost
+ *        etc...
+ *  2049: Mean 3x3 passthru (NL-means off, prefilter is the output)
+ *        etc...
+ *  3329: Mean 3x3 reduced by 25% plus edge boost, passthru
+ *        etc...
+ */
+
+#include "hb.h"
+#include "hbffmpeg.h"
+
+#define NLMEANS_STRENGTH_LUMA_DEFAULT      8
+#define NLMEANS_STRENGTH_CHROMA_DEFAULT    8
+#define NLMEANS_ORIGIN_TUNE_LUMA_DEFAULT   1
+#define NLMEANS_ORIGIN_TUNE_CHROMA_DEFAULT 1
+#define NLMEANS_PATCH_SIZE_LUMA_DEFAULT    7
+#define NLMEANS_PATCH_SIZE_CHROMA_DEFAULT  7
+#define NLMEANS_RANGE_LUMA_DEFAULT         3
+#define NLMEANS_RANGE_CHROMA_DEFAULT       3
+#define NLMEANS_FRAMES_LUMA_DEFAULT        2
+#define NLMEANS_FRAMES_CHROMA_DEFAULT      2
+#define NLMEANS_PREFILTER_LUMA_DEFAULT     0
+#define NLMEANS_PREFILTER_CHROMA_DEFAULT   0
+
+#define NLMEANS_PREFILTER_MODE_MEAN3X3       1
+#define NLMEANS_PREFILTER_MODE_MEAN5X5       2
+#define NLMEANS_PREFILTER_MODE_RESERVED4     4 // Reserved
+#define NLMEANS_PREFILTER_MODE_RESERVED8     8 // Reserved
+#define NLMEANS_PREFILTER_MODE_RESERVED16   16 // Reserved
+#define NLMEANS_PREFILTER_MODE_RESERVED32   32 // Reserved
+#define NLMEANS_PREFILTER_MODE_RESERVED64   64 // Reserved
+#define NLMEANS_PREFILTER_MODE_RESERVED128 128 // Reserved
+#define NLMEANS_PREFILTER_MODE_REDUCE25    256
+#define NLMEANS_PREFILTER_MODE_REDUCE50    512
+#define NLMEANS_PREFILTER_MODE_EDGEBOOST  1024
+#define NLMEANS_PREFILTER_MODE_PASSTHRU   2048
+
+#define NLMEANS_FRAMES_MAX 32
+#define NLMEANS_EXPSIZE    128
+
+typedef struct
+{
+    uint8_t *mem;
+    uint8_t *mem_pre;
+    uint8_t *image;
+    uint8_t *image_pre;
+    int w;
+    int h;
+    int border;
+} BorderedPlane;
+
+struct PixelSum
+{
+    float weight_sum;
+    float pixel_sum;
+};
+
+struct hb_filter_private_s
+{
+    double strength[3];    // averaging weight decay, larger produces smoother output
+    double origin_tune[3]; // weight tuning for origin patch, 0.00..1.00
+    int    patch_size[3];  // pixel context region width  (must be odd)
+    int    range[3];       // spatial search window width (must be odd)
+    int    frames[3];      // temporal search depth in frames
+    int    prefilter[3];   // prefilter mode, can improve weight analysis
+
+    BorderedPlane frame_tmp[3][32];
+    int           frame_ready[3][32];
+};
+
+static int hb_nlmeans_init(hb_filter_object_t *filter,
+                           hb_filter_init_t   *init);
+
+static int hb_nlmeans_work(hb_filter_object_t *filter,
+                           hb_buffer_t **buf_in,
+                           hb_buffer_t **buf_out);
+
+static void hb_nlmeans_close(hb_filter_object_t *filter);
+
+hb_filter_object_t hb_filter_nlmeans =
+{
+    .id            = HB_FILTER_NLMEANS,
+    .enforce_order = 1,
+    .name          = "Denoise (nlmeans)",
+    .settings      = NULL,
+    .init          = hb_nlmeans_init,
+    .work          = hb_nlmeans_work,
+    .close         = hb_nlmeans_close,
+};
+
+static void nlmeans_border(uint8_t *src,
+                           int w,
+                           int h,
+                           int border)
+{
+
+    uint8_t *image = src + border + w*border;
+    int iw = w - 2*border;
+    int ih = h - 2*border;
+
+    // Create faux borders using edge pixels
+    for (int y = 0; y < ih; y++)
+    {
+        for (int x = 0; x < border; x++)
+        {
+            *(image + y*w - x - 1)  = *(image + y*w + x);
+            *(image + y*w + x + iw) = *(image + y*w - x + (iw-1));
+        }
+    }
+    for (int y = 0; y < border; y++)
+    {
+        memcpy(image - border -  (y+1)*w, image - border +        y*w, w);
+        memcpy(image - border + (y+ih)*w, image - border + (ih-y-1)*w, w);
+    }
+
+}
+
+static void nlmeans_deborder(uint8_t *src,
+                             uint8_t *dst,
+                             int w,
+                             int h,
+                             int border)
+{
+
+    uint8_t *image = src + border + w*border;
+    int iw = w - 2*border;
+    int ih = h - 2*border;
+
+    // Copy main image
+    for (int y = 0; y < ih; y++)
+    {
+        memcpy(dst + y*iw, image + y*w, iw);
+    }
+
+}
+
+static void nlmeans_alloc(uint8_t *src,
+                          int src_w,
+                          int src_h,
+                          BorderedPlane *dst,
+                          int dst_w,
+                          int dst_h,
+                          int border)
+{
+
+    uint8_t *mem   = malloc(dst_w * dst_h * sizeof(uint8_t));
+    uint8_t *image = mem + border + dst_w*border;
+
+    // Copy main image
+    for (int y = 0; y < src_h; y++)
+    {
+        memcpy(image + y*dst_w, src + y*src_w, src_w);
+    }
+
+    dst->mem       = mem;
+    dst->image     = image;
+    dst->w         = dst_w;
+    dst->h         = dst_h;
+    dst->border    = border;
+
+    nlmeans_border(dst->mem, dst->w, dst->h, dst->border);
+    dst->mem_pre   = dst->mem;
+    dst->image_pre = dst->image;
+
+}
+
+static void nlmeans_filter_mean(uint8_t *src,
+                                uint8_t *dst,
+                                int w,
+                                int h,
+                                int border,
+                                int size)
+{
+
+    // Mean filter
+    int iw = w - 2*border;
+    int ih = h - 2*border;
+    int offset_min = -((size - 1) /2);
+    int offset_max =   (size + 1) /2;
+    uint16_t pixel_sum;
+    double pixel_weight = 1.0 / (size * size);
+    for (int y = 0; y < ih; y++)
+    {
+        for (int x = 0; x < iw; x++)
+        {
+            pixel_sum = 0;
+            for (int k = offset_min; k < offset_max; k++)
+            {
+                for (int j = offset_min; j < offset_max; j++)
+                {
+                    pixel_sum = pixel_sum + *(src + w*(y+j) + (x+k));
+                }
+            }
+            *(dst + w*y + x) = (uint8_t)(pixel_sum * pixel_weight);
+        }
+    }
+
+}
+
+static void nlmeans_filter_edgeboost(uint8_t *src,
+                                     uint8_t *dst,
+                                     int w,
+                                     int h,
+                                     int border)
+{
+
+    int iw = w - 2*border;
+    int ih = h - 2*border;
+
+    // Custom kernel
+    int kernel_size = 3;
+    int kernel[3][3] = {{-31, 0, 31},
+                        {-44, 0, 44},
+                        {-31, 0, 31}};
+    double kernel_coef = 1.0 / 27.0;
+
+    // Detect edges
+    int offset_min = -((kernel_size - 1) /2);
+    int offset_max =   (kernel_size + 1) /2;
+    uint16_t pixel1;
+    uint16_t pixel2;
+    uint8_t *mask = malloc(iw * ih * sizeof(uint8_t));
+    for (int y = 0; y < ih; y++)
+    {
+        for (int x = 0; x < iw; x++)
+        {
+            pixel1 = 0;
+            pixel2 = 0;
+            for (int k = offset_min; k < offset_max; k++)
+            {
+                for (int j = offset_min; j < offset_max; j++)
+                {
+                    pixel1 += kernel[j+1][k+1] * *(src + w*(y+j) + (x+k));
+                    pixel2 += kernel[k+1][j+1] * *(src + w*(y+j) + (x+k));
+                }
+            }
+            pixel1 = pixel1 > 0 ? pixel1 : -pixel1;
+            pixel2 = pixel2 > 0 ? pixel2 : -pixel2;
+            pixel1 = (uint16_t)(((double)pixel1 * kernel_coef) + 128);
+            pixel2 = (uint16_t)(((double)pixel2 * kernel_coef) + 128);
+            *(mask + iw*y + x) = (uint8_t)(pixel1 + pixel2);
+            if (*(mask + iw*y + x) > 160)
+            {
+                *(dst + w*y + x) = (3 * *(src + w*y + x) + 1 * *(dst + w*y + x)) /4;
+                //*(dst + w*y + x) = 235;
+            }
+            else if (*(mask + iw*y + x) > 88)
+            {
+                *(dst + w*y + x) = (2 * *(src + w*y + x) + 3 * *(dst + w*y + x)) /5;
+                //*(dst + w*y + x) = 128;
+            }
+            else
+            {
+                //*(dst + w*y + x) = 16;
+            }
+        }
+    }
+
+    free(mask);
+
+}
+
+static void nlmeans_prefilter(BorderedPlane *src,
+                              int filter_type)
+{
+
+    if (filter_type & NLMEANS_PREFILTER_MODE_MEAN3X3 ||
+        filter_type & NLMEANS_PREFILTER_MODE_MEAN5X5)
+    {
+
+        // Source image
+        uint8_t *mem   = src->mem;
+        uint8_t *image = src->image;
+        int w          = src->w;
+        int h          = src->h;
+        int border     = src->border;
+
+        // Duplicate plane
+        uint8_t *mem_pre = malloc(w * h * sizeof(uint8_t));
+        uint8_t *image_pre = mem_pre + border + w*border;
+        for (int y = 0; y < h; y++)
+        {
+            memcpy(mem_pre + y*w, mem + y*w, w);
+        }
+
+        // Filter plane; should already have at least 2px extra border on each side
+        if (filter_type & NLMEANS_PREFILTER_MODE_MEAN5X5)
+        {
+            // Mean 5x5
+            nlmeans_filter_mean(image, image_pre, w, h, border, 5);
+        }
+        else if (filter_type & NLMEANS_PREFILTER_MODE_MEAN3X3)
+        {
+            // Mean 3x3
+            nlmeans_filter_mean(image, image_pre, w, h, border, 3);
+        }
+
+        // Restore edges
+        if (filter_type & NLMEANS_PREFILTER_MODE_EDGEBOOST)
+        {
+            nlmeans_filter_edgeboost(image, image_pre, w, h, border);
+        }
+
+        // Blend source and destination for lesser effect
+        int wet = 1;
+        int dry = 0;
+        if (filter_type & NLMEANS_PREFILTER_MODE_REDUCE50 &&
+            filter_type & NLMEANS_PREFILTER_MODE_REDUCE25)
+        {
+            wet = 1;
+            dry = 3;
+        }
+        else if (filter_type & NLMEANS_PREFILTER_MODE_REDUCE50)
+        {
+            wet = 1;
+            dry = 1;
+        }
+        else if (filter_type & NLMEANS_PREFILTER_MODE_REDUCE25)
+        {
+            wet = 3;
+            dry = 1;
+        }
+        if (dry > 0)
+        {
+            for (int y = 0; y < h; y++)
+            {
+                for (int x = 0; x < w; x++)
+                {
+                    *(mem_pre + w*y + x) = (uint8_t)((wet * *(mem_pre + w*y + x) + dry * *(mem + w*y + x)) / (wet + dry));
+                }
+            }
+        }
+
+        // Assign result
+        src->mem_pre   = mem_pre;
+        src->image_pre = image_pre;
+
+        // Recreate borders
+        nlmeans_border(mem_pre, w, h, border);
+
+    }
+
+}
+
+static void nlmeans_plane(BorderedPlane *plane_tmp,
+                          int *plane_ready,
+                          uint8_t *dst,
+                          int w,
+                          int h,
+                          double h_param,
+                          double origin_tune,
+                          int n,
+                          int r)
+{
+
+    int n_half = (n-1) /2;
+    int r_half = (r-1) /2;
+
+    // Source image
+    uint8_t *src     = plane_tmp[0].image;
+    uint8_t *src_pre = plane_tmp[0].image_pre;
+    int src_w        = plane_tmp[0].w;
+
+    // Allocate temporary pixel sums
+    struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
+
+    // Allocate integral image
+    int integral_stride = w + 2*16;
+    uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t));
+    uint32_t *integral     = integral_mem + integral_stride + 16;
+
+    // Precompute exponential table
+    float exptable[NLMEANS_EXPSIZE];
+    const float weight_factor       = 1.0/n/n / (h_param * h_param);
+    const float min_weight_in_table = 0.0005;
+    const float stretch             = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
+    const float weight_fact_table   = weight_factor * stretch;
+    const int   diff_max            = NLMEANS_EXPSIZE / weight_fact_table;
+    for (int i = 0; i < NLMEANS_EXPSIZE; i++)
+    {
+        exptable[i] = exp(-i/stretch);
+    }
+    exptable[NLMEANS_EXPSIZE-1] = 0;
+
+    // Iterate through available frames
+    for (int plane_index = 0; plane_ready[plane_index] == 1; plane_index++)
+    {
+
+        // Compare image
+        uint8_t *compare     = plane_tmp[plane_index].image;
+        uint8_t *compare_pre = plane_tmp[plane_index].image_pre;
+        int compare_w        = plane_tmp[plane_index].w;
+
+        // Iterate through all displacements
+        for (int dy = -r_half; dy <= r_half; dy++)
+        {
+            for (int dx = -r_half; dx <= r_half; dx++)
+            {
+
+                // Apply special weight tuning to origin patch
+                if (dx == 0 && dy == 0 && plane_index == 0)
+                {
+                    // TODO: Parallelize this
+                    for (int y = n_half; y < h-n + n_half; y++)
+                    {
+                        for (int x = n_half; x < w-n + n_half; x++)
+                        {
+                            tmp_data[y*w + x].weight_sum += origin_tune;
+                            tmp_data[y*w + x].pixel_sum  += origin_tune * src[y*src_w + x];
+                        }
+                    }
+                    continue;
+                }
+
+                // Build integral
+                memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
+                for (int y = 0; y < h; y++)
+                {
+                    const uint8_t *p1 = src_pre + y*src_w;
+                    const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
+                    uint32_t *out = integral + (y*integral_stride) - 1;
+
+                    *out++ = 0;
+
+                    for (int x = 0; x < w; x++)
+                    {
+                        int diff = *p1++ - *p2++;
+                        *out = *(out-1) + diff * diff;
+                        out++;
+                    }
+
+                    if (y > 0)
+                    {
+                        out = integral + y*integral_stride;
+
+                        for (int x = 0; x < w; x++)
+                        {
+                            *out += *(out - integral_stride);
+                            out++;
+                        }
+                    }
+                }
+
+                // Average displacement
+                // TODO: Parallelize this
+                for (int y = 0; y <= h-n; y++)
+                {
+                    const uint32_t *integral_ptr1 = integral + (y  -1)*integral_stride - 1;
+                    const uint32_t *integral_ptr2 = integral + (y+n-1)*integral_stride - 1;
+
+                    for (int x = 0; x <= w-n; x++)
+                    {
+                        int xc = x + n_half;
+                        int yc = y + n_half;
+
+                        // Difference between patches
+                        int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
+
+                        // Sum pixel with weight
+                        if (diff < diff_max)
+                        {
+                            int diffidx = diff * weight_fact_table;
+
+                            //float weight = exp(-diff*weightFact);
+                            float weight = exptable[diffidx];
+
+                            tmp_data[yc*w + xc].weight_sum += weight;
+                            tmp_data[yc*w + xc].pixel_sum  += weight * compare[(yc+dy)*compare_w + xc + dx];
+                        }
+
+                        integral_ptr1++;
+                        integral_ptr2++;
+                    }
+                }
+            }
+        }
+    }
+
+    // Copy edges
+    for (int y = 0; y < h; y++)
+    {
+        for (int x = 0; x < n_half; x++)
+        {
+            *(dst + y*w + x)         = *(src + y*src_w - x - 1);
+            *(dst + y*w - x + (w-1)) = *(src + y*src_w + x + w);
+        }
+    }
+    for (int y = 0; y < n_half; y++)
+    {
+        memcpy(dst +       y*w, src - (y+1)*src_w, w);
+        memcpy(dst + (h-y-1)*w, src + (y+h)*src_w, w);
+    }
+
+    // Copy main image
+    uint8_t result;
+    for (int y = n_half; y < h-n_half; y++)
+    {
+        for (int x = n_half; x < w-n_half; x++)
+        {
+            result = (uint8_t)(tmp_data[y*w + x].pixel_sum / tmp_data[y*w + x].weight_sum);
+            *(dst + y*w + x) = result ? result : *(src + y*src_w + x);
+        }
+    }
+
+    free(tmp_data);
+    free(integral_mem);
+
+}
+
+static int hb_nlmeans_init(hb_filter_object_t *filter,
+                           hb_filter_init_t *init)
+{
+    filter->private_data = calloc(sizeof(struct hb_filter_private_s), 1);
+    hb_filter_private_t *pv = filter->private_data;
+
+    // Mark parameters unset
+    for (int c = 0; c < 3; c++)
+    {
+        pv->strength[c]    = -1;
+        pv->origin_tune[c] = -1;
+        pv->patch_size[c]  = -1;
+        pv->range[c]       = -1;
+        pv->frames[c]      = -1;
+        pv->prefilter[c]   = -1;
+    }
+
+    // Read user parameters
+    if (filter->settings != NULL)
+    {
+        sscanf(filter->settings, "%lf:%lf:%d:%d:%d:%d:%lf:%lf:%d:%d:%d:%d:%lf:%lf:%d:%d:%d:%d",
+               &pv->strength[0], &pv->origin_tune[0], &pv->patch_size[0], &pv->range[0], &pv->frames[0], &pv->prefilter[0],
+               &pv->strength[1], &pv->origin_tune[1], &pv->patch_size[1], &pv->range[1], &pv->frames[1], &pv->prefilter[1],
+               &pv->strength[2], &pv->origin_tune[2], &pv->patch_size[2], &pv->range[2], &pv->frames[2], &pv->prefilter[2]);
+    }
+
+    // Cascade values
+    // Cr not set; inherit Cb. Cb not set; inherit Y. Y not set; defaults.
+    for (int c = 1; c < 3; c++)
+    {
+        if (pv->strength[c]    == -1) { pv->strength[c]    = pv->strength[c-1]; }
+        if (pv->origin_tune[c] == -1) { pv->origin_tune[c] = pv->origin_tune[c-1]; }
+        if (pv->patch_size[c]  == -1) { pv->patch_size[c]  = pv->patch_size[c-1]; }
+        if (pv->range[c]       == -1) { pv->range[c]       = pv->range[c-1]; }
+        if (pv->frames[c]      == -1) { pv->frames[c]      = pv->frames[c-1]; }
+        if (pv->prefilter[c]   == -1) { pv->prefilter[c]   = pv->prefilter[c-1]; }
+    }
+
+    for (int c = 0; c < 3; c++)
+    {
+        // Replace unset values with defaults
+        if (pv->strength[c]    == -1) { pv->strength[c]    = c ? NLMEANS_STRENGTH_LUMA_DEFAULT    : NLMEANS_STRENGTH_CHROMA_DEFAULT; }
+        if (pv->origin_tune[c] == -1) { pv->origin_tune[c] = c ? NLMEANS_ORIGIN_TUNE_LUMA_DEFAULT : NLMEANS_ORIGIN_TUNE_CHROMA_DEFAULT; }
+        if (pv->patch_size[c]  == -1) { pv->patch_size[c]  = c ? NLMEANS_PATCH_SIZE_LUMA_DEFAULT  : NLMEANS_PATCH_SIZE_CHROMA_DEFAULT; }
+        if (pv->range[c]       == -1) { pv->range[c]       = c ? NLMEANS_RANGE_LUMA_DEFAULT       : NLMEANS_RANGE_CHROMA_DEFAULT; }
+        if (pv->frames[c]      == -1) { pv->frames[c]      = c ? NLMEANS_FRAMES_LUMA_DEFAULT      : NLMEANS_FRAMES_CHROMA_DEFAULT; }
+        if (pv->prefilter[c]   == -1) { pv->prefilter[c]   = c ? NLMEANS_PREFILTER_LUMA_DEFAULT   : NLMEANS_PREFILTER_CHROMA_DEFAULT; }
+
+        // Sanitize
+        if (pv->strength[c] < 0)        { pv->strength[c] = 0; }
+        if (pv->origin_tune[c] < 0.01)  { pv->origin_tune[c] = 0.01; } // avoid black artifacts
+        if (pv->origin_tune[c] > 1)     { pv->origin_tune[c] = 1; }
+        if (pv->patch_size[c] % 2 == 0) { pv->patch_size[c]--; }
+        if (pv->patch_size[c] < 1)      { pv->patch_size[c] = 1; }
+        if (pv->range[c] % 2 == 0)      { pv->range[c]--; }
+        if (pv->range[c] < 1)           { pv->range[c] = 1; }
+        if (pv->frames[c] < 1)          { pv->frames[c] = 1; }
+        if (pv->frames[c] > NLMEANS_FRAMES_MAX) { pv->frames[c] = NLMEANS_FRAMES_MAX; }
+        if (pv->prefilter[c] < 0)       { pv->prefilter[c] = 0; }
+
+        // Mark buffer empty
+        for (int f = 0; f < NLMEANS_FRAMES_MAX; f++)
+        {
+            pv->frame_ready[c][f] = 0;
+        }
+    }
+
+    return 0;
+}
+
+static void hb_nlmeans_close(hb_filter_object_t *filter)
+{
+    hb_filter_private_t *pv = filter->private_data;
+
+    if (pv == NULL)
+    {
+        return;
+    }
+
+    for (int c = 0; c < 3; c++)
+    {
+        for (int f = 0; f < pv->frames[c]; f++)
+        {
+            if (pv->frame_tmp[c][f].mem_pre != NULL &&
+                pv->frame_tmp[c][f].mem_pre != pv->frame_tmp[c][f].mem)
+            {
+                free(pv->frame_tmp[c][f].mem_pre);
+                pv->frame_tmp[c][f].mem_pre = NULL;
+            }
+            if (pv->frame_tmp[c][f].mem != NULL)
+            {
+                free(pv->frame_tmp[c][f].mem);
+                pv->frame_tmp[c][f].mem = NULL;
+            }
+        }
+    }
+
+    free(pv);
+    filter->private_data = NULL;
+}
+
+static int hb_nlmeans_work(hb_filter_object_t *filter,
+                           hb_buffer_t **buf_in,
+                           hb_buffer_t **buf_out )
+{
+    hb_filter_private_t *pv = filter->private_data;
+    hb_buffer_t *in = *buf_in, *out;
+
+    if (in->size <= 0)
+    {
+        *buf_out = in;
+        *buf_in  = NULL;
+        return HB_FILTER_DONE;
+    }
+
+    out = hb_video_buffer_init(in->f.width, in->f.height);
+
+    for (int c = 0; c < 3; c++)
+    {
+
+        if (pv->strength[c] == 0)
+        {
+            out->plane[c].data = in->plane[c].data;
+            continue;
+        }
+
+        int frames = pv->frames[c];
+
+        // Release last frame in buffer
+        if (pv->frame_tmp[c][frames-1].mem_pre != NULL &&
+            pv->frame_tmp[c][frames-1].mem_pre != pv->frame_tmp[c][frames-1].mem)
+        {
+            free(pv->frame_tmp[c][frames-1].mem_pre);
+            pv->frame_tmp[c][frames-1].mem_pre = NULL;
+        }
+        if (pv->frame_tmp[c][frames-1].mem != NULL)
+        {
+            free(pv->frame_tmp[c][frames-1].mem);
+            pv->frame_tmp[c][frames-1].mem = NULL;
+        }
+        pv->frame_ready[c][frames-1] = 0;
+
+        // Shift frames in buffer down one level
+        for (int f = frames-1; f > 0; f--)
+        {
+            pv->frame_tmp[c][f]   = pv->frame_tmp[c][f-1];
+            pv->frame_ready[c][f] = pv->frame_ready[c][f-1];
+        }
+
+        // Extend copy of plane with extra border and place in buffer
+        int border = ((pv->range[c] + 2) / 2 + 15) /16*16;
+        int w = in->plane[c].stride + 2*border;
+        int h = in->plane[c].height + 2*border;
+        nlmeans_alloc(in->plane[c].data,
+                      in->plane[c].stride,
+                      in->plane[c].height,
+                      &pv->frame_tmp[c][0],
+                      w,
+                      h,
+                      border);
+        nlmeans_prefilter(&pv->frame_tmp[c][0], pv->prefilter[c]);
+        pv->frame_ready[c][0] = 1;
+
+        if (pv->prefilter[c] & NLMEANS_PREFILTER_MODE_PASSTHRU)
+        {
+            nlmeans_deborder(pv->frame_tmp[c][0].mem_pre, out->plane[c].data, w, h, border);
+            continue;
+        }
+
+        // Process current plane
+        nlmeans_plane(pv->frame_tmp[c],
+                      pv->frame_ready[c],
+                      out->plane[c].data,
+                      in->plane[c].stride,
+                      in->plane[c].height,
+                      pv->strength[c],
+                      pv->origin_tune[c],
+                      pv->patch_size[c],
+                      pv->range[c]);
+
+    }
+
+    out->s = in->s;
+    hb_buffer_move_subs(out, in);
+
+    *buf_out = out;
+
+    return HB_FILTER_OK;
+}
diff --git a/test/test.c b/test/test.c
index c35a8fc96..249f8adcf 100644
--- a/test/test.c
+++ b/test/test.c
@@ -60,6 +60,9 @@ static int    deblock               = 0;
 static char * deblock_opt           = 0;
 static int    denoise               = 0;
 static char * denoise_opt           = 0;
+static int    nlmeans               = 0;
+static char * nlmeans_opt           = NULL;
+static char * nlmeans_tune_opt      = NULL;
 static int    detelecine            = 0;
 static char * detelecine_opt        = 0;
 static int    decomb                = 0;
@@ -435,6 +438,8 @@ int main( int argc, char ** argv )
     free(advanced_opts);
     free(h264_profile);
     free(h264_level);
+    free(nlmeans_opt);
+    free(nlmeans_tune_opt);
 
     // write a carriage return to stdout
     // avoids overlap / line wrapping when stderr is redirected
@@ -1705,6 +1710,11 @@ static int HandleEvents( hb_handle_t * h )
                 filter = hb_filter_init( HB_FILTER_DENOISE );
                 hb_add_filter( job, filter, denoise_opt );
             }
+            if( nlmeans )
+            {
+                filter = hb_filter_init( HB_FILTER_NLMEANS );
+                hb_add_filter( job, filter, nlmeans_opt );
+            }
             if( rotate )
             {
                 filter = hb_filter_init( HB_FILTER_ROTATE );
@@ -3464,6 +3474,15 @@ if (hb_qsv_available())
      "           or\n"
      "          <SL:SCb:SCr:TL:TCb:TCr>\n"
      "          (default: 4:3:3:6:4.5:4.5)\n"
+     "    --nlmeans               Denoise video with nlmeans filter\n"
+     "          <ultralight/light/medium/strong> or omitted\n"
+     "           or\n"
+     "          <SY:OTY:PSY:RY:FY:PY:Sb:OTb:PSb:Rb:Fb:Pb:Sr:OTr:PSr:Rr:Fr:Pr>\n"
+     "          (default 8:1:7:3:2:0)\n"
+     "    --nlmeans-tune          Tune nlmeans filter to content type\n"
+     "                            Note: only works in conjunction with presets\n"
+     "                            ultralight/light/medium/strong.\n"
+     "          <none/film/grain/highmotion/animation> or omitted (default none)\n"
      "    -7, --deblock           Deblock video with pp7 filter\n"
      "          <QP:M>            (default 5:2)\n"
      "        --rotate            Flips images axes\n"
@@ -3695,6 +3714,8 @@ static int ParseOptions( int argc, char ** argv )
     #define QSV_BASELINE         295
     #define QSV_ASYNC_DEPTH      296
     #define QSV_IMPLEMENTATION   297
+    #define FILTER_NLMEANS       298
+    #define FILTER_NLMEANS_TUNE  299
 
     for( ;; )
     {
@@ -3753,6 +3774,8 @@ static int ParseOptions( int argc, char ** argv )
             { "deinterlace", optional_argument, NULL,    'd' },
             { "deblock",     optional_argument, NULL,    '7' },
             { "denoise",     optional_argument, NULL,    '8' },
+            { "nlmeans",     optional_argument, NULL,    FILTER_NLMEANS },
+            { "nlmeans-tune",required_argument, NULL,    FILTER_NLMEANS_TUNE },
             { "detelecine",  optional_argument, NULL,    '9' },
             { "decomb",      optional_argument, NULL,    '5' },
             { "grayscale",   no_argument,       NULL,    'g' },
@@ -4109,6 +4132,30 @@ static int ParseOptions( int argc, char ** argv )
                 }
                 denoise = 1;
                 break;
+            case FILTER_NLMEANS:
+                if(optarg != NULL)
+                {
+                    free(nlmeans_opt);
+                    nlmeans_opt = strdup(optarg);
+                }
+                nlmeans = 1;
+                break;
+            case FILTER_NLMEANS_TUNE:
+                if (!strcmp(optarg, "none") ||
+                    !strcmp(optarg, "film") ||
+                    !strcmp(optarg, "grain") ||
+                    !strcmp(optarg, "highmotion") ||
+                    !strcmp(optarg, "animation"))
+                {
+                    free(nlmeans_tune_opt);
+                    nlmeans_tune_opt = strdup(optarg);
+                }
+                else
+                {
+                    fprintf(stderr, "invalid nlmeans tune (%s)\n", optarg);
+                    return -1;
+                }
+                break;
             case '9':
                 if( optarg != NULL )
                 {
@@ -4409,6 +4456,164 @@ static int ParseOptions( int argc, char ** argv )
                 fprintf( stderr, "unknown option (%s)\n", argv[cur_optind] );
                 return -1;
         }
+
+    }
+
+    /* NL-means presets (--nlmeans) and tunes (--nlmeans-tune)
+     *
+     * Presets adjust strength:
+     * ultralight - visually transparent
+     * light
+     * medium
+     * strong
+     *
+     * Tunes adjust settings to the specified content type:
+     * none
+     * film       - most content, live action
+     * grain      - like film but preserves luma grain
+     * highmotion - like film but avoids color smearing with stronger settings
+     * animation  - cel animation such as cartoons, anime
+     */
+    if (nlmeans == 1 && nlmeans_opt != NULL)
+    {
+        if (!strcmp(nlmeans_opt, "ultralight") ||
+            !strcmp(nlmeans_opt, "light") ||
+            !strcmp(nlmeans_opt, "medium") ||
+            !strcmp(nlmeans_opt, "strong"))
+        {
+            char   opt[1024];
+            double strength[2],
+                   origin_tune[2];
+            int    patch_size[2],
+                   range[2],
+                   frames[2],
+                   prefilter[2];
+
+            if (nlmeans_tune_opt == NULL || !strcmp(nlmeans_tune_opt, "none"))
+            {
+                strength[0]    = strength[1]    = 6;
+                origin_tune[0] = origin_tune[1] = 1;
+                patch_size[0]  = patch_size[1]  = 7;
+                range[0]       = range[1]       = 3;
+                frames[0]      = frames[1]      = 2;
+                prefilter[0]   = prefilter[1]   = 0;
+                if (!strcmp(nlmeans_opt, "ultralight"))
+                {
+                    strength[0] = strength[1] = 1.5;
+                }
+                else if (!strcmp(nlmeans_opt, "light"))
+                {
+                    strength[0] = strength[1] = 3;
+                }
+                else if (!strcmp(nlmeans_opt, "strong"))
+                {
+                    strength[0] = strength[1] = 10;
+                }
+            }
+            else if (!strcmp(nlmeans_tune_opt, "film"))
+            {
+                strength[0]    = 6; strength[1] = 8;
+                origin_tune[0] = origin_tune[1] = 0.8;
+                patch_size[0]  = patch_size[1]  = 7;
+                range[0]       = range[1]       = 3;
+                frames[0]      = frames[1]      = 2;
+                prefilter[0]   = prefilter[1]   = 0;
+                if (!strcmp(nlmeans_opt, "ultralight"))
+                {
+                    strength[0]    = 1.5;   strength[1]  = 2.4;
+                    origin_tune[0] = 0.9; origin_tune[1] = 0.9;
+                }
+                else if (!strcmp(nlmeans_opt, "light"))
+                {
+                    strength[0]    = 3;   strength[1]    = 4;
+                    origin_tune[0] = 0.9; origin_tune[1] = 0.9;
+                }
+                else if (!strcmp(nlmeans_opt, "strong"))
+                {
+                    strength[0]    = 8;   strength[1]    = 10;
+                    origin_tune[0] = 0.6; origin_tune[1] = 0.6;
+                }
+            }
+            else if (!strcmp(nlmeans_tune_opt, "grain"))
+            {
+                strength[0]    = 0; strength[1] = 6;
+                origin_tune[0] = origin_tune[1] = 0.8;
+                patch_size[0]  = patch_size[1]  = 7;
+                range[0]       = range[1]       = 3;
+                frames[0]      = frames[1]      = 2;
+                prefilter[0]   = prefilter[1]   = 0;
+                if (!strcmp(nlmeans_opt, "ultralight"))
+                {
+                    strength[0]    = 0;   strength[1]    = 2.4;
+                    origin_tune[0] = 0.9; origin_tune[1] = 0.9;
+                }
+                else if (!strcmp(nlmeans_opt, "light"))
+                {
+                    strength[0]    = 0;   strength[1]    = 3.5;
+                    origin_tune[0] = 0.9; origin_tune[1] = 0.9;
+                }
+                else if (!strcmp(nlmeans_opt, "strong"))
+                {
+                    strength[0]    = 0;   strength[1]    = 8;
+                    origin_tune[0] = 0.6; origin_tune[1] = 0.6;
+                }
+            }
+            else if (!strcmp(nlmeans_tune_opt, "highmotion"))
+            {
+                strength[0]    = 6;   strength[1]    = 6;
+                origin_tune[0] = 0.8; origin_tune[1] = 0.7;
+                patch_size[0]  = 7;   patch_size[1]  = 7;
+                range[0]       = 3;   range[1]       = 5;
+                frames[0]      = 2;   frames[1]      = 1;
+                prefilter[0]   = 0;   prefilter[1]   = 0;
+                if (!strcmp(nlmeans_opt, "ultralight"))
+                {
+                    strength[0]    = 1.5;   strength[1]  = 2.4;
+                    origin_tune[0] = 0.9; origin_tune[1] = 0.9;
+                }
+                else if (!strcmp(nlmeans_opt, "light"))
+                {
+                    strength[0]    = 3;   strength[1]    = 3.25;
+                    origin_tune[0] = 0.9; origin_tune[1] = 0.8;
+                }
+                else if (!strcmp(nlmeans_opt, "strong"))
+                {
+                    strength[0]    = 8;   strength[1]    = 6.75;
+                    origin_tune[0] = 0.6; origin_tune[1] = 0.5;
+                }
+            }
+            else if (!strcmp(nlmeans_tune_opt, "animation"))
+            {
+                strength[0]    = 5; strength[1] = 4;
+                origin_tune[0] = origin_tune[1] = 0.15;
+                patch_size[0]  = patch_size[1]  = 5;
+                range[0]       = range[1]       = 7;
+                frames[0]      = frames[1]      = 4;
+                prefilter[0]   = prefilter[1]   = 0;
+                if (!strcmp(nlmeans_opt, "ultralight"))
+                {
+                    strength[0] = 2.5; strength[1] = 2;
+                    frames[0]   = 2;   frames[1]   = 2;
+                }
+                else if (!strcmp(nlmeans_opt, "light"))
+                {
+                    strength[0] = 3; strength[1] = 2.25;
+                    frames[0]   = 3; frames[1]   = 3;
+                }
+                else if (!strcmp(nlmeans_opt, "strong"))
+                {
+                    strength[0] = 10; strength[1] = 8;
+                }
+            }
+
+            sprintf(opt, "%lf:%lf:%d:%d:%d:%d:%lf:%lf:%d:%d:%d:%d",
+                    strength[0], origin_tune[0], patch_size[0], range[0], frames[0], prefilter[0],
+                    strength[1], origin_tune[1], patch_size[1], range[1], frames[1], prefilter[1]);
+
+            free(nlmeans_opt);
+            nlmeans_opt = strdup(opt);
+
+        }
     }
 
     return 0;