summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjstebbins <[email protected]>2014-09-09 17:42:53 +0000
committerjstebbins <[email protected]>2014-09-09 17:42:53 +0000
commit47f221161b42b7d3cb1ae49a16a6ec7452eb978f (patch)
treef76b8c9efe6a9d4f275ef15c04e1613cfe0a7853
parent54dbf9c1dea56d271b1760e7f6b085914b0cff7c (diff)
libhb: thread nlmeans filter
Scales well with number of CPUs, so is 4x faster on quad cores. git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6397 b64f7644-9d1e-0410-96f1-a4d463321fa5
-rw-r--r--libhb/common.h2
-rw-r--r--libhb/internal.h71
-rw-r--r--libhb/nlmeans.c594
3 files changed, 449 insertions, 218 deletions
diff --git a/libhb/common.h b/libhb/common.h
index ef10fff41..c4392fd94 100644
--- a/libhb/common.h
+++ b/libhb/common.h
@@ -99,6 +99,8 @@ typedef struct hb_work_object_s hb_work_object_t;
typedef struct hb_filter_private_s hb_filter_private_t;
typedef struct hb_filter_object_s hb_filter_object_t;
typedef struct hb_buffer_s hb_buffer_t;
+typedef struct hb_buffer_settings_s hb_buffer_settings_t;
+typedef struct hb_image_format_s hb_image_format_t;
typedef struct hb_fifo_s hb_fifo_t;
typedef struct hb_lock_s hb_lock_t;
typedef enum
diff --git a/libhb/internal.h b/libhb/internal.h
index f32c3d3ff..7fb266c8c 100644
--- a/libhb/internal.h
+++ b/libhb/internal.h
@@ -51,6 +51,41 @@ void hb_set_state( hb_handle_t *, hb_state_t * );
* May have metadata associated with it via extra fields
* that are conditionally used depending on the type of packet.
*/
+struct hb_buffer_settings_s
+{
+ enum { AUDIO_BUF, VIDEO_BUF, SUBTITLE_BUF, FRAME_BUF, OTHER_BUF } type;
+
+ int id; // ID of the track that the packet comes from
+ int64_t start; // start time of frame
+ double duration; // Actual duration, may be fractional ticks
+ int64_t stop; // stop time of frame
+ int64_t renderOffset; // DTS used by b-frame offsets in muxmp4
+ int64_t pcr;
+ uint8_t discontinuity;
+ int new_chap; // Video packets: if non-zero, is the index of the chapter whose boundary was crossed
+
+#define HB_FRAME_IDR 0x01
+#define HB_FRAME_I 0x02
+#define HB_FRAME_AUDIO 0x04
+#define HB_FRAME_SUBTITLE 0x08
+#define HB_FRAME_P 0x10
+#define HB_FRAME_B 0x20
+#define HB_FRAME_BREF 0x40
+#define HB_FRAME_KEY 0x0F
+#define HB_FRAME_REF 0xF0
+ uint8_t frametype;
+ uint16_t flags;
+};
+
+struct hb_image_format_s
+{
+ int x;
+ int y;
+ int width;
+ int height;
+ int fmt;
+};
+
struct hb_buffer_s
{
int size; // size of this packet
@@ -71,40 +106,8 @@ struct hb_buffer_s
*/
int64_t sequence;
- struct settings
- {
- enum { AUDIO_BUF, VIDEO_BUF, SUBTITLE_BUF, FRAME_BUF, OTHER_BUF } type;
-
- int id; // ID of the track that the packet comes from
- int64_t start; // start time of frame
- double duration; // Actual duration, may be fractional ticks
- int64_t stop; // stop time of frame
- int64_t renderOffset; // DTS used by b-frame offsets in muxmp4
- int64_t pcr;
- uint8_t discontinuity;
- int new_chap; // Video packets: if non-zero, is the index of the chapter whose boundary was crossed
-
- #define HB_FRAME_IDR 0x01
- #define HB_FRAME_I 0x02
- #define HB_FRAME_AUDIO 0x04
- #define HB_FRAME_SUBTITLE 0x08
- #define HB_FRAME_P 0x10
- #define HB_FRAME_B 0x20
- #define HB_FRAME_BREF 0x40
- #define HB_FRAME_KEY 0x0F
- #define HB_FRAME_REF 0xF0
- uint8_t frametype;
- uint16_t flags;
- } s;
-
- struct format
- {
- int x;
- int y;
- int width;
- int height;
- int fmt;
- } f;
+ hb_buffer_settings_t s;
+ hb_image_format_t f;
struct buffer_plane
{
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c
index c7c82f11d..f5cfd56ac 100644
--- a/libhb/nlmeans.c
+++ b/libhb/nlmeans.c
@@ -52,6 +52,7 @@
#include "hb.h"
#include "hbffmpeg.h"
+#include "taskset.h"
#define NLMEANS_STRENGTH_LUMA_DEFAULT 8
#define NLMEANS_STRENGTH_CHROMA_DEFAULT 8
@@ -82,8 +83,8 @@
#define NLMEANS_SORT(a,b) { if (a > b) NLMEANS_SWAP(a, b); }
#define NLMEANS_SWAP(a,b) { a = (a ^ b); b = (a ^ b); a = (b ^ a); }
-#define NLMEANS_FRAMES_MAX 32
-#define NLMEANS_EXPSIZE 128
+#define NLMEANS_FRAMES_MAX 32
+#define NLMEANS_EXPSIZE 128
typedef struct
{
@@ -94,35 +95,57 @@ typedef struct
int w;
int h;
int border;
+ hb_lock_t *mutex;
+ int prefiltered;
} BorderedPlane;
+typedef struct
+{
+ int width;
+ int height;
+ int fmt;
+ BorderedPlane plane[3];
+ hb_buffer_settings_t s;
+} Frame;
+
struct PixelSum
{
float weight_sum;
float pixel_sum;
};
+typedef struct
+{
+ hb_filter_private_t *pv;
+ int segment;
+ hb_buffer_t *out;
+} nlmeans_thread_arg_t;
+
struct hb_filter_private_s
{
double strength[3]; // averaging weight decay, larger produces smoother output
double origin_tune[3]; // weight tuning for origin patch, 0.00..1.00
int patch_size[3]; // pixel context region width (must be odd)
int range[3]; // spatial search window width (must be odd)
- int frames[3]; // temporal search depth in frames
+ int nframes[3]; // temporal search depth in frames
int prefilter[3]; // prefilter mode, can improve weight analysis
- BorderedPlane frame_tmp[3][32];
- int frame_ready[3][32];
-};
+ Frame *frame;
+ int next_frame;
+ int max_frames;
-static int hb_nlmeans_init(hb_filter_object_t *filter,
- hb_filter_init_t *init);
+ taskset_t taskset;
+ int thread_count;
+ nlmeans_thread_arg_t **thread_data;
+};
-static int hb_nlmeans_work(hb_filter_object_t *filter,
+static int nlmeans_init(hb_filter_object_t *filter, hb_filter_init_t *init);
+static int nlmeans_work(hb_filter_object_t *filter,
hb_buffer_t **buf_in,
hb_buffer_t **buf_out);
+static void nlmeans_close(hb_filter_object_t *filter);
-static void hb_nlmeans_close(hb_filter_object_t *filter);
+static void nlmeans_filter_thread(void *thread_args_v);
hb_filter_object_t hb_filter_nlmeans =
{
@@ -130,9 +153,9 @@ hb_filter_object_t hb_filter_nlmeans =
.enforce_order = 1,
.name = "Denoise (nlmeans)",
.settings = NULL,
- .init = hb_nlmeans_init,
- .work = hb_nlmeans_work,
- .close = hb_nlmeans_close,
+ .init = nlmeans_init,
+ .work = nlmeans_work,
+ .close = nlmeans_close,
};
static void nlmeans_border(uint8_t *src,
@@ -140,69 +163,69 @@ static void nlmeans_border(uint8_t *src,
int h,
int border)
{
-
- uint8_t *image = src + border + w*border;
- int iw = w - 2*border;
- int ih = h - 2*border;
+ int bw = w + 2 * border;
+ uint8_t *image = src + border + bw * border;
// Create faux borders using edge pixels
- for (int y = 0; y < ih; y++)
+ for (int y = 0; y < h; y++)
{
for (int x = 0; x < border; x++)
{
- *(image + y*w - x - 1) = *(image + y*w + x);
- *(image + y*w + x + iw) = *(image + y*w - x + (iw-1));
+ *(image + y*bw - x - 1) = *(image + y*bw + x);
+ *(image + y*bw + x + w) = *(image + y*bw - x + (w-1));
}
}
for (int y = 0; y < border; y++)
{
- memcpy(image - border - (y+1)*w, image - border + y*w, w);
- memcpy(image - border + (y+ih)*w, image - border + (ih-y-1)*w, w);
+ memcpy(image - border - (y+1)*bw, image - border + y*bw, bw);
+ memcpy(image - border + (y+h)*bw, image - border + (h-y-1)*bw, bw);
}
}
-static void nlmeans_deborder(uint8_t *src,
+static void nlmeans_deborder(BorderedPlane *src,
uint8_t *dst,
int w,
- int h,
- int border)
+ int s,
+ int h)
{
-
- uint8_t *image = src + border + w*border;
- int iw = w - 2*border;
- int ih = h - 2*border;
+ int bw = src->w + 2 * src->border;
+ uint8_t *image = src->mem + src->border + bw * src->border;
+ int width = w;
+ if (src->w < width)
+ width = src->w;
// Copy main image
- for (int y = 0; y < ih; y++)
+ for (int y = 0; y < h; y++)
{
- memcpy(dst + y*iw, image + y*w, iw);
+ memcpy(dst + y * s, image + y * bw, width);
}
}
static void nlmeans_alloc(uint8_t *src,
int src_w,
+ int src_s,
int src_h,
BorderedPlane *dst,
- int dst_w,
- int dst_h,
int border)
{
+ int bw = src_w + 2 * border;
+ int bh = src_h + 2 * border;
- uint8_t *mem = malloc(dst_w * dst_h * sizeof(uint8_t));
- uint8_t *image = mem + border + dst_w*border;
+ uint8_t *mem = malloc(bw * bh * sizeof(uint8_t));
+ uint8_t *image = mem + border + bw * border;
// Copy main image
for (int y = 0; y < src_h; y++)
{
- memcpy(image + y*dst_w, src + y*src_w, src_w);
+ memcpy(image + y * bw, src + y * src_s, src_w);
}
dst->mem = mem;
dst->image = image;
- dst->w = dst_w;
- dst->h = dst_h;
+ dst->w = src_w;
+ dst->h = src_h;
dst->border = border;
nlmeans_border(dst->mem, dst->w, dst->h, dst->border);
@@ -220,25 +243,24 @@ static void nlmeans_filter_mean(uint8_t *src,
{
// Mean filter
- int iw = w - 2*border;
- int ih = h - 2*border;
+ int bw = w + 2 * border;
int offset_min = -((size - 1) /2);
int offset_max = (size + 1) /2;
uint16_t pixel_sum;
double pixel_weight = 1.0 / (size * size);
- for (int y = 0; y < ih; y++)
+ for (int y = 0; y < h; y++)
{
- for (int x = 0; x < iw; x++)
+ for (int x = 0; x < w; x++)
{
pixel_sum = 0;
for (int k = offset_min; k < offset_max; k++)
{
for (int j = offset_min; j < offset_max; j++)
{
- pixel_sum = pixel_sum + *(src + w*(y+j) + (x+k));
+ pixel_sum = pixel_sum + *(src + bw*(y+j) + (x+k));
}
}
- *(dst + w*y + x) = (uint8_t)(pixel_sum * pixel_weight);
+ *(dst + bw*y + x) = (uint8_t)(pixel_sum * pixel_weight);
}
}
@@ -315,28 +337,26 @@ static void nlmeans_filter_median(uint8_t *src,
int border,
int size)
{
-
// Median filter
- int iw = w - 2*border;
- int ih = h - 2*border;
+ int bw = w + 2 * border;
int offset_min = -((size - 1) /2);
int offset_max = (size + 1) /2;
int index;
uint8_t pixels[size * size];
- for (int y = 0; y < ih; y++)
+ for (int y = 0; y < h; y++)
{
- for (int x = 0; x < iw; x++)
+ for (int x = 0; x < w; x++)
{
index = 0;
for (int k = offset_min; k < offset_max; k++)
{
for (int j = offset_min; j < offset_max; j++)
{
- pixels[index] = *(src + w*(y+j) + (x+k));
+ pixels[index] = *(src + bw*(y+j) + (x+k));
index++;
}
}
- *(dst + w*y + x) = nlmeans_filter_median_opt(pixels, size);
+ *(dst + bw*y + x) = nlmeans_filter_median_opt(pixels, size);
}
}
@@ -348,9 +368,8 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
int h,
int border)
{
-
- int iw = w - 2*border;
- int ih = h - 2*border;
+ int bw = w + 2 * border;
+ int bh = h + 2 * border;
// Custom kernel
int kernel_size = 3;
@@ -364,11 +383,11 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
int offset_max = (kernel_size + 1) /2;
uint16_t pixel1;
uint16_t pixel2;
- uint8_t *mask_mem = calloc(w * h, sizeof(uint8_t));
- uint8_t *mask = mask_mem + border + w*border;
- for (int y = 0; y < ih; y++)
+ uint8_t *mask_mem = calloc(bw * bh, sizeof(uint8_t));
+ uint8_t *mask = mask_mem + border + bw * border;
+ for (int y = 0; y < h; y++)
{
- for (int x = 0; x < iw; x++)
+ for (int x = 0; x < w; x++)
{
pixel1 = 0;
pixel2 = 0;
@@ -376,37 +395,37 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
{
for (int j = offset_min; j < offset_max; j++)
{
- pixel1 += kernel[j+1][k+1] * *(src + w*(y+j) + (x+k));
- pixel2 += kernel[k+1][j+1] * *(src + w*(y+j) + (x+k));
+ pixel1 += kernel[j+1][k+1] * *(src + bw*(y+j) + (x+k));
+ pixel2 += kernel[k+1][j+1] * *(src + bw*(y+j) + (x+k));
}
}
pixel1 = pixel1 > 0 ? pixel1 : -pixel1;
pixel2 = pixel2 > 0 ? pixel2 : -pixel2;
pixel1 = (uint16_t)(((double)pixel1 * kernel_coef) + 128);
pixel2 = (uint16_t)(((double)pixel2 * kernel_coef) + 128);
- *(mask + w*y + x) = (uint8_t)(pixel1 + pixel2);
- if (*(mask + w*y + x) > 160)
+ *(mask + bw*y + x) = (uint8_t)(pixel1 + pixel2);
+ if (*(mask + bw*y + x) > 160)
{
- *(mask + w*y + x) = 235;
+ *(mask + bw*y + x) = 235;
}
- else if (*(mask + w*y + x) > 16)
+ else if (*(mask + bw*y + x) > 16)
{
- *(mask + w*y + x) = 128;
+ *(mask + bw*y + x) = 128;
}
else
{
- *(mask + w*y + x) = 16;
+ *(mask + bw*y + x) = 16;
}
}
}
// Post-process and output
int pixels;
- for (int y = 0; y < ih; y++)
+ for (int y = 0; y < h; y++)
{
- for (int x = 0; x < iw; x++)
+ for (int x = 0; x < w; x++)
{
- if (*(mask + w*y + x) > 16)
+ if (*(mask + bw*y + x) > 16)
{
// Count nearby edge pixels
pixels = 0;
@@ -414,7 +433,7 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
{
for (int j = offset_min; j < offset_max; j++)
{
- if (*(mask + w*(y+j) + (x+k)) > 16)
+ if (*(mask + bw*(y+j) + (x+k)) > 16)
{
pixels++;
}
@@ -423,33 +442,38 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
// Remove false positive
if (pixels < 3)
{
- *(mask + w*y + x) = 16;
+ *(mask + bw*y + x) = 16;
}
// Filter output
- if (*(mask + w*y + x) > 16)
+ if (*(mask + bw*y + x) > 16)
{
- if (*(mask + w*y + x) == 235)
+ if (*(mask + bw*y + x) == 235)
{
- *(dst + w*y + x) = (3 * *(src + w*y + x) + 1 * *(dst + w*y + x)) /4;
+ *(dst + bw*y + x) = (3 * *(src + bw*y + x) + 1 * *(dst + bw*y + x)) /4;
}
else
{
- *(dst + w*y + x) = (2 * *(src + w*y + x) + 3 * *(dst + w*y + x)) /5;
+ *(dst + bw*y + x) = (2 * *(src + bw*y + x) + 3 * *(dst + bw*y + x)) /5;
}
- //*(dst + w*y + x) = *(mask + w*y + x); // Overlay mask
+ //*(dst + bw*y + x) = *(mask + bw*y + x); // Overlay mask
}
}
- //*(dst + w*y + x) = *(mask + w*y + x); // Full mask
+ //*(dst + bw*y + x) = *(mask + bw*y + x); // Full mask
}
}
free(mask_mem);
-
}
static void nlmeans_prefilter(BorderedPlane *src,
int filter_type)
{
+ hb_lock(src->mutex);
+ if (src->prefiltered)
+ {
+ hb_unlock(src->mutex);
+ return;
+ }
if (filter_type & NLMEANS_PREFILTER_MODE_MEAN3X3 ||
filter_type & NLMEANS_PREFILTER_MODE_MEAN5X5 ||
@@ -460,16 +484,18 @@ static void nlmeans_prefilter(BorderedPlane *src,
// Source image
uint8_t *mem = src->mem;
uint8_t *image = src->image;
+ int border = src->border;
int w = src->w;
int h = src->h;
- int border = src->border;
+ int bw = w + 2 * border;
+ int bh = h + 2 * border;
// Duplicate plane
- uint8_t *mem_pre = malloc(w * h * sizeof(uint8_t));
- uint8_t *image_pre = mem_pre + border + w*border;
+ uint8_t *mem_pre = malloc(bw * bh * sizeof(uint8_t));
+ uint8_t *image_pre = mem_pre + border + bw * border;
for (int y = 0; y < h; y++)
{
- memcpy(mem_pre + y*w, mem + y*w, w);
+ memcpy(mem_pre + y * bw, mem + y * bw, bw);
}
// Filter plane; should already have at least 2px extra border on each side
@@ -521,11 +547,11 @@ static void nlmeans_prefilter(BorderedPlane *src,
}
if (dry > 0)
{
- for (int y = 0; y < h; y++)
+ for (int y = 0; y < bh; y++)
{
- for (int x = 0; x < w; x++)
+ for (int x = 0; x < bw; x++)
{
- *(mem_pre + w*y + x) = (uint8_t)((wet * *(mem_pre + w*y + x) + dry * *(mem + w*y + x)) / (wet + dry));
+ *(mem_pre + bw*y + x) = (uint8_t)((wet * *(mem_pre + bw*y + x) + dry * *(mem + bw*y + x)) / (wet + dry));
}
}
}
@@ -538,33 +564,37 @@ static void nlmeans_prefilter(BorderedPlane *src,
nlmeans_border(mem_pre, w, h, border);
}
-
+ src->prefiltered = 1;
+ hb_unlock(src->mutex);
}
-static void nlmeans_plane(BorderedPlane *plane_tmp,
- int *plane_ready,
+static void nlmeans_plane(Frame *frame,
+ int prefilter,
+ int plane,
+ int nframes,
uint8_t *dst,
int w,
+ int s,
int h,
double h_param,
double origin_tune,
int n,
int r)
{
-
int n_half = (n-1) /2;
int r_half = (r-1) /2;
// Source image
- uint8_t *src = plane_tmp[0].image;
- uint8_t *src_pre = plane_tmp[0].image_pre;
- int src_w = plane_tmp[0].w;
+ uint8_t *src = frame[0].plane[plane].image;
+ uint8_t *src_pre = frame[0].plane[plane].image_pre;
+ int border = frame[0].plane[plane].border;
+ int src_w = frame[0].plane[plane].w + 2 * border;
// Allocate temporary pixel sums
struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
// Allocate integral image
- int integral_stride = w + 2*16;
+ int integral_stride = w + 2 * 16;
uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t));
uint32_t *integral = integral_mem + integral_stride + 16;
@@ -582,13 +612,15 @@ static void nlmeans_plane(BorderedPlane *plane_tmp,
exptable[NLMEANS_EXPSIZE-1] = 0;
// Iterate through available frames
- for (int plane_index = 0; plane_ready[plane_index] == 1; plane_index++)
+ for (int f = 0; f < nframes; f++)
{
+ nlmeans_prefilter(&frame[f].plane[plane], prefilter);
// Compare image
- uint8_t *compare = plane_tmp[plane_index].image;
- uint8_t *compare_pre = plane_tmp[plane_index].image_pre;
- int compare_w = plane_tmp[plane_index].w;
+ uint8_t *compare = frame[f].plane[plane].image;
+ uint8_t *compare_pre = frame[f].plane[plane].image_pre;
+ int border = frame[f].plane[plane].border;
+ int compare_w = frame[f].plane[plane].w + 2 * border;
// Iterate through all displacements
for (int dy = -r_half; dy <= r_half; dy++)
@@ -597,7 +629,7 @@ static void nlmeans_plane(BorderedPlane *plane_tmp,
{
// Apply special weight tuning to origin patch
- if (dx == 0 && dy == 0 && plane_index == 0)
+ if (dx == 0 && dy == 0 && f == 0)
{
// TODO: Parallelize this
for (int y = n_half; y < h-n + n_half; y++)
@@ -680,14 +712,14 @@ static void nlmeans_plane(BorderedPlane *plane_tmp,
{
for (int x = 0; x < n_half; x++)
{
- *(dst + y*w + x) = *(src + y*src_w - x - 1);
- *(dst + y*w - x + (w-1)) = *(src + y*src_w + x + w);
+ *(dst + y * s + x) = *(src + y * src_w - x - 1);
+ *(dst + y * s - x + (w - 1)) = *(src + y * src_w + x + w);
}
}
for (int y = 0; y < n_half; y++)
{
- memcpy(dst + y*w, src - (y+1)*src_w, w);
- memcpy(dst + (h-y-1)*w, src + (y+h)*src_w, w);
+ memcpy(dst + y*s, src - (y+1)*src_w, w);
+ memcpy(dst + (h-y-1)*s, src + (y+h)*src_w, w);
}
// Copy main image
@@ -697,7 +729,7 @@ static void nlmeans_plane(BorderedPlane *plane_tmp,
for (int x = n_half; x < w-n_half; x++)
{
result = (uint8_t)(tmp_data[y*w + x].pixel_sum / tmp_data[y*w + x].weight_sum);
- *(dst + y*w + x) = result ? result : *(src + y*src_w + x);
+ *(dst + y*s + x) = result ? result : *(src + y*src_w + x);
}
}
@@ -706,7 +738,7 @@ static void nlmeans_plane(BorderedPlane *plane_tmp,
}
-static int hb_nlmeans_init(hb_filter_object_t *filter,
+static int nlmeans_init(hb_filter_object_t *filter,
hb_filter_init_t *init)
{
filter->private_data = calloc(sizeof(struct hb_filter_private_s), 1);
@@ -719,7 +751,7 @@ static int hb_nlmeans_init(hb_filter_object_t *filter,
pv->origin_tune[c] = -1;
pv->patch_size[c] = -1;
pv->range[c] = -1;
- pv->frames[c] = -1;
+ pv->nframes[c] = -1;
pv->prefilter[c] = -1;
}
@@ -727,9 +759,9 @@ static int hb_nlmeans_init(hb_filter_object_t *filter,
if (filter->settings != NULL)
{
sscanf(filter->settings, "%lf:%lf:%d:%d:%d:%d:%lf:%lf:%d:%d:%d:%d:%lf:%lf:%d:%d:%d:%d",
- &pv->strength[0], &pv->origin_tune[0], &pv->patch_size[0], &pv->range[0], &pv->frames[0], &pv->prefilter[0],
- &pv->strength[1], &pv->origin_tune[1], &pv->patch_size[1], &pv->range[1], &pv->frames[1], &pv->prefilter[1],
- &pv->strength[2], &pv->origin_tune[2], &pv->patch_size[2], &pv->range[2], &pv->frames[2], &pv->prefilter[2]);
+ &pv->strength[0], &pv->origin_tune[0], &pv->patch_size[0], &pv->range[0], &pv->nframes[0], &pv->prefilter[0],
+ &pv->strength[1], &pv->origin_tune[1], &pv->patch_size[1], &pv->range[1], &pv->nframes[1], &pv->prefilter[1],
+ &pv->strength[2], &pv->origin_tune[2], &pv->patch_size[2], &pv->range[2], &pv->nframes[2], &pv->prefilter[2]);
}
// Cascade values
@@ -740,7 +772,7 @@ static int hb_nlmeans_init(hb_filter_object_t *filter,
if (pv->origin_tune[c] == -1) { pv->origin_tune[c] = pv->origin_tune[c-1]; }
if (pv->patch_size[c] == -1) { pv->patch_size[c] = pv->patch_size[c-1]; }
if (pv->range[c] == -1) { pv->range[c] = pv->range[c-1]; }
- if (pv->frames[c] == -1) { pv->frames[c] = pv->frames[c-1]; }
+ if (pv->nframes[c] == -1) { pv->nframes[c] = pv->nframes[c-1]; }
if (pv->prefilter[c] == -1) { pv->prefilter[c] = pv->prefilter[c-1]; }
}
@@ -751,7 +783,7 @@ static int hb_nlmeans_init(hb_filter_object_t *filter,
if (pv->origin_tune[c] == -1) { pv->origin_tune[c] = c ? NLMEANS_ORIGIN_TUNE_LUMA_DEFAULT : NLMEANS_ORIGIN_TUNE_CHROMA_DEFAULT; }
if (pv->patch_size[c] == -1) { pv->patch_size[c] = c ? NLMEANS_PATCH_SIZE_LUMA_DEFAULT : NLMEANS_PATCH_SIZE_CHROMA_DEFAULT; }
if (pv->range[c] == -1) { pv->range[c] = c ? NLMEANS_RANGE_LUMA_DEFAULT : NLMEANS_RANGE_CHROMA_DEFAULT; }
- if (pv->frames[c] == -1) { pv->frames[c] = c ? NLMEANS_FRAMES_LUMA_DEFAULT : NLMEANS_FRAMES_CHROMA_DEFAULT; }
+ if (pv->nframes[c] == -1) { pv->nframes[c] = c ? NLMEANS_FRAMES_LUMA_DEFAULT : NLMEANS_FRAMES_CHROMA_DEFAULT; }
if (pv->prefilter[c] == -1) { pv->prefilter[c] = c ? NLMEANS_PREFILTER_LUMA_DEFAULT : NLMEANS_PREFILTER_CHROMA_DEFAULT; }
// Sanitize
@@ -762,21 +794,59 @@ static int hb_nlmeans_init(hb_filter_object_t *filter,
if (pv->patch_size[c] < 1) { pv->patch_size[c] = 1; }
if (pv->range[c] % 2 == 0) { pv->range[c]--; }
if (pv->range[c] < 1) { pv->range[c] = 1; }
- if (pv->frames[c] < 1) { pv->frames[c] = 1; }
- if (pv->frames[c] > NLMEANS_FRAMES_MAX) { pv->frames[c] = NLMEANS_FRAMES_MAX; }
+ if (pv->nframes[c] < 1) { pv->nframes[c] = 1; }
+ if (pv->nframes[c] > NLMEANS_FRAMES_MAX) { pv->nframes[c] = NLMEANS_FRAMES_MAX; }
if (pv->prefilter[c] < 0) { pv->prefilter[c] = 0; }
- // Mark buffer empty
- for (int f = 0; f < NLMEANS_FRAMES_MAX; f++)
+ if (pv->max_frames < pv->nframes[c]) pv->max_frames = pv->nframes[c];
+ }
+
+ pv->thread_count = hb_get_cpu_count();
+ pv->frame = calloc(pv->thread_count + pv->max_frames, sizeof(Frame));
+ for (int ii = 0; ii < pv->thread_count + pv->max_frames; ii++)
+ {
+ for (int c = 0; c < 3; c++)
+ {
+ pv->frame[ii].plane[c].mutex = hb_lock_init();
+ }
+ }
+
+ pv->thread_data = malloc(pv->thread_count * sizeof(nlmeans_thread_arg_t*));
+ if (taskset_init(&pv->taskset, pv->thread_count,
+ sizeof(nlmeans_thread_arg_t)) == 0)
+ {
+ hb_error("nlmeans could not initialize taskset");
+ goto fail;
+ }
+
+ for (int ii = 0; ii < pv->thread_count; ii++)
+ {
+ pv->thread_data[ii] = taskset_thread_args(&pv->taskset, ii);
+ if (pv->thread_data[ii] == NULL)
{
- pv->frame_ready[c][f] = 0;
+ hb_error("nlmeans could not create thread args");
+ goto fail;
+ }
+ pv->thread_data[ii]->pv = pv;
+ pv->thread_data[ii]->segment = ii;
+ if (taskset_thread_spawn(&pv->taskset, ii, "nlmeans_filter",
+ nlmeans_filter_thread, HB_NORMAL_PRIORITY) == 0)
+ {
+ hb_error("nlmeans could not spawn thread");
+ goto fail;
}
}
return 0;
+
+fail:
+ taskset_fini(&pv->taskset);
+ free(pv->thread_data);
+ free(pv);
+ return -1;
}
-static void hb_nlmeans_close(hb_filter_object_t *filter)
+static void nlmeans_close(hb_filter_object_t *filter)
{
hb_filter_private_t *pv = filter->private_data;
@@ -785,113 +855,269 @@ static void hb_nlmeans_close(hb_filter_object_t *filter)
return;
}
+ taskset_fini(&pv->taskset);
for (int c = 0; c < 3; c++)
{
- for (int f = 0; f < pv->frames[c]; f++)
+ for (int f = 0; f < pv->nframes[c]; f++)
{
- if (pv->frame_tmp[c][f].mem_pre != NULL &&
- pv->frame_tmp[c][f].mem_pre != pv->frame_tmp[c][f].mem)
+ if (pv->frame[f].plane[c].mem_pre != NULL &&
+ pv->frame[f].plane[c].mem_pre != pv->frame[f].plane[c].mem)
{
- free(pv->frame_tmp[c][f].mem_pre);
- pv->frame_tmp[c][f].mem_pre = NULL;
+ free(pv->frame[f].plane[c].mem_pre);
+ pv->frame[f].plane[c].mem_pre = NULL;
}
- if (pv->frame_tmp[c][f].mem != NULL)
+ if (pv->frame[f].plane[c].mem != NULL)
{
- free(pv->frame_tmp[c][f].mem);
- pv->frame_tmp[c][f].mem = NULL;
+ free(pv->frame[f].plane[c].mem);
+ pv->frame[f].plane[c].mem = NULL;
}
}
}
+ for (int ii = 0; ii < pv->thread_count + pv->max_frames; ii++)
+ {
+ for (int c = 0; c < 3; c++)
+ {
+ hb_lock_close(&pv->frame[ii].plane[c].mutex);
+ }
+ }
+
+ free(pv->frame);
+ free(pv->thread_data);
free(pv);
filter->private_data = NULL;
}
-static int hb_nlmeans_work(hb_filter_object_t *filter,
- hb_buffer_t **buf_in,
- hb_buffer_t **buf_out )
+static void nlmeans_filter_thread(void *thread_args_v)
{
- hb_filter_private_t *pv = filter->private_data;
- hb_buffer_t *in = *buf_in, *out;
+ nlmeans_thread_arg_t *thread_data = thread_args_v;
+ hb_filter_private_t *pv = thread_data->pv;
+ int segment = thread_data->segment;
- if (in->size <= 0)
+ hb_log("NLMeans Denoise thread started for segment %d", segment);
+
+ while (1)
{
- *buf_out = in;
- *buf_in = NULL;
- return HB_FILTER_DONE;
- }
+ // Wait until there is work to do.
+ taskset_thread_wait4start(&pv->taskset, segment);
- out = hb_video_buffer_init(in->f.width, in->f.height);
+ if (taskset_thread_stop(&pv->taskset, segment))
+ {
+ break;
+ }
- for (int c = 0; c < 3; c++)
- {
+ Frame *frame = &pv->frame[segment];
+ hb_buffer_t *buf;
+ buf = hb_frame_buffer_init(frame->fmt, frame->width, frame->height);
- if (pv->strength[c] == 0)
+ for (int c = 0; c < 3; c++)
{
- out->plane[c].data = in->plane[c].data;
- continue;
+ if (pv->strength[c] == 0)
+ {
+ nlmeans_deborder(&frame->plane[c], buf->plane[c].data,
+ buf->plane[c].width, buf->plane[c].stride,
+ buf->plane[c].height);
+ continue;
+ }
+ if (pv->prefilter[c] & NLMEANS_PREFILTER_MODE_PASSTHRU)
+ {
+ nlmeans_prefilter(&pv->frame->plane[c], pv->prefilter[c]);
+ nlmeans_deborder(&frame->plane[c], buf->plane[c].data,
+ buf->plane[c].width, buf->plane[c].stride,
+ buf->plane[c].height);
+ continue;
+ }
+
+ // Process current plane
+ nlmeans_plane(frame,
+ pv->prefilter[c],
+ c,
+ pv->nframes[c],
+ buf->plane[c].data,
+ buf->plane[c].width,
+ buf->plane[c].stride,
+ buf->plane[c].height,
+ pv->strength[c],
+ pv->origin_tune[c],
+ pv->patch_size[c],
+ pv->range[c]);
}
+ buf->s = pv->frame[segment].s;
+ thread_data->out = buf;
+
+ // Finished this segment, notify.
+ taskset_thread_complete(&pv->taskset, segment);
+ }
+ taskset_thread_complete(&pv->taskset, segment);
+}
+
+static void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf)
+{
+ for (int c = 0; c < 3; c++)
+ {
+ // Extend copy of plane with extra border and place in buffer
+ int border = ((pv->range[c] + 2) / 2 + 15) / 16 * 16;
+ nlmeans_alloc(buf->plane[c].data,
+ buf->plane[c].width,
+ buf->plane[c].stride,
+ buf->plane[c].height,
+ &pv->frame[pv->next_frame].plane[c],
+ border);
+ pv->frame[pv->next_frame].s = buf->s;
+ pv->frame[pv->next_frame].width = buf->f.width;
+ pv->frame[pv->next_frame].height = buf->f.height;
+ pv->frame[pv->next_frame].fmt = buf->f.fmt;
+ }
+ pv->next_frame++;
+}
+
+static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv)
+{
+ if (pv->next_frame < pv->max_frames + pv->thread_count)
+ return NULL;
- int frames = pv->frames[c];
+ taskset_cycle(&pv->taskset);
- // Release last frame in buffer
- if (pv->frame_tmp[c][frames-1].mem_pre != NULL &&
- pv->frame_tmp[c][frames-1].mem_pre != pv->frame_tmp[c][frames-1].mem)
+ // Free buffers that are not needed for next taskset cycle
+ for (int c = 0; c < 3; c++)
+ {
+ for (int t = 0; t < pv->thread_count; t++)
{
- free(pv->frame_tmp[c][frames-1].mem_pre);
- pv->frame_tmp[c][frames-1].mem_pre = NULL;
+ // Release last frame in buffer
+ if (pv->frame[t].plane[c].mem_pre != NULL &&
+ pv->frame[t].plane[c].mem_pre != pv->frame[t].plane[c].mem)
+ {
+ free(pv->frame[t].plane[c].mem_pre);
+ pv->frame[t].plane[c].mem_pre = NULL;
+ }
+ if (pv->frame[t].plane[c].mem != NULL)
+ {
+ free(pv->frame[t].plane[c].mem);
+ pv->frame[t].plane[c].mem = NULL;
+ }
}
- if (pv->frame_tmp[c][frames-1].mem != NULL)
+ }
+ // Shift frames in buffer down
+ for (int f = 0; f < pv->max_frames; f++)
+ {
+ // Don't move the mutex!
+ Frame frame = pv->frame[f];
+ pv->frame[f] = pv->frame[f+pv->thread_count];
+ for (int c = 0; c < 3; c++)
{
- free(pv->frame_tmp[c][frames-1].mem);
- pv->frame_tmp[c][frames-1].mem = NULL;
+ pv->frame[f].plane[c].mutex = frame.plane[c].mutex;
+ pv->frame[f+pv->thread_count].plane[c].mem_pre = NULL;
+ pv->frame[f+pv->thread_count].plane[c].mem = NULL;
}
- pv->frame_ready[c][frames-1] = 0;
+ }
+ pv->next_frame -= pv->thread_count;
- // Shift frames in buffer down one level
- for (int f = frames-1; f > 0; f--)
+ // Collect results from taskset
+ hb_buffer_t *last = NULL, *out = NULL;
+ for (int t = 0; t < pv->thread_count; t++)
+ {
+ if (out == NULL)
{
- pv->frame_tmp[c][f] = pv->frame_tmp[c][f-1];
- pv->frame_ready[c][f] = pv->frame_ready[c][f-1];
+ out = last = pv->thread_data[t]->out;
}
+ else
+ {
+ last->next = pv->thread_data[t]->out;
+ last = pv->thread_data[t]->out;
+ }
+ }
+ return out;
+}
- // Extend copy of plane with extra border and place in buffer
- int border = ((pv->range[c] + 2) / 2 + 15) /16*16;
- int w = in->plane[c].stride + 2*border;
- int h = in->plane[c].height + 2*border;
- nlmeans_alloc(in->plane[c].data,
- in->plane[c].stride,
- in->plane[c].height,
- &pv->frame_tmp[c][0],
- w,
- h,
- border);
- nlmeans_prefilter(&pv->frame_tmp[c][0], pv->prefilter[c]);
- pv->frame_ready[c][0] = 1;
+static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
+{
+ hb_buffer_t *out = NULL, *last = NULL;
+
+ for (int f = 0; f < pv->next_frame; f++)
+ {
+ Frame *frame = &pv->frame[f];
+ hb_buffer_t *buf;
+ buf = hb_frame_buffer_init(frame->fmt, frame->width, frame->height);
- if (pv->prefilter[c] & NLMEANS_PREFILTER_MODE_PASSTHRU)
+ for (int c = 0; c < 3; c++)
{
- nlmeans_deborder(pv->frame_tmp[c][0].mem_pre, out->plane[c].data, w, h, border);
- continue;
+ if (pv->strength[c] == 0)
+ {
+ nlmeans_deborder(&frame->plane[c], buf->plane[c].data,
+ buf->plane[c].width, buf->plane[c].stride,
+ buf->plane[c].height);
+ continue;
+ }
+ if (pv->prefilter[c] & NLMEANS_PREFILTER_MODE_PASSTHRU)
+ {
+ nlmeans_prefilter(&pv->frame[f].plane[c], pv->prefilter[c]);
+ nlmeans_deborder(&frame->plane[c], buf->plane[c].data,
+ buf->plane[c].width, buf->plane[c].stride,
+ buf->plane[c].height);
+ continue;
+ }
+
+ int nframes = pv->next_frame - f;
+ if (pv->nframes[c] < nframes)
+ nframes = pv->nframes[c];
+ // Process current plane
+ nlmeans_plane(frame,
+ pv->prefilter[c],
+ c,
+ nframes,
+ buf->plane[c].data,
+ buf->plane[c].width,
+ buf->plane[c].stride,
+ buf->plane[c].height,
+ pv->strength[c],
+ pv->origin_tune[c],
+ pv->patch_size[c],
+ pv->range[c]);
+ }
+ buf->s = frame->s;
+ if (out == NULL)
+ {
+ out = last = buf;
}
+ else
+ {
+ last->next = buf;
+ last = buf;
+ }
+ }
+ return out;
+}
- // Process current plane
- nlmeans_plane(pv->frame_tmp[c],
- pv->frame_ready[c],
- out->plane[c].data,
- in->plane[c].stride,
- in->plane[c].height,
- pv->strength[c],
- pv->origin_tune[c],
- pv->patch_size[c],
- pv->range[c]);
+static int nlmeans_work(hb_filter_object_t *filter,
+ hb_buffer_t **buf_in,
+ hb_buffer_t **buf_out )
+{
+ hb_filter_private_t *pv = filter->private_data;
+ hb_buffer_t *in = *buf_in;
- }
+ if (in->size <= 0)
+ {
+ hb_buffer_t *last;
+ // Flush buffered frames
+ last = *buf_out = nlmeans_filter_flush(pv);
- out->s = in->s;
- hb_buffer_move_subs(out, in);
+ // And terminate the buffer list with a null buffer
+ if (last != NULL)
+ {
+ while (last->next != NULL)
+ last = last->next;
+ last->next = in;
+ }
+ else
+ {
+ *buf_out = in;
+ }
+ *buf_in = NULL;
+ return HB_FILTER_DONE;
+ }
- *buf_out = out;
+ nlmeans_add_frame(pv, in);
+ *buf_out = nlmeans_filter(pv);
return HB_FILTER_OK;
}