aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/sys/Makefile.am2
-rw-r--r--include/sys/arc.h75
-rw-r--r--include/sys/arc_impl.h117
-rw-r--r--include/sys/multilist.h105
-rw-r--r--include/sys/trace_arc.h54
-rw-r--r--include/sys/trace_multilist.h76
-rw-r--r--include/sys/zfs_context.h2
7 files changed, 363 insertions, 68 deletions
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 5211e6564..fd80b34ee 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -33,6 +33,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/efi_partition.h \
$(top_srcdir)/include/sys/metaslab.h \
$(top_srcdir)/include/sys/metaslab_impl.h \
+ $(top_srcdir)/include/sys/multilist.h \
$(top_srcdir)/include/sys/nvpair.h \
$(top_srcdir)/include/sys/nvpair_impl.h \
$(top_srcdir)/include/sys/range_tree.h \
@@ -53,6 +54,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/trace_dbuf.h \
$(top_srcdir)/include/sys/trace_dmu.h \
$(top_srcdir)/include/sys/trace_dnode.h \
+ $(top_srcdir)/include/sys/trace_multilist.h \
$(top_srcdir)/include/sys/trace_txg.h \
$(top_srcdir)/include/sys/trace_zil.h \
$(top_srcdir)/include/sys/trace_zrlock.h \
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 215c75b6d..0961d4b4d 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -38,6 +38,12 @@ extern "C" {
#include <sys/spa.h>
#include <sys/refcount.h>
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define ARC_EVICT_ALL (-1ULL)
+
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef struct arc_prune arc_prune_t;
@@ -53,10 +59,65 @@ arc_done_func_t arc_getbuf_func;
struct arc_prune {
arc_prune_func_t *p_pfunc;
void *p_private;
+ uint64_t p_adjust;
list_node_t p_node;
refcount_t p_refcnt;
};
+typedef enum arc_strategy {
+ ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */
+ ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */
+} arc_strategy_t;
+
+typedef enum arc_flags
+{
+ /*
+ * Public flags that can be passed into the ARC by external consumers.
+ */
+ ARC_FLAG_NONE = 1 << 0, /* No flags set; NOTE(review): value is bit 0 (1), not 0 - confirm */
+ ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */
+ ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */
+ ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */
+ ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
+ ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
+ ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
+
+ /*
+ * Private ARC flags. These flags are private ARC only flags that
+ * will show up in b_flags in the arc_buf_hdr_t. These flags should
+ * only be set by ARC code.
+ */
+ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
+ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
+ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
+ ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
+ ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
+ ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
+ ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
+ /* indicates that the buffer contains metadata (otherwise, data) */
+ ARC_FLAG_BUFC_METADATA = 1 << 16,
+
+ /* Flags specifying whether optional hdr struct fields are defined */
+ ARC_FLAG_HAS_L1HDR = 1 << 17,
+ ARC_FLAG_HAS_L2HDR = 1 << 18,
+
+ /*
+ * The arc buffer's compression mode is stored in the top 7 bits of the
+ * flags field, so these dummy flags are included so that MDB can
+ * interpret the enum properly.
+ */
+ ARC_FLAG_COMPRESS_0 = 1 << 24,
+ ARC_FLAG_COMPRESS_1 = 1 << 25,
+ ARC_FLAG_COMPRESS_2 = 1 << 26,
+ ARC_FLAG_COMPRESS_3 = 1 << 27,
+ ARC_FLAG_COMPRESS_4 = 1 << 28,
+ ARC_FLAG_COMPRESS_5 = 1 << 29,
+ ARC_FLAG_COMPRESS_6 = 1 << 30
+
+} arc_flags_t;
+
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
@@ -71,15 +132,6 @@ typedef enum arc_buf_contents {
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
-#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
-#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
-#define ARC_CACHED (1 << 4) /* I/O was already in cache */
-#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
-#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */
/*
* The following breakdowns of arc_size exist for kstat only.
@@ -106,7 +158,6 @@ typedef enum arc_state_type {
typedef struct arc_buf_info {
arc_state_type_t abi_state_type;
arc_buf_contents_t abi_state_contents;
- uint64_t abi_state_index;
uint32_t abi_flags;
uint32_t abi_datacnt;
uint64_t abi_size;
@@ -146,7 +197,7 @@ int arc_referenced(arc_buf_t *buf);
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
- uint32_t *arc_flags, const zbookmark_phys_t *zb);
+ arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
@@ -160,7 +211,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
boolean_t arc_clear_callback(arc_buf_t *buf);
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index e7068ea18..54f5e9f40 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -67,15 +67,25 @@ extern "C" {
*/
typedef struct arc_state {
- list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
- uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
- uint64_t arcs_size; /* total amount of data in this state */
- kmutex_t arcs_mtx;
+ /*
+ * list of evictable buffers
+ */
+ multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ uint64_t arcs_size;
+ /*
+ * supports the "dbufs" kstat
+ */
arc_state_type_t arcs_state;
} arc_state_t;
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -96,31 +106,49 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
- arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
- uint32_t b_flags;
uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
+ /* for waiting on writes to complete */
kcondvar_t b_cv;
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- uint64_t b_spa;
/* protected by arc state mutex */
arc_state_t *b_state;
- list_node_t b_arc_node;
+ multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
@@ -133,9 +161,10 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
- l2arc_buf_hdr_t *b_l2hdr;
- list_node_t b_l2node;
-};
+ arc_callback_t *b_acb;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
+} l1arc_buf_hdr_t;
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
@@ -146,15 +175,51 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ uint32_t b_hits;
+ /* real alloc'd buffer size depending on b_compress applied */
+ int32_t b_asize;
+
+ list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ /*
+ * Even though this checksum is only set/verified when a buffer is in
+ * the L1 cache, it needs to be in the set of common fields because it
+ * must be preserved from the time before a buffer is written out to
+ * L2ARC until after it is read back in.
+ */
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /* immutable */
+ int32_t b_size;
+ uint64_t b_spa;
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
+};
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/multilist.h b/include/sys/multilist.h
new file mode 100644
index 000000000..98d707dd7
--- /dev/null
+++ b/include/sys/multilist.h
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_MULTILIST_H
+#define _SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+ /*
+ * The mutex used internally to implement thread safe insertions
+ * and removals to this individual sublist. It can also be locked
+ * by a consumer using multilist_sublist_{lock,unlock}, which is
+ * useful if a consumer needs to traverse the list in a thread
+ * safe manner.
+ */
+ kmutex_t mls_lock;
+ /*
+ * The actual list object containing all objects in this sublist.
+ */
+ list_t mls_list;
+ /*
+ * Pad to cache line, in an effort to try and prevent cache line
+ * contention.
+ */
+} ____cacheline_aligned;
+
+struct multilist {
+ /*
+ * This is used to get to the multilist_node_t structure given
+ * the void *object contained on the list.
+ */
+ size_t ml_offset;
+ /*
+ * The number of sublists used internally by this multilist.
+ */
+ uint64_t ml_num_sublists;
+ /*
+ * Pointer to the array of actual sublists.
+ */
+ multilist_sublist_t *ml_sublists;
+ /*
+ * Pointer to function which determines the sublist to use
+ * when inserting and removing objects from this multilist.
+ * Please see the comment above multilist_create for details.
+ */
+ multilist_sublist_index_func_t *ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+void multilist_create(multilist_t *, size_t, size_t, unsigned int,
+ multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int multilist_link_active(multilist_node_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h
index 8b885eff7..b9df228ea 100644
--- a/include/sys/trace_arc.h
+++ b/include/sys/trace_arc.h
@@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
- __field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_dva_word[0] = ab->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
__entry->hdr_birth = ab->b_birth;
- __entry->hdr_cksum0 = ab->b_cksum0;
__entry->hdr_flags = ab->b_flags;
- __entry->hdr_datacnt = ab->b_datacnt;
- __entry->hdr_type = ab->b_type;
+ __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt;
__entry->hdr_size = ab->b_size;
__entry->hdr_spa = ab->b_spa;
- __entry->hdr_state_type = ab->b_state->arcs_state;
- __entry->hdr_access = ab->b_arc_access;
- __entry->hdr_mru_hits = ab->b_mru_hits;
- __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits;
- __entry->hdr_mfu_hits = ab->b_mfu_hits;
- __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits;
- __entry->hdr_l2_hits = ab->b_l2_hits;
- __entry->hdr_refcount = ab->b_refcnt.rc_count;
+ __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state;
+ __entry->hdr_access = ab->b_l1hdr.b_arc_access;
+ __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits;
+ __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
+ __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
+ __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
+ __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits;
+ __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
- TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
- "flags 0x%x datacnt %u type %u size %llu spa %llu "
+ TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
+ "flags 0x%x datacnt %u size %llu spa %llu "
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
- __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
- __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
+ __entry->hdr_birth, __entry->hdr_flags,
+ __entry->hdr_datacnt, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type,
__entry->hdr_access, __entry->hdr_mru_hits,
@@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
TP_STRUCT__entry(
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
- __field(uint64_t, hdr_cksum0)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_datacnt)
__field(arc_buf_contents_t, hdr_type)
@@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0];
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
__entry->hdr_birth = hdr->b_birth;
- __entry->hdr_cksum0 = hdr->b_cksum0;
__entry->hdr_flags = hdr->b_flags;
- __entry->hdr_datacnt = hdr->b_datacnt;
- __entry->hdr_type = hdr->b_type;
+ __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt;
__entry->hdr_size = hdr->b_size;
__entry->hdr_spa = hdr->b_spa;
- __entry->hdr_state_type = hdr->b_state->arcs_state;
- __entry->hdr_access = hdr->b_arc_access;
- __entry->hdr_mru_hits = hdr->b_mru_hits;
- __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits;
- __entry->hdr_mfu_hits = hdr->b_mfu_hits;
- __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
- __entry->hdr_l2_hits = hdr->b_l2_hits;
- __entry->hdr_refcount = hdr->b_refcnt.rc_count;
+ __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state;
+ __entry->hdr_access = hdr->b_l1hdr.b_arc_access;
+ __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits;
+ __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
+ __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
+ __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
+ __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits;
+ __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];
__entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1];
@@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->zb_level = zb->zb_level;
__entry->zb_blkid = zb->zb_blkid;
),
- TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
- "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u "
+ TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
+ "flags 0x%x datacnt %u size %llu spa %llu state_type %u "
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
"lsize %llu } zb { objset %llu object %llu level %lli "
"blkid %llu }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
- __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
- __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
+ __entry->hdr_birth, __entry->hdr_flags,
+ __entry->hdr_datacnt, __entry->hdr_size,
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,
diff --git a/include/sys/trace_multilist.h b/include/sys/trace_multilist.h
new file mode 100644
index 000000000..11d2f2701
--- /dev/null
+++ b/include/sys/trace_multilist.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MULTILIST_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+
+/*
+ * Generic support for three argument tracepoints of the form
+ * DTRACE_PROBE3(..., multilist_t *, ..., unsigned int, ..., void *, ...).
+ *
+ * Only the multilist's offset and sublist count and the sublist index
+ * are recorded; the object pointer is accepted but not captured.
+ * ml_offset is a size_t, hence the %zu conversion in TP_printk.
+ */
+
+DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class,
+ TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj),
+ TP_ARGS(ml, sublist_idx, obj),
+ TP_STRUCT__entry(
+ __field(size_t, ml_offset)
+ __field(uint64_t, ml_num_sublists)
+
+ __field(unsigned int, sublist_idx)
+ ),
+ TP_fast_assign(
+ __entry->ml_offset = ml->ml_offset;
+ __entry->ml_num_sublists = ml->ml_num_sublists;
+
+ __entry->sublist_idx = sublist_idx;
+ ),
+ TP_printk("ml { offset %zu numsublists %llu sublistidx %u } ",
+ __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx)
+);
+
+#define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \
+DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \
+ TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \
+ TP_ARGS(ml, sublist_idx, obj))
+DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert);
+DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove);
+
+#endif /* _TRACE_MULTILIST_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace_multilist
+#include <trace/define_trace.h>
+
+#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 3dc54f1d7..8b9a5f46f 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -468,6 +468,7 @@ extern void taskq_init_ent(taskq_ent_t *);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
+extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, kthread_t *);
extern int taskq_cancel_id(taskq_t *, taskqid_t);
extern void system_taskq_init(void);
@@ -609,6 +610,7 @@ extern void delay(clock_t ticks);
} while (0);
#define max_ncpus 64
+#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN))
#define minclsyspri 60
#define maxclsyspri 99