 include/sys/vdev.h      |   3
 include/sys/vdev_impl.h |  17
 include/sys/zio.h       |  15
 man/man4/zfs.4          |   6
 module/zfs/spa_misc.c   |   2
 module/zfs/txg.c        |  13
 module/zfs/vdev.c       |  16
 module/zfs/vdev_queue.c | 305
 8 files changed, 205 insertions(+), 172 deletions(-)
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 26c834ff5..03e1f438a 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vd, zio_priority_t p);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 74b3737d8..2b22b973b 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
/*
* Virtual device properties
*/
-typedef struct vdev_queue_class {
- uint32_t vqc_active;
-
- /*
- * Sorted by offset or timestamp, depending on if the queue is
- * LBA-ordered vs FIFO.
- */
- avl_tree_t vqc_queued_tree;
+typedef union vdev_queue_class {
+ list_t vqc_list;
+ avl_tree_t vqc_tree;
} vdev_queue_class_t;
struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
- avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
- avl_tree_t vq_trim_offset_tree;
uint64_t vq_last_offset;
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
+ uint32_t vq_cqueued; /* Classes with queued I/Os. */
+ uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+ uint32_t vq_active; /* Number of active I/Os. */
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
+ list_t vq_active_list; /* List of active I/Os. */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */
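[Note on the new vq_cqueued field: it is a bitmask with bit p set while priority class p has queued I/Os, maintained by vdev_queue_class_add()/vdev_queue_class_remove() in vdev_queue.c below, so that issue selection can skip empty classes without touching any list or tree. A minimal standalone sketch of that bookkeeping, with hypothetical names, not ZFS code:]

#include <assert.h>
#include <stdint.h>

/* Stand-ins for vq->vq_cqueued and the per-class queue lengths. */
static uint32_t cqueued;
static uint32_t nqueued[4];

static void
class_add(uint32_t p)
{
	cqueued |= 1U << p;
	nqueued[p]++;
}

static void
class_remove(uint32_t p)
{
	uint32_t empty = (--nqueued[p] == 0);
	/* Branch-free: clears bit p only when the class just emptied. */
	cqueued &= ~(empty << p);
}

int
main(void)
{
	class_add(2);
	class_add(2);
	class_remove(2);
	assert(cqueued == (1U << 2));	/* one I/O still queued in class 2 */
	class_remove(2);
	assert(cqueued == 0);		/* class emptied, bit dropped */
	return (0);
}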
diff --git a/include/sys/zio.h b/include/sys/zio.h
index ec32211f6..85217b873 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -436,6 +436,12 @@ typedef struct zio_link {
list_node_t zl_child_node;
} zio_link_t;
+enum zio_qstate {
+ ZIO_QS_NONE = 0,
+ ZIO_QS_QUEUED,
+ ZIO_QS_ACTIVE,
+};
+
struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
@@ -479,6 +485,12 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;
metaslab_class_t *io_metaslab_class; /* dva throttle class */
+ enum zio_qstate io_queue_state; /* vdev queue state */
+ union {
+ list_node_t l;
+ avl_node_t a;
+ } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
+ avl_node_t io_offset_node; /* vdev offset queues */
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_queued_timestamp;
@@ -486,9 +498,6 @@ struct zio {
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
- avl_node_t io_queue_node;
- avl_node_t io_offset_node;
- avl_node_t io_alloc_node;
zio_alloc_list_t io_alloc_list;
/* Internal pipeline state */
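[Note on the zio.h change: the three separate link nodes (io_queue_node, io_offset_node, io_alloc_node) become one union plus io_offset_node. The change relies on a zio being linked into at most one of the allocator tree, a class FIFO list, a class LBA tree, or the active list at any time, with io_queue_state recording which kind of queue, if any, holds it; io_offset_node stays separate because a queued zio sits in a class queue and an offset tree simultaneously. A compilable sketch of the tagged-union pattern, with simplified hypothetical types, not ZFS code:]

/* Simplified link types standing in for list_node_t / avl_node_t. */
typedef struct list_node { struct list_node *next, *prev; } list_node_t;
typedef struct avl_node { struct avl_node *child[2], *parent; } avl_node_t;

enum qstate { QS_NONE = 0, QS_QUEUED, QS_ACTIVE };

typedef struct io {
	enum qstate	state;		/* which queue, if any, links us */
	union {
		list_node_t	l;	/* FIFO class queue / active list */
		avl_node_t	a;	/* LBA class tree / allocator tree */
	} queue_node;
	avl_node_t	offset_node;	/* separate: coexists with the above */
} io_t;

int
main(void)
{
	io_t io = { .state = QS_NONE };
	(void) io;
	return (0);
}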
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 5fbd9d7db..04bbbc5fd 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
Flush dirty data to disk at least every this many seconds (maximum TXG
duration).
.
-.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-Allow TRIM I/O operations to be aggregated.
-This is normally not helpful because the extents to be trimmed
-will already have been aggregated by the metaslab.
-This option is provided for debugging and performance analysis.
-.
.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
Max vdev I/O aggregation size.
.
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 9ef948e9e..8dc83445e 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
- sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index ec61cabca..a67c04344 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
- mutex_enter(&tl->tl_lock);
- for (int i = 0; i < TXG_SIZE; i++) {
- if (!txg_list_empty_impl(tl, i)) {
- mutex_exit(&tl->tl_lock);
- return (B_FALSE);
- }
- }
- mutex_exit(&tl->tl_lock);
- return (B_TRUE);
+ boolean_t res = B_TRUE;
+ for (int i = 0; i < TXG_SIZE; i++)
+ res &= (tl->tl_head[i] == NULL);
+ return (res);
}
/*
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 612e66c3a..30551feb6 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
- for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
- vsx->vsx_active_queue[t] =
- vd->vdev_queue.vq_class[t].vqc_active;
- vsx->vsx_pend_queue[t] = avl_numnodes(
- &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+ vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
}
}
}
@@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
- if (avl_numnodes(&vq->vq_active_tree) > 0) {
+ if (vq->vq_active > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
- zfs_dbgmsg("slow vdev: %s has %lu active IOs",
- vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+ zfs_dbgmsg("slow vdev: %s has %u active IOs",
+ vd->vdev_path, vq->vq_active);
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
- fio = avl_first(&vq->vq_active_tree);
+ fio = list_head(&vq->vq_active_list);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
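[Note on the deadman change: because vdev_queue_pending_add() appends with list_insert_tail(), vq_active_list is ordered by issue time and list_head() yields the longest-outstanding I/O, which is what the age check wants. The old avl_first() on the offset-compared vq_active_tree returned the lowest-offset active I/O, not the oldest. A standalone sketch of that FIFO property, hypothetical names, not ZFS code:]

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef struct io {
	uint64_t	timestamp;
	struct io	*next;
} io_t;

typedef struct { io_t *head, *tail; } list_t;

static void
insert_tail(list_t *l, io_t *io)
{
	io->next = NULL;
	if (l->tail != NULL)
		l->tail->next = io;
	else
		l->head = io;
	l->tail = io;
}

int
main(void)
{
	list_t active = { NULL, NULL };
	io_t a = { .timestamp = 10 };
	io_t b = { .timestamp = 20 };

	/* I/Os are appended as they are issued ... */
	insert_tail(&active, &a);
	insert_tail(&active, &b);

	/* ... so the head is always the oldest outstanding I/O. */
	assert(active.head->timestamp == 10);
	return (0);
}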
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index abb7d0662..08d918467 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300;
*/
uint_t zfs_vdev_def_queue_depth = 32;
-/*
- * Allow TRIM I/Os to be aggregated. This should normally not be needed since
- * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
- * by the TRIM code in zfs_trim.c.
- */
-static uint_t zfs_vdev_aggregate_trim = 0;
-
static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
@@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
return (TREE_PCMP(z1, z2));
}
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
- return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
- ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
- if (t == ZIO_TYPE_READ)
- return (&vq->vq_read_offset_tree);
- else if (t == ZIO_TYPE_WRITE)
- return (&vq->vq_write_offset_tree);
- else
- return (&vq->vq_trim_offset_tree);
-}
+#define VDQ_T_SHIFT 29
static int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
+vdev_queue_to_compare(const void *x1, const void *x2)
{
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
- int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+ int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
+ z2->io_timestamp >> VDQ_T_SHIFT);
+ int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
+ int cmp = tcmp ? tcmp : ocmp;
- if (likely(cmp))
+ if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
return (cmp);
return (TREE_PCMP(z1, z2));
}
+static inline boolean_t
+vdev_queue_class_fifo(zio_priority_t p)
+{
+ return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM);
+}
+
+static void
+vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_priority_t p = zio->io_priority;
+ vq->vq_cqueued |= 1U << p;
+ if (vdev_queue_class_fifo(p))
+ list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+ else
+ avl_add(&vq->vq_class[p].vqc_tree, zio);
+}
+
+static void
+vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_priority_t p = zio->io_priority;
+ uint32_t empty;
+ if (vdev_queue_class_fifo(p)) {
+ list_t *list = &vq->vq_class[p].vqc_list;
+ list_remove(list, zio);
+ empty = list_is_empty(list);
+ } else {
+ avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
+ avl_remove(tree, zio);
+ empty = avl_is_empty(tree);
+ }
+ vq->vq_cqueued &= ~(empty << p);
+}
+
static uint_t
vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
{
@@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa)
}
static uint_t
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_max_active);
case ZIO_PRIORITY_ASYNC_WRITE:
- return (vdev_queue_max_async_writes(spa));
+ return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
case ZIO_PRIORITY_SCRUB:
if (vq->vq_ia_active > 0) {
return (MIN(vq->vq_nia_credit,
@@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
- spa_t *spa = vq->vq_vdev->vdev_spa;
- zio_priority_t p, n;
+ uint32_t cq = vq->vq_cqueued;
+ zio_priority_t p, p1;
- if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
/*
@@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* Do round-robin to reduce starvation due to zfs_vdev_max_active
* and vq_nia_credit limits.
*/
- for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
- p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_min_active(vq, p)) {
- vq->vq_last_prio = p;
- return (p);
- }
+ p1 = vq->vq_last_prio + 1;
+ if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
+ p1 = 0;
+ for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_min_active(vq, p))
+ goto found;
+ }
+ for (p = 0; p < p1; p++) {
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_min_active(vq, p))
+ goto found;
}
/*
@@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_max_active(spa, vq, p)) {
- vq->vq_last_prio = p;
- return (p);
- }
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_max_active(vq, p))
+ break;
}
- /* No eligible queued i/os */
- return (ZIO_PRIORITY_NUM_QUEUEABLE);
+found:
+ vq->vq_last_prio = p;
+ return (p);
}
void
@@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd)
vdev_queue_t *vq = &vd->vdev_queue;
zio_priority_t p;
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
vq->vq_vdev = vd;
- taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
-
- avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- int (*compfn) (const void *, const void *);
-
- /*
- * The synchronous/trim i/o queues are dispatched in FIFO rather
- * than LBA order. This provides more consistent latency for
- * these i/os.
- */
- if (p == ZIO_PRIORITY_SYNC_READ ||
- p == ZIO_PRIORITY_SYNC_WRITE ||
- p == ZIO_PRIORITY_TRIM) {
- compfn = vdev_queue_timestamp_compare;
+ if (vdev_queue_class_fifo(p)) {
+ list_create(&vq->vq_class[p].vqc_list,
+ sizeof (zio_t),
+ offsetof(struct zio, io_queue_node.l));
} else {
- compfn = vdev_queue_offset_compare;
+ avl_create(&vq->vq_class[p].vqc_tree,
+ vdev_queue_to_compare, sizeof (zio_t),
+ offsetof(struct zio, io_queue_node.a));
}
- avl_create(vdev_queue_class_tree(vq, p), compfn,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+ avl_create(&vq->vq_read_offset_tree,
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(&vq->vq_write_offset_tree,
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
vq->vq_last_offset = 0;
+ list_create(&vq->vq_active_list, sizeof (struct zio),
+ offsetof(struct zio, io_queue_node.l));
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
@@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
- for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
- avl_destroy(vdev_queue_class_tree(vq, p));
- avl_destroy(&vq->vq_active_tree);
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (vdev_queue_class_fifo(p))
+ list_destroy(&vq->vq_class[p].vqc_list);
+ else
+ avl_destroy(&vq->vq_class[p].vqc_tree);
+ }
+ avl_destroy(&vq->vq_read_offset_tree);
+ avl_destroy(&vq->vq_write_offset_tree);
+ list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+ zio->io_queue_state = ZIO_QS_QUEUED;
+ vdev_queue_class_add(vq, zio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ avl_add(&vq->vq_read_offset_tree, zio);
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ avl_add(&vq->vq_write_offset_tree, zio);
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
- avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+ vdev_queue_class_remove(vq, zio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ avl_remove(&vq->vq_read_offset_tree, zio);
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ avl_remove(&vq->vq_write_offset_tree, zio);
+ zio->io_queue_state = ZIO_QS_NONE;
}
static boolean_t
@@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active++;
+ vq->vq_cactive[zio->io_priority]++;
+ vq->vq_active++;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (++vq->vq_ia_active == 1)
vq->vq_nia_credit = 1;
} else if (vq->vq_ia_active > 0) {
vq->vq_nia_credit--;
}
- avl_add(&vq->vq_active_tree, zio);
+ zio->io_queue_state = ZIO_QS_ACTIVE;
+ list_insert_tail(&vq->vq_active_list, zio);
}
static void
@@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active--;
+ vq->vq_cactive[zio->io_priority]--;
+ vq->vq_active--;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (--vq->vq_ia_active == 0)
vq->vq_nia_credit = 0;
@@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
- avl_remove(&vq->vq_active_tree, zio);
+ list_remove(&vq->vq_active_list, zio);
+ zio->io_queue_state = ZIO_QS_NONE;
}
static void
@@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
uint64_t maxgap = 0;
uint64_t size;
uint64_t limit;
- int maxblocksize;
boolean_t stretch = B_FALSE;
- avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
- zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
uint64_t next_offset;
abd_t *abd;
+ avl_tree_t *t;
+
+ /*
+ * TRIM aggregation should not be needed since code in zfs_trim.c can
+ * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM)
+ return (NULL);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+ return (NULL);
- maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
if (vq->vq_vdev->vdev_nonrot)
limit = zfs_vdev_aggregation_limit_non_rotating;
else
limit = zfs_vdev_aggregation_limit;
- limit = MIN(limit, maxblocksize);
-
- if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
- return (NULL);
-
- /*
- * While TRIM commands could be aggregated based on offset this
- * behavior is disabled until it's determined to be beneficial.
- */
- if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+ if (limit == 0)
return (NULL);
+ limit = MIN(limit, SPA_MAXBLOCKSIZE);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
@@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
first = last = zio;
- if (zio->io_type == ZIO_TYPE_READ)
+ if (zio->io_type == ZIO_TYPE_READ) {
maxgap = zfs_vdev_read_gap_limit;
+ t = &vq->vq_read_offset_tree;
+ } else {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ t = &vq->vq_write_offset_tree;
+ }
/*
* We can aggregate I/Os that are sufficiently adjacent and of
@@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-optional I/O.
*/
+ zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= limit &&
@@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
(IO_SPAN(first, dio) <= limit ||
(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
- IO_SPAN(first, dio) <= maxblocksize &&
+ IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
IO_GAP(last, dio) <= maxgap &&
dio->io_type == zio->io_type) {
last = dio;
@@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
return (NULL);
size = IO_SPAN(first, last);
- ASSERT3U(size, <=, maxblocksize);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
abd = abd_alloc_gang();
if (abd == NULL)
@@ -824,19 +847,30 @@ again:
return (NULL);
}
- /*
- * For LBA-ordered queues (async / scrub / initializing), issue the
- * i/o which follows the most recently issued i/o in LBA (offset) order.
- *
- * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
- */
- tree = vdev_queue_class_tree(vq, p);
- vq->vq_io_search.io_timestamp = 0;
- vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
- VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
- zio = avl_nearest(tree, idx, AVL_AFTER);
- if (zio == NULL)
- zio = avl_first(tree);
+ if (vdev_queue_class_fifo(p)) {
+ zio = list_head(&vq->vq_class[p].vqc_list);
+ } else {
+ /*
+ * For LBA-ordered queues (async / scrub / initializing),
+ * issue the I/O which follows the most recently issued I/O
+ * in LBA (offset) order, but to avoid starvation only within
+ * the same 0.5 second interval as the first I/O.
+ */
+ tree = &vq->vq_class[p].vqc_tree;
+ zio = aio = avl_first(tree);
+ if (zio->io_offset < vq->vq_last_offset) {
+ vq->vq_io_search.io_timestamp = zio->io_timestamp;
+ vq->vq_io_search.io_offset = vq->vq_last_offset;
+ zio = avl_find(tree, &vq->vq_io_search, &idx);
+ if (zio == NULL) {
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL ||
+ (zio->io_timestamp >> VDQ_T_SHIFT) !=
+ (aio->io_timestamp >> VDQ_T_SHIFT))
+ zio = aio;
+ }
+ }
+ }
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
@@ -967,7 +1001,6 @@ void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- avl_tree_t *tree;
/*
* ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
- tree = vdev_queue_class_tree(vq, zio->io_priority);
- if (avl_find(tree, zio, NULL) == zio) {
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ if (zio->io_queue_state == ZIO_QS_QUEUED) {
+ vdev_queue_class_remove(vq, zio);
zio->io_priority = priority;
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ vdev_queue_class_add(vq, zio);
+ } else if (zio->io_queue_state == ZIO_QS_NONE) {
zio->io_priority = priority;
}
@@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* vq_lock mutex use here, instead we prefer to keep it lock free for
* performance.
*/
-int
+uint32_t
vdev_queue_length(vdev_t *vd)
{
- return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+ return (vd->vdev_queue.vq_active);
}
uint64_t
@@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd)
return (vd->vdev_queue.vq_last_offset);
}
+uint64_t
+vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+ if (vdev_queue_class_fifo(p))
+ return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
+ else
+ return (avl_numnodes(&vq->vq_class[p].vqc_tree));
+}
+
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
"Max vdev I/O aggregation size");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
- "Allow TRIM I/O to be aggregated");
-
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
"Aggregate read I/O over gap");