aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJustin T. Gibbs <[email protected]>2015-04-11 14:51:06 -0400
committerBrian Behlendorf <[email protected]>2015-04-24 15:16:56 -0700
commitec8501ee1274205f277a7287c3de8119d361afaf (patch)
treeeaf3ba75d849fa44a96b0c2bbb000c9fc3991fe9
parent0bf8501ae1f7c995d54c6061d8da29419ec4fed0 (diff)
5313 Allow I/Os to be aggregated across ZIO priority classes
Reviewed by: Andriy Gapon <[email protected]> Reviewed by: Will Andrews <[email protected]> Reviewed by: Matt Ahrens <[email protected]> Reviewed by: George Wilson <[email protected]> Approved by: Robert Mustacchi <[email protected]> References: https://www.illumos.org/issues/5313 https://github.com/illumos/illumos-gate/commit/fe319232 Ported-by: DHE <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #3280
-rw-r--r--include/sys/vdev_impl.h2
-rw-r--r--include/sys/zio.h1
-rw-r--r--module/zfs/vdev_queue.c79
3 files changed, 51 insertions, 31 deletions
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index a8dc9510e..1048dec5e 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -113,6 +113,8 @@ struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
avl_tree_t vq_active_tree;
+ avl_tree_t vq_read_offset_tree;
+ avl_tree_t vq_write_offset_tree;
uint64_t vq_last_offset;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 18e7a40a3..0368d9c59 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -427,6 +427,7 @@ struct zio {
hrtime_t io_delta; /* vdev queue service delta */
uint64_t io_delay; /* vdev disk service delta (ticks) */
avl_node_t io_queue_node;
+ avl_node_t io_offset_node;
/* Internal pipeline state */
enum zio_flag io_flags;
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 3fa4219f2..cf0301649 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -190,6 +190,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
return (0);
}
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+ return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+ ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+ if (t == ZIO_TYPE_READ)
+ return (&vq->vq_read_offset_tree);
+ else
+ return (&vq->vq_write_offset_tree);
+}
+
int
vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
@@ -303,7 +319,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
/* find a queue that has not reached its minimum # outstanding i/os */
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_min_active(p))
return (p);
@@ -314,7 +330,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_max_active(spa, p))
return (p);
@@ -335,20 +351,27 @@ vdev_queue_init(vdev_t *vd)
avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ int (*compfn) (const void *, const void *);
+
/*
- * The synchronous i/o queues are FIFO rather than LBA ordered.
- * This provides more consistent latency for these i/os, and
- * they tend to not be tightly clustered anyway so there is
- * little to no throughput loss.
+ * The synchronous i/o queues are dispatched in FIFO rather
+ * than LBA order. This provides more consistent latency for
+ * these i/os.
*/
- boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
- p == ZIO_PRIORITY_SYNC_WRITE);
- avl_create(&vq->vq_class[p].vqc_queued_tree,
- fifo ? vdev_queue_timestamp_compare :
- vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+ compfn = vdev_queue_timestamp_compare;
+ else
+ compfn = vdev_queue_offset_compare;
+ avl_create(vdev_queue_class_tree(vq, p), compfn,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
}
@@ -359,8 +382,10 @@ vdev_queue_fini(vdev_t *vd)
zio_priority_t p;
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
- avl_destroy(&vq->vq_class[p].vqc_queued_tree);
+ avl_destroy(vdev_queue_class_tree(vq, p));
avl_destroy(&vq->vq_active_tree);
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
mutex_destroy(&vq->vq_lock);
}
@@ -372,7 +397,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
if (ssh->kstat != NULL) {
mutex_enter(&ssh->lock);
@@ -388,7 +414,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
if (ssh->kstat != NULL) {
mutex_enter(&ssh->lock);
@@ -472,8 +499,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
uint64_t maxgap = 0;
uint64_t size;
boolean_t stretch = B_FALSE;
- vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
- avl_tree_t *t = &vqc->vqc_queued_tree;
+ avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
@@ -486,15 +512,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
zfs_vdev_aggregation_limit =
MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
- /*
- * The synchronous i/o queues are not sorted by LBA, so we can't
- * find adjacent i/os. These i/os tend to not be tightly clustered,
- * or too large to aggregate, so this has little impact on performance.
- */
- if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
- zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
- return (NULL);
-
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
@@ -627,7 +644,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
zio_t *zio, *aio;
zio_priority_t p;
avl_index_t idx;
- vdev_queue_class_t *vqc;
+ avl_tree_t *tree;
again:
ASSERT(MUTEX_HELD(&vq->vq_lock));
@@ -645,14 +662,14 @@ again:
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
*/
- vqc = &vq->vq_class[p];
+ tree = vdev_queue_class_tree(vq, p);
vq->vq_io_search.io_timestamp = 0;
vq->vq_io_search.io_offset = vq->vq_last_offset + 1;
- VERIFY3P(avl_find(&vqc->vqc_queued_tree, &vq->vq_io_search,
+ VERIFY3P(avl_find(tree, &vq->vq_io_search,
&idx), ==, NULL);
- zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
+ zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
- zio = avl_first(&vqc->vqc_queued_tree);
+ zio = avl_first(tree);
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);