summary refs log tree commit diff stats
path: root/module/zfs/zio.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r-- module/zfs/zio.c 543
1 files changed, 444 insertions, 99 deletions
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 8a063ab7f..0147cb17c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -39,6 +39,7 @@
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/metaslab_impl.h>
#include <sys/time.h>
#include <sys/trace_zio.h>
@@ -48,9 +49,15 @@
* ==========================================================================
*/
const char *zio_type_name[ZIO_TYPES] = {
+ /*
+ * Note: Linux kernel thread name length is limited,
+ * so these names will differ from upstream OpenZFS.
+ */
"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};
+int zio_dva_throttle_enabled = B_TRUE;
+
/*
* ==========================================================================
* I/O kmem caches
@@ -100,6 +107,8 @@ int zio_buf_debug_limit = 0;
static inline void __zio_execute(zio_t *zio);
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
void
zio_init(void)
{
@@ -368,52 +377,39 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
-/*
- * NOTE - Callers to zio_walk_parents() and zio_walk_children must
- * continue calling these functions until they return NULL.
- * Otherwise, the next caller will pick up the list walk in
- * some indeterminate state. (Otherwise every caller would
- * have to pass in a cookie to keep the state represented by
- * io_walk_link, which gets annoying.)
- */
zio_t *
-zio_walk_parents(zio_t *cio)
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
- zio_link_t *zl = cio->io_walk_link;
list_t *pl = &cio->io_parent_list;
- zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
- cio->io_walk_link = zl;
-
- if (zl == NULL)
+ *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+ if (*zl == NULL)
return (NULL);
- ASSERT(zl->zl_child == cio);
- return (zl->zl_parent);
+ ASSERT((*zl)->zl_child == cio);
+ return ((*zl)->zl_parent);
}
zio_t *
-zio_walk_children(zio_t *pio)
+zio_walk_children(zio_t *pio, zio_link_t **zl)
{
- zio_link_t *zl = pio->io_walk_link;
list_t *cl = &pio->io_child_list;
- zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
- pio->io_walk_link = zl;
-
- if (zl == NULL)
+ *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+ if (*zl == NULL)
return (NULL);
- ASSERT(zl->zl_parent == pio);
- return (zl->zl_child);
+ ASSERT((*zl)->zl_parent == pio);
+ return ((*zl)->zl_child);
}
zio_t *
zio_unique_parent(zio_t *cio)
{
- zio_t *pio = zio_walk_parents(cio);
+ zio_link_t *zl = NULL;
+ zio_t *pio = zio_walk_parents(cio, &zl);
- VERIFY(zio_walk_parents(cio) == NULL);
+ VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
return (pio);
}
@@ -469,7 +465,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
-
kmem_cache_free(zio_link_cache, zl);
}
@@ -483,6 +478,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
zio->io_stage >>= 1;
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
zio->io_stall = countp;
waiting = B_TRUE;
}
@@ -507,9 +503,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
+ zio_taskq_type_t type =
+ pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+ ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
- __zio_execute(pio);
+ /*
+ * Dispatch the parent zio in its own taskq so that
+ * the child can continue to make progress. This also
+ * prevents overflowing the stack when we have deeply nested
+ * parent-child relationships.
+ */
+ zio_taskq_dispatch(pio, type, B_FALSE);
} else {
mutex_exit(&pio->io_lock);
}
@@ -522,6 +527,24 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
zio->io_error = zio->io_child_error[c];
}
+int
+zio_timestamp_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+ int cmp;
+
+ cmp = AVL_CMP(z1->io_queued_timestamp, z2->io_queued_timestamp);
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = AVL_CMP(z1->io_offset, z2->io_offset);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_PCMP(z1, z2));
+}
+
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
@@ -594,6 +617,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ zio->io_pipeline_trace = ZIO_STAGE_OPEN;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@@ -797,7 +821,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
zio_t *zio;
zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
- ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
return (zio);
@@ -912,6 +936,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ASSERT0(zio->io_queued_timestamp);
return (zio);
}
@@ -1031,9 +1056,31 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
if (flags & ZIO_FLAG_IO_REPAIR)
flags &= ~ZIO_FLAG_SPECULATIVE;
+ /*
+ * If we're creating a child I/O that is not associated with a
+ * top-level vdev, then the child zio is not an allocating I/O.
+ * If this is a retried I/O then we ignore it since we will
+ * have already processed the original allocating I/O.
+ */
+ if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+ metaslab_class_t *mc = spa_normal_class(pio->io_spa);
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(type == ZIO_TYPE_WRITE);
+ ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+ pio->io_child_type == ZIO_CHILD_GANG);
+
+ flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ }
+
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
@@ -1131,40 +1178,16 @@ zio_read_bp_init(zio_t *zio)
static int
zio_write_bp_init(zio_t *zio)
{
- spa_t *spa = zio->io_spa;
- zio_prop_t *zp = &zio->io_prop;
- enum zio_compress compress = zp->zp_compress;
- blkptr_t *bp = zio->io_bp;
- uint64_t lsize = zio->io_lsize;
- uint64_t psize = zio->io_size;
- int pass = 1;
-
- EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
-
- /*
- * If our children haven't all reached the ready stage,
- * wait for them and then repeat this pipeline stage.
- */
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
- return (ZIO_PIPELINE_STOP);
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
- if (zio->io_children_ready != NULL) {
- /*
- * Now that all our children are ready, run the callback
- * associated with this zio in case it wants to modify the
- * data to be written.
- */
- ASSERT3U(zp->zp_level, >, 0);
- zio->io_children_ready(zio);
- }
-
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
+ blkptr_t *bp = zio->io_bp;
+ zio_prop_t *zp = &zio->io_prop;
+
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
@@ -1181,6 +1204,7 @@ zio_write_bp_init(zio_t *zio)
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
@@ -1198,10 +1222,56 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
zio->io_bp_override = NULL;
- BP_ZERO(bp);
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_compress(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_prop_t *zp = &zio->io_prop;
+ enum zio_compress compress = zp->zp_compress;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
+ int pass = 1;
+
+ EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
+
+ /*
+ * If our children haven't all reached the ready stage,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+ zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
+ return (ZIO_PIPELINE_STOP);
+
+ if (!IO_IS_ALLOCATING(zio))
+ return (ZIO_PIPELINE_CONTINUE);
+
+ if (zio->io_children_ready != NULL) {
+ /*
+ * Now that all our children are ready, run the callback
+ * associated with this zio in case it wants to modify the
+ * data to be written.
+ */
+ ASSERT3U(zp->zp_level, >, 0);
+ zio->io_children_ready(zio);
}
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+ ASSERT(zio->io_bp_override == NULL);
+
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
@@ -1273,6 +1343,15 @@ zio_write_bp_init(zio_t *zio)
psize, lsize, NULL);
}
}
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+
} else {
ASSERT3U(psize, !=, 0);
@@ -1328,7 +1407,6 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
-
return (ZIO_PIPELINE_CONTINUE);
}
@@ -1559,6 +1637,8 @@ __zio_execute(zio_t *zio)
{
zio->io_executor = curthread;
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
@@ -1603,6 +1683,7 @@ __zio_execute(zio_t *zio)
}
zio->io_stage = stage;
+ zio->io_pipeline_trace |= zio->io_stage;
rv = zio_pipeline[highbit64(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
@@ -1627,6 +1708,8 @@ zio_wait(zio_t *zio)
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
__zio_execute(zio);
@@ -1663,6 +1746,8 @@ zio_nowait(zio_t *zio)
zio_add_child(pio, zio);
}
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
__zio_execute(zio);
}
@@ -1677,6 +1762,7 @@ zio_reexecute(zio_t *pio)
{
zio_t *cio, *cio_next;
int c, w;
+ zio_link_t *zl = NULL;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
@@ -1688,6 +1774,7 @@ zio_reexecute(zio_t *pio)
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
+ pio->io_pipeline_trace = 0;
pio->io_error = 0;
for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@@ -1704,8 +1791,8 @@ zio_reexecute(zio_t *pio)
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
- for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
- cio_next = zio_walk_children(pio);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
@@ -1718,8 +1805,10 @@ zio_reexecute(zio_t *pio)
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
- if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+ pio->io_queued_timestamp = gethrtime();
__zio_execute(pio);
+ }
}
void
@@ -2120,6 +2209,7 @@ static int
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
+ metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@@ -2133,10 +2223,44 @@ zio_write_gang_block(zio_t *pio)
zio_prop_t zp;
int g, error;
- error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
- bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
- METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+ int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ flags |= METASLAB_ASYNC_ALLOC;
+ VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+
+ /*
+ * The logical zio has already placed a reservation for
+ * 'copies' allocation slots but gang blocks may require
+ * additional copies. These additional copies
+ * (i.e. gbh_copies - copies) are guaranteed to succeed
+ * since metaslab_class_throttle_reserve() always allows
+ * additional reservations for gang blocks.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+ pio, flags));
+ }
+
+ error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
if (error) {
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * If we failed to allocate the gang block header then
+ * we remove any additional allocation reservations that
+ * we placed here. The original reservation will
+ * be removed when the logical I/O goes to the ready
+ * stage.
+ */
+ metaslab_class_throttle_unreserve(mc,
+ gbh_copies - copies, pio);
+ }
+
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
@@ -2162,6 +2286,8 @@ zio_write_gang_block(zio_t *pio)
* Create and nowait the gang children.
*/
for (g = 0; resid != 0; resid -= lsize, g++) {
+ zio_t *cio;
+
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
SPA_MINBLOCKSIZE);
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
@@ -2175,11 +2301,26 @@ zio_write_gang_block(zio_t *pio)
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
- zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+ cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize,
lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Gang children won't throttle but we should
+ * account for their work, so reserve an allocation
+ * slot for them here.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zp.zp_copies, cio, flags));
+ }
+ zio_nowait(cio);
+
}
/*
@@ -2478,6 +2619,7 @@ zio_ddt_child_write_ready(zio_t *zio)
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
zio_t *pio;
+ zio_link_t *zl;
if (zio->io_error)
return;
@@ -2488,7 +2630,8 @@ zio_ddt_child_write_ready(zio_t *zio)
ddt_phys_fill(ddp, zio->io_bp);
- while ((pio = zio_walk_parents(zio)) != NULL)
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
@@ -2509,7 +2652,8 @@ zio_ddt_child_write_done(zio_t *zio)
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
- while (zio_walk_parents(zio) != NULL)
+ zio_link_t *zl = NULL;
+ while (zio_walk_parents(zio, &zl) != NULL)
ddt_phys_addref(ddp);
} else {
ddt_phys_clear(ddp);
@@ -2691,6 +2835,97 @@ zio_ddt_free(zio_t *zio)
* Allocate and free blocks
* ==========================================================================
*/
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa)
+{
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+
+ zio = avl_first(&spa->spa_alloc_tree);
+ if (zio == NULL)
+ return (NULL);
+
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Try to place a reservation for this zio. If we're unable to
+ * reserve then we throttle.
+ */
+ if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+ zio->io_prop.zp_copies, zio, 0)) {
+ return (NULL);
+ }
+
+ avl_remove(&spa->spa_alloc_tree, zio);
+ ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+ return (zio);
+}
+
+static int
+zio_dva_throttle(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_t *nio;
+
+ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+ !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+ zio->io_child_type == ZIO_CHILD_GANG ||
+ zio->io_flags & ZIO_FLAG_NODATA) {
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+ ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+ mutex_enter(&spa->spa_alloc_lock);
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ avl_add(&spa->spa_alloc_tree, zio);
+
+ nio = zio_io_to_allocate(zio->io_spa);
+ mutex_exit(&spa->spa_alloc_lock);
+
+ if (nio == zio)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ if (nio != NULL) {
+ ASSERT3U(nio->io_queued_timestamp, <=,
+ zio->io_queued_timestamp);
+ ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+ /*
+ * We are passing control to a new zio so make sure that
+ * it is processed by a different thread. We do this to
+ * avoid stack overflows that can occur when parents are
+ * throttled and children are making progress. We allow
+ * it to go to the head of the taskq since it's already
+ * been waiting.
+ */
+ zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
+ }
+ return (ZIO_PIPELINE_STOP);
+}
+
+void
+zio_allocate_dispatch(spa_t *spa)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_alloc_lock);
+ zio = zio_io_to_allocate(spa);
+ mutex_exit(&spa->spa_alloc_lock);
+ if (zio == NULL)
+ return;
+
+ ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+ ASSERT0(zio->io_error);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
+
static int
zio_dva_allocate(zio_t *zio)
{
@@ -2711,19 +2946,18 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
- /*
- * The dump device does not support gang blocks so allocation on
- * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
- * the "fast" gang feature.
- */
- flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
- flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
- METASLAB_GANG_CHILD : 0;
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ flags |= METASLAB_DONT_THROTTLE;
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
+ flags |= METASLAB_GANG_CHILD;
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
+ flags |= METASLAB_ASYNC_ALLOC;
+
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
- if (error) {
+ if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
@@ -2790,21 +3024,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
ASSERT(txg > spa_syncing_txg(spa));
- /*
- * ZIL blocks are always contiguous (i.e. not gang blocks) so we
- * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
- * when allocating them.
- */
if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, NULL,
- METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
+ new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL);
}
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, NULL,
- METASLAB_FASTWRITE);
+ new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL);
}
if (error == 0) {
@@ -2875,6 +3102,8 @@ zio_vdev_io_start(zio_t *zio)
return (ZIO_PIPELINE_STOP);
}
+ ASSERT3P(zio->io_logical, !=, zio);
+
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
@@ -3252,6 +3481,7 @@ zio_ready(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
@@ -3269,12 +3499,26 @@ zio_ready(zio_t *zio)
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
- if (zio->io_error)
+ if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ /*
+ * We were unable to allocate anything, unreserve and
+ * issue the next I/O to allocate.
+ */
+ metaslab_class_throttle_unreserve(
+ spa_normal_class(zio->io_spa),
+ zio->io_prop.zp_copies, zio);
+ zio_allocate_dispatch(zio->io_spa);
+ }
+ }
+
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
- pio = zio_walk_parents(zio);
+ pio = zio_walk_parents(zio, &zl);
mutex_exit(&zio->io_lock);
/*
@@ -3285,7 +3529,7 @@ zio_ready(zio_t *zio)
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
- pio_next = zio_walk_parents(zio);
+ pio_next = zio_walk_parents(zio, &zl);
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
@@ -3305,11 +3549,76 @@ zio_ready(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+ zio_t *lio = zio->io_logical;
+ zio_t *pio = zio_unique_parent(zio);
+ vdev_t *vd = zio->io_vd;
+ int flags = METASLAB_ASYNC_ALLOC;
+
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT(vd != NULL);
+ ASSERT3P(vd, ==, vd->vdev_top);
+ ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Parents of gang children can have two flavors -- ones that
+ * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+ * and ones that allocated the constituent blocks. The allocation
+ * throttle needs to know the allocating parent zio so we must find
+ * it here.
+ */
+ if (pio->io_child_type == ZIO_CHILD_GANG) {
+ /*
+ * If our parent is a rewrite gang child then our grandparent
+ * would have been the one that performed the allocation.
+ */
+ if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+ pio = zio_unique_parent(pio);
+ flags |= METASLAB_GANG_CHILD;
+ }
+
+ ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT3P(zio, !=, zio->io_logical);
+ ASSERT(zio->io_logical != NULL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+
+ mutex_enter(&pio->io_lock);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ mutex_exit(&pio->io_lock);
+
+ metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
+ 1, pio);
+
+ /*
+ * Call into the pipeline to see if there is more work that
+ * needs to be done. If there is work to be done it will be
+ * dispatched to another taskq thread.
+ */
+ zio_allocate_dispatch(zio->io_spa);
+}
+
static int
zio_done(zio_t *zio)
{
+ /*
+ * Always attempt to keep stack usage minimal here since
+ * we can be called recursively up to 19 levels deep.
+ */
zio_t *pio, *pio_next;
int c, w;
+ zio_link_t *zl = NULL;
/*
* If our children haven't all completed,
@@ -3321,6 +3630,33 @@ zio_done(zio_t *zio)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
+ /*
+ * If the allocation throttle is enabled, then update the accounting.
+ * We only track child I/Os that are part of an allocating async
+ * write. We must do this since the allocation is performed
+ * by the logical I/O but the actual write is done by child I/Os.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ ASSERT(spa_normal_class(
+ zio->io_spa)->mc_alloc_throttle_enabled);
+ zio_dva_throttle_done(zio);
+ }
+
+ /*
+ * If the allocation throttle is enabled, verify that
+ * we have decremented the refcounts for every I/O that was throttled.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_bp != NULL);
+ metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
+ VERIFY(refcount_not_held(
+ &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
+ }
+
+
for (c = 0; c < ZIO_CHILD_TYPES; c++)
for (w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
@@ -3506,13 +3842,15 @@ zio_done(zio_t *zio)
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
- for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
- zio_link_t *zl = zio->io_walk_link;
- pio_next = zio_walk_parents(zio);
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+ pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
- zio_remove_child(pio, zio, zl);
+ zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
}
@@ -3579,10 +3917,11 @@ zio_done(zio_t *zio)
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
- for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
- zio_link_t *zl = zio->io_walk_link;
- pio_next = zio_walk_parents(zio);
- zio_remove_child(pio, zio, zl);
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
@@ -3606,9 +3945,10 @@ zio_done(zio_t *zio)
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
zio_read_bp_init,
+ zio_write_bp_init,
zio_free_bp_init,
zio_issue_async,
- zio_write_bp_init,
+ zio_write_compress,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
@@ -3617,6 +3957,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
+ zio_dva_throttle,
zio_dva_allocate,
zio_dva_free,
zio_dva_claim,
@@ -3778,4 +4119,8 @@ MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
module_param(zfs_sync_pass_rewrite, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
"Rewrite new bps starting in this pass");
+
+module_param(zio_dva_throttle_enabled, int, 0644);
+MODULE_PARM_DESC(zio_dva_throttle_enabled,
+ "Throttle block allocations in the ZIO pipeline");
#endif