Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r-- | module/zfs/zio.c | 543
1 file changed, 444 insertions, 99 deletions
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 8a063ab7f..0147cb17c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -39,6 +39,7 @@
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
+#include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zio.h>
 
@@ -48,9 +49,15 @@
  * ==========================================================================
  */
 const char *zio_type_name[ZIO_TYPES] = {
+	/*
+	 * Note: Linux kernel thread name length is limited
+	 * so these names will differ from upstream open zfs.
+	 */
 	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
 };
 
+int zio_dva_throttle_enabled = B_TRUE;
+
 /*
  * ==========================================================================
  * I/O kmem caches
@@ -100,6 +107,8 @@ int zio_buf_debug_limit = 0;
 
 static inline void __zio_execute(zio_t *zio);
 
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
 void
 zio_init(void)
 {
@@ -368,52 +377,39 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
-/*
- * NOTE - Callers to zio_walk_parents() and zio_walk_children must
- *	continue calling these functions until they return NULL.
- *	Otherwise, the next caller will pick up the list walk in
- *	some indeterminate state.  (Otherwise every caller would
- *	have to pass in a cookie to keep the state represented by
- *	io_walk_link, which gets annoying.)
- */
 zio_t *
-zio_walk_parents(zio_t *cio)
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
-	zio_link_t *zl = cio->io_walk_link;
 	list_t *pl = &cio->io_parent_list;
 
-	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
-	cio->io_walk_link = zl;
-
-	if (zl == NULL)
+	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+	if (*zl == NULL)
 		return (NULL);
 
-	ASSERT(zl->zl_child == cio);
-	return (zl->zl_parent);
+	ASSERT((*zl)->zl_child == cio);
+	return ((*zl)->zl_parent);
 }
 
 zio_t *
-zio_walk_children(zio_t *pio)
+zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
-	zio_link_t *zl = pio->io_walk_link;
 	list_t *cl = &pio->io_child_list;
 
-	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
-	pio->io_walk_link = zl;
-
-	if (zl == NULL)
+	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+	if (*zl == NULL)
 		return (NULL);
 
-	ASSERT(zl->zl_parent == pio);
-	return (zl->zl_child);
+	ASSERT((*zl)->zl_parent == pio);
+	return ((*zl)->zl_child);
 }
 
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
-	zio_t *pio = zio_walk_parents(cio);
+	zio_link_t *zl = NULL;
+	zio_t *pio = zio_walk_parents(cio, &zl);
 
-	VERIFY(zio_walk_parents(cio) == NULL);
+	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 	return (pio);
 }
 
@@ -469,7 +465,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 
 	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
-
 	kmem_cache_free(zio_link_cache, zl);
 }
 
@@ -483,6 +478,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 	ASSERT(zio->io_stall == NULL);
 	if (*countp != 0) {
 		zio->io_stage >>= 1;
+		ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 		zio->io_stall = countp;
 		waiting = B_TRUE;
 	}
@@ -507,9 +503,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
+		zio_taskq_type_t type =
+		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
-		__zio_execute(pio);
+		/*
+		 * Dispatch the parent zio in its own taskq so that
+		 * the child can continue to make progress. This also
+		 * prevents overflowing the stack when we have deeply nested
+		 * parent-child relationships.
+		 */
+		zio_taskq_dispatch(pio, type, B_FALSE);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
@@ -522,6 +527,24 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 		zio->io_error = zio->io_child_error[c];
 }
 
+int
+zio_timestamp_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+	int cmp;
+
+	cmp = AVL_CMP(z1->io_queued_timestamp, z2->io_queued_timestamp);
+	if (likely(cmp))
+		return (cmp);
+
+	cmp = AVL_CMP(z1->io_offset, z2->io_offset);
+	if (likely(cmp))
+		return (cmp);
+
+	return (AVL_PCMP(z1, z2));
+}
+
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
@@ -594,6 +617,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@@ -797,7 +821,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
-	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 
 	return (zio);
@@ -912,6 +936,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
@@ -1031,9 +1056,31 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
+	/*
+	 * If we're creating a child I/O that is not associated with a
+	 * top-level vdev, then the child zio is not an allocating I/O.
+	 * If this is a retried I/O then we ignore it since we will
+	 * have already processed the original allocating I/O.
+	 */
+	if (flags & ZIO_FLAG_IO_ALLOCATING &&
+	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+		metaslab_class_t *mc = spa_normal_class(pio->io_spa);
+
+		ASSERT(mc->mc_alloc_throttle_enabled);
+		ASSERT(type == ZIO_TYPE_WRITE);
+		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+		    pio->io_child_type == ZIO_CHILD_GANG);
+
+		flags &= ~ZIO_FLAG_IO_ALLOCATING;
+	}
+
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
 	zio->io_physdone = pio->io_physdone;
 	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
@@ -1131,40 +1178,16 @@ zio_read_bp_init(zio_t *zio)
 static int
 zio_write_bp_init(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	zio_prop_t *zp = &zio->io_prop;
-	enum zio_compress compress = zp->zp_compress;
-	blkptr_t *bp = zio->io_bp;
-	uint64_t lsize = zio->io_lsize;
-	uint64_t psize = zio->io_size;
-	int pass = 1;
-
-	EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
-
-	/*
-	 * If our children haven't all reached the ready stage,
-	 * wait for them and then repeat this pipeline stage.
-	 */
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
-		return (ZIO_PIPELINE_STOP);
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
-	if (zio->io_children_ready != NULL) {
-		/*
-		 * Now that all our children are ready, run the callback
-		 * associated with this zio in case it wants to modify the
-		 * data to be written.
-		 */
-		ASSERT3U(zp->zp_level, >, 0);
-		zio->io_children_ready(zio);
-	}
-
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
+		blkptr_t *bp = zio->io_bp;
+		zio_prop_t *zp = &zio->io_prop;
+
 		ASSERT(bp->blk_birth != zio->io_txg);
 		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
@@ -1181,6 +1204,7 @@ zio_write_bp_init(zio_t *zio)
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
+			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
@@ -1198,10 +1222,56 @@ zio_write_bp_init(zio_t *zio)
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
+
+		/*
+		 * We were unable to handle this as an override bp, treat
+		 * it as a regular write I/O.
+		 */
 		zio->io_bp_override = NULL;
-		BP_ZERO(bp);
+		*bp = zio->io_bp_orig;
+		zio->io_pipeline = zio->io_orig_pipeline;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_compress(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_prop_t *zp = &zio->io_prop;
+	enum zio_compress compress = zp->zp_compress;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t lsize = zio->io_lsize;
+	uint64_t psize = zio->io_size;
+	int pass = 1;
+
+	EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
+
+	/*
+	 * If our children haven't all reached the ready stage,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
+		return (ZIO_PIPELINE_STOP);
+
+	if (!IO_IS_ALLOCATING(zio))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	if (zio->io_children_ready != NULL) {
+		/*
+		 * Now that all our children are ready, run the callback
+		 * associated with this zio in case it wants to modify the
+		 * data to be written.
+		 */
+		ASSERT3U(zp->zp_level, >, 0);
+		zio->io_children_ready(zio);
 	}
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
@@ -1273,6 +1343,15 @@
 			    psize, lsize, NULL);
 			}
 		}
+
+		/*
+		 * We were unable to handle this as an override bp, treat
+		 * it as a regular write I/O.
+		 */
+		zio->io_bp_override = NULL;
+		*bp = zio->io_bp_orig;
+		zio->io_pipeline = zio->io_orig_pipeline;
+
 	} else {
 		ASSERT3U(psize, !=, 0);
@@ -1328,7 +1407,6 @@ zio_write_bp_init(zio_t *zio)
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
-
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
@@ -1559,6 +1637,8 @@ __zio_execute(zio_t *zio)
 {
 	zio->io_executor = curthread;
 
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
@@ -1603,6 +1683,7 @@ __zio_execute(zio_t *zio)
 		}
 
 		zio->io_stage = stage;
+		zio->io_pipeline_trace |= zio->io_stage;
 		rv = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
@@ -1627,6 +1708,8 @@ zio_wait(zio_t *zio)
 	ASSERT(zio->io_executor == NULL);
 
 	zio->io_waiter = curthread;
+	ASSERT0(zio->io_queued_timestamp);
+	zio->io_queued_timestamp = gethrtime();
 
 	__zio_execute(zio);
 
@@ -1663,6 +1746,8 @@ zio_nowait(zio_t *zio)
 		zio_add_child(pio, zio);
 	}
 
+	ASSERT0(zio->io_queued_timestamp);
+	zio->io_queued_timestamp = gethrtime();
 	__zio_execute(zio);
 }
 
@@ -1677,6 +1762,7 @@ zio_reexecute(zio_t *pio)
 {
 	zio_t *cio, *cio_next;
 	int c, w;
+	zio_link_t *zl = NULL;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
@@ -1688,6 +1774,7 @@ zio_reexecute(zio_t *pio)
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
+	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	for (w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
@@ -1704,8 +1791,8 @@ zio_reexecute(zio_t *pio)
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
-	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
-		cio_next = zio_walk_children(pio);
+	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio, &zl);
 		mutex_enter(&pio->io_lock);
 		for (w = 0; w < ZIO_WAIT_TYPES; w++)
 			pio->io_children[cio->io_child_type][w]++;
@@ -1718,8 +1805,10 @@ zio_reexecute(zio_t *pio)
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on him.
	 */
-	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+		pio->io_queued_timestamp = gethrtime();
 		__zio_execute(pio);
+	}
 }
 
 void
@@ -2120,6 +2209,7 @@ static int
 zio_write_gang_block(zio_t *pio)
 {
 	spa_t *spa = pio->io_spa;
+	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
@@ -2133,10 +2223,44 @@ zio_write_gang_block(zio_t *pio)
 	zio_prop_t zp;
 	int g, error;
 
-	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
-	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
-	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+		flags |= METASLAB_ASYNC_ALLOC;
+		VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+
+		/*
+		 * The logical zio has already placed a reservation for
+		 * 'copies' allocation slots but gang blocks may require
+		 * additional copies. These additional copies
+		 * (i.e. gbh_copies - copies) are guaranteed to succeed
+		 * since metaslab_class_throttle_reserve() always allows
+		 * additional reservations for gang blocks.
+		 */
+		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+		    pio, flags));
+	}
+
+	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
 	if (error) {
+		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+			/*
+			 * If we failed to allocate the gang block header then
+			 * we remove any additional allocation reservations that
+			 * we placed here. The original reservation will
+			 * be removed when the logical I/O goes to the ready
+			 * stage.
+			 */
+			metaslab_class_throttle_unreserve(mc,
+			    gbh_copies - copies, pio);
+		}
+
 		pio->io_error = error;
 		return (ZIO_PIPELINE_CONTINUE);
 	}
@@ -2162,6 +2286,8 @@ zio_write_gang_block(zio_t *pio)
 	 * Create and nowait the gang children.
 	 */
 	for (g = 0; resid != 0; resid -= lsize, g++) {
+		zio_t *cio;
+
 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
@@ -2175,11 +2301,26 @@ zio_write_gang_block(zio_t *pio)
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 
-		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+		cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize,
 		    lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
 		    &gn->gn_child[g], pio->io_priority,
-		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
+		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+			/*
+			 * Gang children won't throttle but we should
+			 * account for their work, so reserve an allocation
+			 * slot for them here.
+			 */
+			VERIFY(metaslab_class_throttle_reserve(mc,
+			    zp.zp_copies, cio, flags));
+		}
+		zio_nowait(cio);
+
 	}
 
 	/*
@@ -2478,6 +2619,7 @@ zio_ddt_child_write_ready(zio_t *zio)
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	zio_t *pio;
+	zio_link_t *zl;
 
 	if (zio->io_error)
 		return;
@@ -2488,7 +2630,8 @@ zio_ddt_child_write_ready(zio_t *zio)
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
-	while ((pio = zio_walk_parents(zio)) != NULL)
+	zl = NULL;
+	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 
 	ddt_exit(ddt);
@@ -2509,7 +2652,8 @@
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
-		while (zio_walk_parents(zio) != NULL)
+		zio_link_t *zl = NULL;
+		while (zio_walk_parents(zio, &zl) != NULL)
 			ddt_phys_addref(ddp);
 	} else {
 		ddt_phys_clear(ddp);
@@ -2691,6 +2835,97 @@
  * Allocate and free blocks
  * ==========================================================================
  */
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa)
+{
+	zio_t *zio;
+
+	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+
+	zio = avl_first(&spa->spa_alloc_tree);
+	if (zio == NULL)
+		return (NULL);
+
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Try to place a reservation for this zio. If we're unable to
+	 * reserve then we throttle.
+	 */
+	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+	    zio->io_prop.zp_copies, zio, 0)) {
+		return (NULL);
+	}
+
+	avl_remove(&spa->spa_alloc_tree, zio);
+	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+	return (zio);
+}
+
+static int
+zio_dva_throttle(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_t *nio;
+
+	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+	    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+	    zio->io_child_type == ZIO_CHILD_GANG ||
+	    zio->io_flags & ZIO_FLAG_NODATA) {
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+	mutex_enter(&spa->spa_alloc_lock);
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	avl_add(&spa->spa_alloc_tree, zio);
+
+	nio = zio_io_to_allocate(zio->io_spa);
+	mutex_exit(&spa->spa_alloc_lock);
+
+	if (nio == zio)
+		return (ZIO_PIPELINE_CONTINUE);
+
+	if (nio != NULL) {
+		ASSERT3U(nio->io_queued_timestamp, <=,
+		    zio->io_queued_timestamp);
+		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+		/*
+		 * We are passing control to a new zio so make sure that
+		 * it is processed by a different thread. We do this to
+		 * avoid stack overflows that can occur when parents are
+		 * throttled and children are making progress. We allow
+		 * it to go to the head of the taskq since it's already
+		 * been waiting.
		 */
+		zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
+	}
+	return (ZIO_PIPELINE_STOP);
+}
+
+void
+zio_allocate_dispatch(spa_t *spa)
+{
+	zio_t *zio;
+
+	mutex_enter(&spa->spa_alloc_lock);
+	zio = zio_io_to_allocate(spa);
+	mutex_exit(&spa->spa_alloc_lock);
+	if (zio == NULL)
+		return;
+
+	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+	ASSERT0(zio->io_error);
+	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
+
 static int
 zio_dva_allocate(zio_t *zio)
 {
@@ -2711,19 +2946,18 @@
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
-	/*
-	 * The dump device does not support gang blocks so allocation on
-	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
-	 * the "fast" gang feature.
-	 */
-	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
-	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
-	    METASLAB_GANG_CHILD : 0;
 	flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
+	if (zio->io_flags & ZIO_FLAG_NODATA)
+		flags |= METASLAB_DONT_THROTTLE;
+	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
+		flags |= METASLAB_GANG_CHILD;
+	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
+		flags |= METASLAB_ASYNC_ALLOC;
+
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
-
-	if (error) {
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
+
+	if (error != 0) {
 		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
 		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
 		    error);
@@ -2790,21 +3024,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
-	/*
-	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
-	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
-	 * when allocating them.
-	 */
 	if (use_slog) {
 		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, NULL,
-		    METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
+		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL);
 	}
 
 	if (error) {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, NULL,
-		    METASLAB_FASTWRITE);
+		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL);
 	}
 
 	if (error == 0) {
@@ -2875,6 +3102,8 @@ zio_vdev_io_start(zio_t *zio)
 		return (ZIO_PIPELINE_STOP);
 	}
 
+	ASSERT3P(zio->io_logical, !=, zio);
+
 	/*
 	 * We keep track of time-sensitive I/Os so that the scan thread
 	 * can quickly react to certain workloads.  In particular, we care
@@ -3252,6 +3481,7 @@ zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
+	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
@@ -3269,12 +3499,26 @@ zio_ready(zio_t *zio)
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 
-	if (zio->io_error)
+	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(IO_IS_ALLOCATING(zio));
+			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			/*
+			 * We were unable to allocate anything, unreserve and
+			 * issue the next I/O to allocate.
+			 */
+			metaslab_class_throttle_unreserve(
+			    spa_normal_class(zio->io_spa),
+			    zio->io_prop.zp_copies, zio);
+			zio_allocate_dispatch(zio->io_spa);
+		}
+	}
+
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
-	pio = zio_walk_parents(zio);
+	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
@@ -3285,7 +3529,7 @@ zio_ready(zio_t *zio)
 	 * all parents must wait for us to be done before they can be done.
	 */
 	for (; pio != NULL; pio = pio_next) {
-		pio_next = zio_walk_parents(zio);
+		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
 	}
 
@@ -3305,11 +3549,76 @@ zio_ready(zio_t *zio)
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+	zio_t *lio = zio->io_logical;
+	zio_t *pio = zio_unique_parent(zio);
+	vdev_t *vd = zio->io_vd;
+	int flags = METASLAB_ASYNC_ALLOC;
+
+	ASSERT3P(zio->io_bp, !=, NULL);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+	ASSERT(vd != NULL);
+	ASSERT3P(vd, ==, vd->vdev_top);
+	ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+	/*
+	 * Parents of gang children can have two flavors -- ones that
+	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+	 * and ones that allocated the constituent blocks. The allocation
+	 * throttle needs to know the allocating parent zio so we must find
+	 * it here.
+	 */
+	if (pio->io_child_type == ZIO_CHILD_GANG) {
+		/*
+		 * If our parent is a rewrite gang child then our grandparent
+		 * would have been the one that performed the allocation.
+		 */
+		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+			pio = zio_unique_parent(pio);
+		flags |= METASLAB_GANG_CHILD;
+	}
+
+	ASSERT(IO_IS_ALLOCATING(pio));
+	ASSERT3P(zio, !=, zio->io_logical);
+	ASSERT(zio->io_logical != NULL);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+
+	mutex_enter(&pio->io_lock);
+	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+	mutex_exit(&pio->io_lock);
+
+	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
+	    1, pio);
+
+	/*
+	 * Call into the pipeline to see if there is more work that
+	 * needs to be done. If there is work to be done it will be
+	 * dispatched to another taskq thread.
+	 */
+	zio_allocate_dispatch(zio->io_spa);
+}
+
 static int
 zio_done(zio_t *zio)
 {
+	/*
+	 * Always attempt to keep stack usage minimal here since
+	 * we can be called recursively up to 19 levels deep.
+	 */
 	zio_t *pio, *pio_next;
 	int c, w;
+	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
@@ -3321,6 +3630,33 @@
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
+	/*
+	 * If the allocation throttle is enabled, then update the accounting.
+	 * We only track child I/Os that are part of an allocating async
+	 * write. We must do this since the allocation is performed
+	 * by the logical I/O but the actual write is done by child I/Os.
+	 */
+	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+	    zio->io_child_type == ZIO_CHILD_VDEV) {
+		ASSERT(spa_normal_class(
+		    zio->io_spa)->mc_alloc_throttle_enabled);
+		zio_dva_throttle_done(zio);
+	}
+
+	/*
+	 * If the allocation throttle is enabled, verify that
+	 * we have decremented the refcounts for every I/O that was throttled.
+	 */
+	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(zio->io_bp != NULL);
+		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
+		VERIFY(refcount_not_held(
+		    &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
+	}
+
 	for (c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
@@ -3506,13 +3842,15 @@ zio_done(zio_t *zio)
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
-		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
-			zio_link_t *zl = zio->io_walk_link;
-			pio_next = zio_walk_parents(zio);
+		zl = NULL;
+		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+		    pio = pio_next) {
+			zio_link_t *remove_zl = zl;
+			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
-				zio_remove_child(pio, zio, zl);
+				zio_remove_child(pio, zio, remove_zl);
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 			}
 		}
@@ -3579,10 +3917,11 @@ zio_done(zio_t *zio)
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
-	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
-		zio_link_t *zl = zio->io_walk_link;
-		pio_next = zio_walk_parents(zio);
-		zio_remove_child(pio, zio, zl);
+	zl = NULL;
+	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+		zio_link_t *remove_zl = zl;
+		pio_next = zio_walk_parents(zio, &zl);
+		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
 
@@ -3606,9 +3945,10 @@ zio_done(zio_t *zio)
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
+	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
-	zio_write_bp_init,
+	zio_write_compress,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_ddt_read_start,
@@ -3617,6 +3957,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
+	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
@@ -3778,4 +4119,8 @@ MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
 module_param(zfs_sync_pass_rewrite, int, 0644);
 MODULE_PARM_DESC(zfs_sync_pass_rewrite,
 	"Rewrite new bps starting in this pass");
+
+module_param(zio_dva_throttle_enabled, int, 0644);
+MODULE_PARM_DESC(zio_dva_throttle_enabled,
+	"Throttle block allocations in the ZIO pipeline");
 #endif
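
The zio_walk_parents()/zio_walk_children() rework above moves the iteration cursor out of the zio (the old io_walk_link field) and into a caller-owned zio_link_t **zl, which is why every call site now initializes zl = NULL and why the must-call-until-NULL rule in the deleted comment could go away. A minimal userspace sketch of the same cursor contract; link_t and walk_next() are invented names for illustration, not zio.c API:

	#include <stddef.h>
	#include <stdio.h>

	/* Toy link node standing in for zio_link_t on a parent list. */
	typedef struct link {
		struct link *next;
		int payload;
	} link_t;

	/*
	 * Like the new zio_walk_parents(): the cursor lives on the
	 * caller's stack, so an abandoned or concurrent walk can no
	 * longer corrupt state stored in the object being walked.
	 */
	static int *
	walk_next(link_t *head, link_t **cursor)
	{
		*cursor = (*cursor == NULL) ? head : (*cursor)->next;
		return (*cursor == NULL ? NULL : &(*cursor)->payload);
	}

	int
	main(void)
	{
		link_t c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
		link_t *zl = NULL;	/* caller-owned cursor */
		int *v;

		while ((v = walk_next(&a, &zl)) != NULL)
			printf("%d\n", *v);	/* prints 1 2 3 */
		return (0);
	}

Because the cursor is on the caller's stack, zio_done() can capture the current link in remove_zl and free it through zio_remove_child() while the walk continues; that was impossible with the shared per-zio cursor.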
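zio_notify_parent() now re-dispatches an unstalled parent through zio_taskq_dispatch() instead of calling __zio_execute() inline, trading recursion depth for a queue. A toy model of that conversion; dispatch(), pending[] and DEPTH are invented for illustration (the 19-level figure comes from the new comment in zio_done()):

	#include <stdio.h>

	#define	DEPTH	19	/* zio_done() can recurse up to 19 levels */

	static int pending[DEPTH];
	static int npending;

	/* Model of zio_taskq_dispatch(): defer work instead of recursing. */
	static void
	dispatch(int level)
	{
		pending[npending++] = level;
	}

	int
	main(void)
	{
		int i;

		dispatch(0);
		/* a worker loop drains the queue with O(1) stack depth */
		for (i = 0; i < npending; i++) {
			printf("completing level %d\n", pending[i]);
			if (pending[i] + 1 < DEPTH)
				dispatch(pending[i] + 1);
		}
		return (0);
	}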
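zio_timestamp_compare() keys the new spa_alloc_tree by io_queued_timestamp, so the throttle admits the oldest queued allocation first, falling back to io_offset and finally the zio addresses so two distinct zios never compare equal (the AVL tree needs a total order). A compilable model of that ordering; the AVL_CMP()/AVL_PCMP() macros are reproduced in what I believe are their sys/avl.h forms, and the kernel-only likely() hint is dropped:

	#include <stdint.h>
	#include <stdio.h>

	/* Assumed to match the three-way comparison helpers in sys/avl.h. */
	#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
	#define	AVL_PCMP(a, b)	\
		(((uintptr_t)(a) > (uintptr_t)(b)) - \
		((uintptr_t)(a) < (uintptr_t)(b)))

	typedef struct {
		uint64_t io_queued_timestamp;
		uint64_t io_offset;
	} zio_model_t;

	static int
	zio_timestamp_compare(const void *x1, const void *x2)
	{
		const zio_model_t *z1 = x1;
		const zio_model_t *z2 = x2;
		int cmp;

		cmp = AVL_CMP(z1->io_queued_timestamp,
		    z2->io_queued_timestamp);
		if (cmp)
			return (cmp);

		cmp = AVL_CMP(z1->io_offset, z2->io_offset);
		if (cmp)
			return (cmp);

		return (AVL_PCMP(z1, z2));	/* address tie-break */
	}

	int
	main(void)
	{
		zio_model_t a = { 100, 7 };
		zio_model_t b = { 200, 7 };

		/* queued earlier sorts first: prints -1 */
		printf("%d\n", zio_timestamp_compare(&a, &b));
		return (0);
	}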
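Taken together, zio_dva_throttle(), zio_io_to_allocate() and zio_allocate_dispatch() form a slot-based admission scheme: park the allocating write on spa_alloc_tree, admit the head of the queue while metaslab_class_throttle_reserve() grants slots, and on completion (zio_dva_throttle_done()) release a slot and admit the next waiter. A toy userspace model of that loop; MAX_SLOTS, reserve() and dispatch_next() are invented stand-ins, not ZFS API:

	#include <stdio.h>

	#define	MAX_SLOTS	2	/* stand-in for the class's slot limit */

	static int slots_used;
	static int queued[8];
	static int head, tail;

	/* Stand-in for metaslab_class_throttle_reserve(). */
	static int
	reserve(void)
	{
		if (slots_used >= MAX_SLOTS)
			return (0);	/* no slot: the zio stays queued */
		slots_used++;
		return (1);
	}

	/* Stand-in for zio_allocate_dispatch(): admit the oldest waiter. */
	static void
	dispatch_next(void)
	{
		if (head != tail && reserve())
			printf("admitted zio %d\n", queued[head++]);
	}

	int
	main(void)
	{
		int id;

		/* zio_dva_throttle(): enqueue, then try to admit the head */
		for (id = 0; id < 4; id++) {
			queued[tail++] = id;
			dispatch_next();
			if (head <= id)
				printf("throttled zio %d\n", id);
		}

		slots_used--;	/* one write completes: unreserve a slot */
		dispatch_next();
		return (0);
	}

With MAX_SLOTS = 2 this prints admissions for zios 0 and 1, throttles 2 and 3, then admits 2 once a slot is released, mirroring the FIFO behavior the comparator above establishes.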