Diffstat (limited to 'module')
-rw-r--r-- | module/zfs/metaslab.c    | 340
-rw-r--r-- | module/zfs/refcount.c    |  65
-rw-r--r-- | module/zfs/spa.c         |  49
-rw-r--r-- | module/zfs/spa_misc.c    |   6
-rw-r--r-- | module/zfs/vdev.c        |   9
-rw-r--r-- | module/zfs/vdev_cache.c  |   6
-rw-r--r-- | module/zfs/vdev_mirror.c |   5
-rw-r--r-- | module/zfs/vdev_queue.c  |  25
-rw-r--r-- | module/zfs/zio.c         | 543
9 files changed, 888 insertions, 160 deletions
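
The change set wires a DVA allocation throttle into the ZIO pipeline: spa_sync() gives every initialized top-level vdev in the normal class a maximum allocation queue depth derived from zfs_vdev_queue_depth_pct, and the sum of those depths becomes the class-wide slot limit (mc_alloc_max_slots) that metaslab_class_throttle_reserve() enforces. As a rough orientation before the per-file hunks, the following is a minimal, standalone sketch of that arithmetic; the value 10 for zfs_vdev_async_write_max_active is only inferred from the diff's comment that a 1000% setting yields 100 allocations per device, and the four-vdev pool is hypothetical.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only, not part of the commit. Mirrors the arithmetic in
 * spa_sync(): each initialized top-level vdev in the normal class gets
 *   mg_max_alloc_queue_depth = zfs_vdev_async_write_max_active *
 *       zfs_vdev_queue_depth_pct / 100
 * and the class-wide slot limit is the sum across those vdevs.
 */
static const int zfs_vdev_queue_depth_pct = 1000;	/* kernel default in the diff */
static const int zfs_vdev_async_write_max_active = 10;	/* assumed default, implied by
							   "100 allocations per device" */

int
main(void)
{
	int top_level_vdevs = 4;	/* hypothetical pool layout */
	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
	    zfs_vdev_queue_depth_pct / 100;
	uint64_t mc_alloc_max_slots = (uint64_t)max_queue_depth *
	    top_level_vdevs;

	printf("per-vdev queue depth: %u\n", max_queue_depth);	/* 100 */
	printf("class-wide alloc slots: %llu\n",
	    (unsigned long long)mc_alloc_max_slots);		/* 400 */
	return (0);
}
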
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9de65c86e..e54eeeae2 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -36,17 +36,8 @@ #define WITH_DF_BLOCK_ALLOCATOR -/* - * Allow allocations to switch to gang blocks quickly. We do this to - * avoid having to load lots of space_maps in a given txg. There are, - * however, some cases where we want to avoid "fast" ganging and instead - * we want to do an exhaustive search of all metaslabs on this device. - * Currently we don't allow any gang, slog, or dump device related allocations - * to "fast" gang. - */ -#define CAN_FASTGANG(flags) \ - (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ - METASLAB_GANG_AVOID))) +#define GANG_ALLOCATION(flags) \ + ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) @@ -198,6 +189,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + refcount_create_tracked(&mc->mc_alloc_slots); return (mc); } @@ -211,6 +204,8 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); + refcount_destroy(&mc->mc_alloc_slots); + mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -414,9 +409,10 @@ metaslab_compare(const void *x1, const void *x2) /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below - * the zfs_mg_noalloc_threshold. If a metaslab group transitions - * from allocatable to non-allocatable or vice versa then the metaslab - * group's class is updated to reflect the transition. + * the zfs_mg_noalloc_threshold or has a fragmentation value that is + * greater than zfs_mg_fragmentation_threshold. If a metaslab group + * transitions from allocatable to non-allocatable or vice versa then the + * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) @@ -425,22 +421,45 @@ metaslab_group_alloc_update(metaslab_group_t *mg) metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; + boolean_t was_initialized; ASSERT(vd == vd->vdev_top); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; + was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); + mutex_enter(&mc->mc_lock); + + /* + * If the metaslab group was just added then it won't + * have any space until we finish syncing out this txg. + * At that point we will consider it initialized and available + * for allocations. We also don't consider non-activated + * metaslab groups (e.g. vdevs that are in the middle of being removed) + * to be initialized, because they can't be used for allocation. + */ + mg->mg_initialized = metaslab_group_initialized(mg); + if (!was_initialized && mg->mg_initialized) { + mc->mc_groups++; + } else if (was_initialized && !mg->mg_initialized) { + ASSERT3U(mc->mc_groups, >, 0); + mc->mc_groups--; + } + if (mg->mg_initialized) + mg->mg_no_free_space = B_FALSE; + /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). 
*/ - mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && + mg->mg_allocatable = (mg->mg_activation_count > 0 && + mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); @@ -463,6 +482,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -479,6 +499,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; + mg->mg_initialized = B_FALSE; + mg->mg_no_free_space = B_TRUE; + refcount_create_tracked(&mg->mg_alloc_queue_depth); mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); @@ -501,6 +524,7 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); + refcount_destroy(&mg->mg_alloc_queue_depth); kmem_free(mg, sizeof (metaslab_group_t)); } @@ -570,6 +594,15 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_next = NULL; } +boolean_t +metaslab_group_initialized(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + vdev_stat_t *vs = &vd->vdev_stat; + + return (vs->vs_space != 0 && mg->mg_activation_count > 0); +} + uint64_t metaslab_group_get_space(metaslab_group_t *mg) { @@ -742,30 +775,97 @@ metaslab_group_fragmentation(metaslab_group_t *mg) * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group - * that can still handle allocations. + * that can still handle allocations. If the allocation throttle is enabled + * then we skip allocations to devices that have reached their maximum + * allocation queue depth unless the selected metaslab group is the only + * eligible group remaining. */ static boolean_t -metaslab_group_allocatable(metaslab_group_t *mg) +metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, + uint64_t psize) { - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* - * We use two key metrics to determine if a metaslab group is - * considered allocatable -- free space and fragmentation. If - * the free space is greater than the free space threshold and - * the fragmentation is less than the fragmentation threshold then - * consider the group allocatable. There are two case when we will - * not consider these key metrics. The first is if the group is - * associated with a slog device and the second is if all groups - * in this metaslab class have already been consider ineligible + * We can only consider skipping this metaslab group if it's + * in the normal metaslab class and there are other metaslab + * groups to select from. Otherwise, we always consider it eligible * for allocations. 
*/ - return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && - (mg->mg_fragmentation == ZFS_FRAG_INVALID || - mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || - mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); + if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + return (B_TRUE); + + /* + * If the metaslab group's mg_allocatable flag is set (see comments + * in metaslab_group_alloc_update() for more information) and + * the allocation throttle is disabled then allow allocations to this + * device. However, if the allocation throttle is enabled then + * check if we have reached our allocation limit (mg_alloc_queue_depth) + * to determine if we should allow allocations to this metaslab group. + * If all metaslab groups are no longer considered allocatable + * (mc_alloc_groups == 0) or we're trying to allocate the smallest + * gang block size then we allow allocations on this metaslab group + * regardless of the mg_allocatable or throttle settings. + */ + if (mg->mg_allocatable) { + metaslab_group_t *mgp; + int64_t qdepth; + uint64_t qmax = mg->mg_max_alloc_queue_depth; + + if (!mc->mc_alloc_throttle_enabled) + return (B_TRUE); + + /* + * If this metaslab group does not have any free space, then + * there is no point in looking further. + */ + if (mg->mg_no_free_space) + return (B_FALSE); + + qdepth = refcount_count(&mg->mg_alloc_queue_depth); + + /* + * If this metaslab group is below its qmax or it's + * the only allocatable metasable group, then attempt + * to allocate from it. + */ + if (qdepth < qmax || mc->mc_alloc_groups == 1) + return (B_TRUE); + ASSERT3U(mc->mc_alloc_groups, >, 1); + + /* + * Since this metaslab group is at or over its qmax, we + * need to determine if there are metaslab groups after this + * one that might be able to handle this allocation. This is + * racy since we can't hold the locks for all metaslab + * groups at the same time when we make this check. + */ + for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { + qmax = mgp->mg_max_alloc_queue_depth; + + qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + + /* + * If there is another metaslab group that + * might be able to handle the allocation, then + * we return false so that we skip this group. + */ + if (qdepth < qmax && !mgp->mg_no_free_space) + return (B_FALSE); + } + + /* + * We didn't find another group to handle the allocation + * so we can't skip this metaslab group even though + * we are at or over our qmax. 
+ */ + return (B_TRUE); + + } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { + return (B_TRUE); + } + return (B_FALSE); } /* @@ -2054,8 +2154,62 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) return (0); } +/* + * ========================================================================== + * Metaslab block operations + * ========================================================================== + */ + +static void +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + metaslab_group_t *mg; + + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_add(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + metaslab_group_t *mg; + + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +{ +#ifdef ZFS_DEBUG + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int d; + + for (d = 0; d < ndvas; d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + } +#endif +} + static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, +metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) { spa_t *spa = mg->mg_vd->vdev_spa; @@ -2082,10 +2236,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, if (msp->ms_weight < asize) { spa_dbgmsg(spa, "%s: failed to meet weight " "requirement: vdev %llu, txg %llu, mg %p, " - "msp %p, psize %llu, asize %llu, " + "msp %p, asize %llu, " "weight %llu", spa_name(spa), mg->mg_vd->vdev_id, txg, - mg, msp, psize, asize, msp->ms_weight); + mg, msp, asize, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } @@ -2167,7 +2321,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, msp->ms_access_txg = txg + metaslab_unload_delay; mutex_exit(&msp->ms_lock); - return (offset); } @@ -2184,7 +2337,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, int all_zero; int zio_lock = B_FALSE; boolean_t allocatable; - uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -2262,8 +2414,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, top: all_zero = B_TRUE; do { - ASSERT(mg->mg_activation_count == 1); + uint64_t offset; + ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* @@ -2279,24 +2432,23 @@ top: /* * Determine if the selected metaslab group is eligible - * for allocations. If we're ganging or have requested - * an allocation for the smallest gang block size - * then we don't want to avoid allocating to the this - * metaslab group. If we're in this condition we should - * try to allocate from any device possible so that we - * don't inadvertently return ENOSPC and suspend the pool + * for allocations. If we're ganging then don't allow + * this metaslab group to skip allocations since that would + * inadvertently return ENOSPC and suspend the pool * even though space is still available. 
*/ - if (allocatable && CAN_FASTGANG(flags) && - psize > SPA_GANGBLOCKSIZE) - allocatable = metaslab_group_allocatable(mg); + if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { + allocatable = metaslab_group_allocatable(mg, rotor, + psize); + } if (!allocatable) goto next; + ASSERT(mg->mg_initialized); + /* - * Avoid writing single-copy data to a failing vdev - * unless the user instructs us that it is okay. + * Avoid writing single-copy data to a failing vdev. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && @@ -2316,8 +2468,31 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, psize, asize, txg, distance, - dva, d); + offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be + * out of space. We must notify the allocation + * throttle to start skipping allocation + * attempts to this metaslab group until more + * space becomes available. + * + * Note: this failure cannot be caused by the + * allocation throttle since the allocation + * throttle is only responsible for skipping + * devices and not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); + if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -2517,9 +2692,62 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) return (0); } +/* + * Reserve some allocation slots. The reservation system must be called + * before we call into the allocator. If there aren't any available slots + * then the I/O will be throttled until an I/O completes and its slots are + * freed up. The function returns true if it was successful in placing + * the reservation. + */ +boolean_t +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, + int flags) +{ + uint64_t available_slots = 0; + uint64_t reserved_slots; + boolean_t slot_reserved = B_FALSE; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + + reserved_slots = refcount_count(&mc->mc_alloc_slots); + if (reserved_slots < mc->mc_alloc_max_slots) + available_slots = mc->mc_alloc_max_slots - reserved_slots; + + if (slots <= available_slots || GANG_ALLOCATION(flags)) { + int d; + + /* + * We reserve the slots individually so that we can unreserve + * them individually when an I/O completes. 
+ */ + for (d = 0; d < slots; d++) { + reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + } + zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; + slot_reserved = B_TRUE; + } + + mutex_exit(&mc->mc_lock); + return (slot_reserved); +} + +void +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +{ + int d; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + for (d = 0; d < slots; d++) { + (void) refcount_remove(&mc->mc_alloc_slots, zio); + } + mutex_exit(&mc->mc_lock); +} + int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -2545,11 +2773,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, if (error != 0) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + metaslab_group_alloc_decrement(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); + } else { + /* + * Update the metaslab group's queue depth + * based on the newly allocated dva. + */ + metaslab_group_alloc_increment(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); } + } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 1903c5954..6f8f4db08 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -69,6 +69,13 @@ refcount_create(refcount_t *rc) } void +refcount_create_tracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_TRUE; +} + +void refcount_create_untracked(refcount_t *rc) { refcount_create(rc); @@ -251,4 +258,60 @@ refcount_transfer_ownership(refcount_t *rc, void *current_holder, ASSERT(found); mutex_exit(&rc->rc_mtx); } + +/* + * If tracking is enabled, return true if a reference exists that matches + * the "holder" tag. If tracking is disabled, then return true if a reference + * might be held. + */ +boolean_t +refcount_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (rc->rc_count > 0); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_FALSE); +} + +/* + * If tracking is enabled, return true if a reference does not exist that + * matches the "holder" tag. If tracking is disabled, always return true + * since the reference might not be held. 
+ */ +boolean_t +refcount_not_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_TRUE); +} #endif /* ZFS_DEBUG */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9c29543b9..0cf07be9b 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1363,7 +1363,6 @@ spa_unload(spa_t *spa) ddt_unload(spa); - /* * Drop and purge level 2 cache */ @@ -3813,6 +3812,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; /* * Create "The Godfather" zio to hold all async IOs @@ -3997,6 +3997,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); @@ -5312,7 +5313,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) + nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; int i, j; @@ -6466,10 +6467,14 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; + metaslab_class_t *mc; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; + uint64_t queue_depth_total; int c; VERIFY(spa_writeable(spa)); @@ -6482,6 +6487,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. @@ -6536,6 +6545,38 @@ spa_sync(spa_t *spa, uint64_t txg) } /* + * Set the top-level vdev's max queue depth. Evaluate each + * top-level's async write queue depth in case it changed. + * The max queue depth will not change in the middle of syncing + * out this txg. + */ + queue_depth_total = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != spa_normal_class(spa) || + !metaslab_group_initialized(mg)) + continue; + + /* + * It is safe to do a lock-free check here because only async + * allocations look at mg_max_alloc_queue_depth, and async + * allocations all happen from spa_sync(). + */ + ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + mg->mg_max_alloc_queue_depth = max_queue_depth; + queue_depth_total += mg->mg_max_alloc_queue_depth; + } + mc = spa_normal_class(spa); + ASSERT0(refcount_count(&mc->mc_alloc_slots)); + mc->mc_alloc_max_slots = queue_depth_total; + mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + + ASSERT3U(mc->mc_alloc_max_slots, <=, + max_queue_depth * rvd->vdev_children); + + /* * Iterate to convergence. 
*/ do { @@ -6689,6 +6730,10 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * Update usable space statistics. */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 595e594ca..6ec05214e 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -564,6 +564,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -596,6 +597,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); + avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + /* * Every pool starts with the default cachefile */ @@ -673,6 +677,7 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } + avl_destroy(&spa->spa_alloc_tree); list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); @@ -696,6 +701,7 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 104db3d15..5ff5cf3b1 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -351,6 +351,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); @@ -681,6 +682,7 @@ vdev_free(vdev_t *vd) } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -990,6 +992,7 @@ vdev_probe_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; + zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; @@ -1009,7 +1012,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); - while ((pio = zio_walk_parents(zio)) != NULL) + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); @@ -2754,7 +2758,8 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole); + !vd->vdev_cant_write && !vd->vdev_ishole && + vd->vdev_mg->mg_initialized); } boolean_t diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index d7de7c5c9..321ea4a2f 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. 
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -214,6 +214,7 @@ vdev_cache_fill(zio_t *fio) vdev_cache_t *vc = &vd->vdev_cache; vdev_cache_entry_t *ve = fio->io_private; zio_t *pio; + zio_link_t *zl; ASSERT(fio->io_size == VCBS); @@ -233,7 +234,8 @@ vdev_cache_fill(zio_t *fio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. */ - while ((pio = zio_walk_parents(fio)) != NULL) + zl = NULL; + while ((pio = zio_walk_parents(fio, &zl)) != NULL) vdev_cache_hit(vc, ve, pio); if (fio->io_error || ve->ve_missed_update) diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index d3dbdca79..780311195 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -266,9 +266,10 @@ vdev_mirror_scrub_done(zio_t *zio) if (zio->io_error == 0) { zio_t *pio; + zio_link_t *zl = NULL; mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio)) != NULL) { + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); bcopy(zio->io_data, pio->io_data, pio->io_size); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 4cffa500b..8f394eef5 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -33,6 +33,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/metaslab_impl.h> #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/kstat.h> @@ -171,6 +172,23 @@ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; +/* + * Define the queue depth percentage for each top-level. This percentage is + * used in conjunction with zfs_vdev_async_max_active to determine how many + * allocations a specific top-level vdev should handle. Once the queue depth + * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 + * then allocator will stop allocating blocks on that top-level device. + * The default kernel setting is 1000% which will yield 100 allocations per + * device. For userland testing, the default setting is 300% which equates + * to 30 allocations per device. 
+ */ +#ifdef _KERNEL +int zfs_vdev_queue_depth_pct = 1000; +#else +int zfs_vdev_queue_depth_pct = 300; +#endif + + int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -476,7 +494,8 @@ vdev_queue_agg_io_done(zio_t *aio) { if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; - while ((pio = zio_walk_parents(aio)) != NULL) { + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(aio, &zl)) != NULL) { bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); } @@ -856,4 +875,8 @@ MODULE_PARM_DESC(zfs_vdev_sync_write_max_active, module_param(zfs_vdev_sync_write_min_active, int, 0644); MODULE_PARM_DESC(zfs_vdev_sync_write_min_active, "Min active sync write I/Os per vdev"); + +module_param(zfs_vdev_queue_depth_pct, int, 0644); +MODULE_PARM_DESC(zfs_vdev_queue_depth_pct, + "Queue depth percentage for each top-level vdev"); #endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 8a063ab7f..0147cb17c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -39,6 +39,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/metaslab_impl.h> #include <sys/time.h> #include <sys/trace_zio.h> @@ -48,9 +49,15 @@ * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { + /* + * Note: Linux kernel thread name length is limited + * so these names will differ from upstream open zfs. + */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" }; +int zio_dva_throttle_enabled = B_TRUE; + /* * ========================================================================== * I/O kmem caches @@ -100,6 +107,8 @@ int zio_buf_debug_limit = 0; static inline void __zio_execute(zio_t *zio); +static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); + void zio_init(void) { @@ -368,52 +377,39 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ -/* - * NOTE - Callers to zio_walk_parents() and zio_walk_children must - * continue calling these functions until they return NULL. - * Otherwise, the next caller will pick up the list walk in - * some indeterminate state. (Otherwise every caller would - * have to pass in a cookie to keep the state represented by - * io_walk_link, which gets annoying.) - */ zio_t * -zio_walk_parents(zio_t *cio) +zio_walk_parents(zio_t *cio, zio_link_t **zl) { - zio_link_t *zl = cio->io_walk_link; list_t *pl = &cio->io_parent_list; - zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); - cio->io_walk_link = zl; - - if (zl == NULL) + *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); + if (*zl == NULL) return (NULL); - ASSERT(zl->zl_child == cio); - return (zl->zl_parent); + ASSERT((*zl)->zl_child == cio); + return ((*zl)->zl_parent); } zio_t * -zio_walk_children(zio_t *pio) +zio_walk_children(zio_t *pio, zio_link_t **zl) { - zio_link_t *zl = pio->io_walk_link; list_t *cl = &pio->io_child_list; - zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); - pio->io_walk_link = zl; - - if (zl == NULL) + *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); + if (*zl == NULL) return (NULL); - ASSERT(zl->zl_parent == pio); - return (zl->zl_child); + ASSERT((*zl)->zl_parent == pio); + return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { - zio_t *pio = zio_walk_parents(cio); + zio_link_t *zl = NULL; + zio_t *pio = zio_walk_parents(cio, &zl); - VERIFY(zio_walk_parents(cio) == NULL); + VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } @@ -469,7 +465,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); - kmem_cache_free(zio_link_cache, zl); } @@ -483,6 +478,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; } @@ -507,9 +503,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) (*countp)--; if (*countp == 0 && pio->io_stall == countp) { + zio_taskq_type_t type = + pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : + ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); - __zio_execute(pio); + /* + * Dispatch the parent zio in its own taskq so that + * the child can continue to make progress. This also + * prevents overflowing the stack when we have deeply nested + * parent-child relationships. + */ + zio_taskq_dispatch(pio, type, B_FALSE); } else { mutex_exit(&pio->io_lock); } @@ -522,6 +527,24 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) zio->io_error = zio->io_child_error[c]; } +int +zio_timestamp_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + int cmp; + + cmp = AVL_CMP(z1->io_queued_timestamp, z2->io_queued_timestamp); + if (likely(cmp)) + return (cmp); + + cmp = AVL_CMP(z1->io_offset, z2->io_offset); + if (likely(cmp)) + return (cmp); + + return (AVL_PCMP(z1, z2)); +} + /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) @@ -594,6 +617,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -797,7 +821,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, - ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); @@ -912,6 +936,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ASSERT0(zio->io_queued_timestamp); return (zio); } @@ -1031,9 +1056,31 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; + /* + * If we're creating a child I/O that is not associated with a + * top-level vdev, then the child zio is not an allocating I/O. 
+ * If this is a retried I/O then we ignore it since we will + * have already processed the original allocating I/O. + */ + if (flags & ZIO_FLAG_IO_ALLOCATING && + (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { + metaslab_class_t *mc = spa_normal_class(pio->io_spa); + + ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(type == ZIO_TYPE_WRITE); + ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); + ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || + pio->io_child_type == ZIO_CHILD_GANG); + + flags &= ~ZIO_FLAG_IO_ALLOCATING; + } + + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) @@ -1131,40 +1178,16 @@ zio_read_bp_init(zio_t *zio) static int zio_write_bp_init(zio_t *zio) { - spa_t *spa = zio->io_spa; - zio_prop_t *zp = &zio->io_prop; - enum zio_compress compress = zp->zp_compress; - blkptr_t *bp = zio->io_bp; - uint64_t lsize = zio->io_lsize; - uint64_t psize = zio->io_size; - int pass = 1; - - EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); - - /* - * If our children haven't all reached the ready stage, - * wait for them and then repeat this pipeline stage. - */ - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || - zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) - return (ZIO_PIPELINE_STOP); if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); - if (zio->io_children_ready != NULL) { - /* - * Now that all our children are ready, run the callback - * associated with this zio in case it wants to modify the - * data to be written. - */ - ASSERT3U(zp->zp_level, >, 0); - zio->io_children_ready(zio); - } - ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { + blkptr_t *bp = zio->io_bp; + zio_prop_t *zp = &zio->io_prop; + ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); @@ -1181,6 +1204,7 @@ zio_write_bp_init(zio_t *zio) */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } @@ -1198,10 +1222,56 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ zio->io_bp_override = NULL; - BP_ZERO(bp); + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_write_compress(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_prop_t *zp = &zio->io_prop; + enum zio_compress compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + uint64_t lsize = zio->io_lsize; + uint64_t psize = zio->io_size; + int pass = 1; + + EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); + + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. 
+ */ + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + + if (!IO_IS_ALLOCATING(zio)) + return (ZIO_PIPELINE_CONTINUE); + + if (zio->io_children_ready != NULL) { + /* + * Now that all our children are ready, run the callback + * associated with this zio in case it wants to modify the + * data to be written. + */ + ASSERT3U(zp->zp_level, >, 0); + zio->io_children_ready(zio); } + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + ASSERT(zio->io_bp_override == NULL); + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're @@ -1273,6 +1343,15 @@ zio_write_bp_init(zio_t *zio) psize, lsize, NULL); } } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + } else { ASSERT3U(psize, !=, 0); @@ -1328,7 +1407,6 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } - return (ZIO_PIPELINE_CONTINUE); } @@ -1559,6 +1637,8 @@ __zio_execute(zio_t *zio) { zio->io_executor = curthread; + ASSERT3U(zio->io_queued_timestamp, >, 0); + while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; @@ -1603,6 +1683,7 @@ __zio_execute(zio_t *zio) } zio->io_stage = stage; + zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) @@ -1627,6 +1708,8 @@ zio_wait(zio_t *zio) ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); @@ -1663,6 +1746,8 @@ zio_nowait(zio_t *zio) zio_add_child(pio, zio); } + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } @@ -1677,6 +1762,7 @@ zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; int c, w; + zio_link_t *zl = NULL; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); @@ -1688,6 +1774,7 @@ zio_reexecute(zio_t *pio) pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; + pio->io_pipeline_trace = 0; pio->io_error = 0; for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; @@ -1704,8 +1791,8 @@ zio_reexecute(zio_t *pio) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ - for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); mutex_enter(&pio->io_lock); for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; @@ -1718,8 +1805,10 @@ zio_reexecute(zio_t *pio) * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. 
*/ - if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { + pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); + } } void @@ -2120,6 +2209,7 @@ static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; + metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2133,10 +2223,44 @@ zio_write_gang_block(zio_t *pio) zio_prop_t zp; int g, error; - error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, - METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); + int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + flags |= METASLAB_ASYNC_ALLOC; + VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + + /* + * The logical zio has already placed a reservation for + * 'copies' allocation slots but gang blocks may require + * additional copies. These additional copies + * (i.e. gbh_copies - copies) are guaranteed to succeed + * since metaslab_class_throttle_reserve() always allows + * additional reservations for gang blocks. + */ + VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, + pio, flags)); + } + + error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); if (error) { + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + /* + * If we failed to allocate the gang block header then + * we remove any additional allocation reservations that + * we placed here. The original reservation will + * be removed when the logical I/O goes to the ready + * stage. + */ + metaslab_class_throttle_unreserve(mc, + gbh_copies - copies, pio); + } + pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } @@ -2162,6 +2286,8 @@ zio_write_gang_block(zio_t *pio) * Create and nowait the gang children. */ for (g = 0; resid != 0; resid -= lsize, g++) { + zio_t *cio; + lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); @@ -2175,11 +2301,26 @@ zio_write_gang_block(zio_t *pio) zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; - zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + /* + * Gang children won't throttle but we should + * account for their work, so reserve an allocation + * slot for them here. 
+ */ + VERIFY(metaslab_class_throttle_reserve(mc, + zp.zp_copies, cio, flags)); + } + zio_nowait(cio); + } /* @@ -2478,6 +2619,7 @@ zio_ddt_child_write_ready(zio_t *zio) ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; + zio_link_t *zl; if (zio->io_error) return; @@ -2488,7 +2630,8 @@ zio_ddt_child_write_ready(zio_t *zio) ddt_phys_fill(ddp, zio->io_bp); - while ((pio = zio_walk_parents(zio)) != NULL) + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); @@ -2509,7 +2652,8 @@ zio_ddt_child_write_done(zio_t *zio) dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { - while (zio_walk_parents(zio) != NULL) + zio_link_t *zl = NULL; + while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); @@ -2691,6 +2835,97 @@ zio_ddt_free(zio_t *zio) * Allocate and free blocks * ========================================================================== */ + +static zio_t * +zio_io_to_allocate(spa_t *spa) +{ + zio_t *zio; + + ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + + zio = avl_first(&spa->spa_alloc_tree); + if (zio == NULL) + return (NULL); + + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Try to place a reservation for this zio. If we're unable to + * reserve then we throttle. + */ + if (!metaslab_class_throttle_reserve(spa_normal_class(spa), + zio->io_prop.zp_copies, zio, 0)) { + return (NULL); + } + + avl_remove(&spa->spa_alloc_tree, zio); + ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); + + return (zio); +} + +static int +zio_dva_throttle(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *nio; + + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || + !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || + zio->io_child_type == ZIO_CHILD_GANG || + zio->io_flags & ZIO_FLAG_NODATA) { + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + ASSERT3U(zio->io_queued_timestamp, >, 0); + ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); + + mutex_enter(&spa->spa_alloc_lock); + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + avl_add(&spa->spa_alloc_tree, zio); + + nio = zio_io_to_allocate(zio->io_spa); + mutex_exit(&spa->spa_alloc_lock); + + if (nio == zio) + return (ZIO_PIPELINE_CONTINUE); + + if (nio != NULL) { + ASSERT3U(nio->io_queued_timestamp, <=, + zio->io_queued_timestamp); + ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); + /* + * We are passing control to a new zio so make sure that + * it is processed by a different thread. We do this to + * avoid stack overflows that can occur when parents are + * throttled and children are making progress. We allow + * it to go to the head of the taskq since it's already + * been waiting. + */ + zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); + } + return (ZIO_PIPELINE_STOP); +} + +void +zio_allocate_dispatch(spa_t *spa) +{ + zio_t *zio; + + mutex_enter(&spa->spa_alloc_lock); + zio = zio_io_to_allocate(spa); + mutex_exit(&spa->spa_alloc_lock); + if (zio == NULL) + return; + + ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); + ASSERT0(zio->io_error); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); +} + static int zio_dva_allocate(zio_t *zio) { @@ -2711,19 +2946,18 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - /* - * The dump device does not support gang blocks so allocation on - * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid - * the "fast" gang feature. 
- */ - flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; - flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? - METASLAB_GANG_CHILD : 0; flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; + if (zio->io_flags & ZIO_FLAG_NODATA) + flags |= METASLAB_DONT_THROTTLE; + if (zio->io_flags & ZIO_FLAG_GANG_CHILD) + flags |= METASLAB_GANG_CHILD; + if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) + flags |= METASLAB_ASYNC_ALLOC; + error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); - if (error) { + if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); @@ -2790,21 +3024,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, ASSERT(txg > spa_syncing_txg(spa)); - /* - * ZIL blocks are always contiguous (i.e. not gang blocks) so we - * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" - * when allocating them. - */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, NULL, - METASLAB_FASTWRITE | METASLAB_GANG_AVOID); + new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, - METASLAB_FASTWRITE); + new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL); } if (error == 0) { @@ -2875,6 +3102,8 @@ zio_vdev_io_start(zio_t *zio) return (ZIO_PIPELINE_STOP); } + ASSERT3P(zio->io_logical, !=, zio); + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -3252,6 +3481,7 @@ zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; + zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) @@ -3269,12 +3499,26 @@ zio_ready(zio_t *zio) if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; - if (zio->io_error) + if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + /* + * We were unable to allocate anything, unreserve and + * issue the next I/O to allocate. + */ + metaslab_class_throttle_unreserve( + spa_normal_class(zio->io_spa), + zio->io_prop.zp_copies, zio); + zio_allocate_dispatch(zio->io_spa); + } + } + mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; - pio = zio_walk_parents(zio); + pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* @@ -3285,7 +3529,7 @@ zio_ready(zio_t *zio) * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { - pio_next = zio_walk_parents(zio); + pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } @@ -3305,11 +3549,76 @@ zio_ready(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +/* + * Update the allocation throttle accounting. 
+ */ +static void +zio_dva_throttle_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + vdev_t *vd = zio->io_vd; + int flags = METASLAB_ASYNC_ALLOC; + + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT(vd != NULL); + ASSERT3P(vd, ==, vd->vdev_top); + ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); + ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); + + /* + * Parents of gang children can have two flavors -- ones that + * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) + * and ones that allocated the constituent blocks. The allocation + * throttle needs to know the allocating parent zio so we must find + * it here. + */ + if (pio->io_child_type == ZIO_CHILD_GANG) { + /* + * If our parent is a rewrite gang child then our grandparent + * would have been the one that performed the allocation. + */ + if (pio->io_flags & ZIO_FLAG_IO_REWRITE) + pio = zio_unique_parent(pio); + flags |= METASLAB_GANG_CHILD; + } + + ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT3P(zio, !=, zio->io_logical); + ASSERT(zio->io_logical != NULL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); + ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + + mutex_enter(&pio->io_lock); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + mutex_exit(&pio->io_lock); + + metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), + 1, pio); + + /* + * Call into the pipeline to see if there is more work that + * needs to be done. If there is work to be done it will be + * dispatched to another taskq thread. + */ + zio_allocate_dispatch(zio->io_spa); +} + static int zio_done(zio_t *zio) { + /* + * Always attempt to keep stack usage minimal here since + * we can be called recurisvely up to 19 levels deep. + */ zio_t *pio, *pio_next; int c, w; + zio_link_t *zl = NULL; /* * If our children haven't all completed, @@ -3321,6 +3630,33 @@ zio_done(zio_t *zio) zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); + /* + * If the allocation throttle is enabled, then update the accounting. + * We only track child I/Os that are part of an allocating async + * write. We must do this since the allocation is performed + * by the logical I/O but the actual write is done by child I/Os. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + zio->io_child_type == ZIO_CHILD_VDEV) { + ASSERT(spa_normal_class( + zio->io_spa)->mc_alloc_throttle_enabled); + zio_dva_throttle_done(zio); + } + + /* + * If the allocation throttle is enabled, verify that + * we have decremented the refcounts for every I/O that was throttled. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(zio->io_bp != NULL); + metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio); + VERIFY(refcount_not_held( + &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio)); + } + + for (c = 0; c < ZIO_CHILD_TYPES; c++) for (w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); @@ -3506,13 +3842,15 @@ zio_done(zio_t *zio) * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. 
*/ - for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { - zio_link_t *zl = zio->io_walk_link; - pio_next = zio_walk_parents(zio); + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; + pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { - zio_remove_child(pio, zio, zl); + zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } @@ -3579,10 +3917,11 @@ zio_done(zio_t *zio) zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); - for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { - zio_link_t *zl = zio->io_walk_link; - pio_next = zio_walk_parents(zio); - zio_remove_child(pio, zio, zl); + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } @@ -3606,9 +3945,10 @@ zio_done(zio_t *zio) static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, + zio_write_bp_init, zio_free_bp_init, zio_issue_async, - zio_write_bp_init, + zio_write_compress, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, @@ -3617,6 +3957,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_ddt_free, zio_gang_assemble, zio_gang_issue, + zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, @@ -3778,4 +4119,8 @@ MODULE_PARM_DESC(zfs_sync_pass_dont_compress, module_param(zfs_sync_pass_rewrite, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_rewrite, "Rewrite new bps starting in this pass"); + +module_param(zio_dva_throttle_enabled, int, 0644); +MODULE_PARM_DESC(zio_dva_throttle_enabled, + "Throttle block allocations in the ZIO pipeline"); #endif |
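
A recurring mechanical change across vdev.c, vdev_cache.c, vdev_mirror.c, vdev_queue.c and zio.c is the new signature of zio_walk_parents()/zio_walk_children(): instead of stashing the walk position in the zio's io_walk_link field (which forced every caller to run a walk to completion), the caller now owns an explicit zio_link_t * cursor initialized to NULL. The fragment below is a minimal sketch of that caller-owned cursor pattern over a plain linked list; the names (node_t, walk_next) are invented for illustration and are not the ZFS API.

#include <stddef.h>
#include <stdio.h>

/*
 * Minimal sketch of the caller-owned cursor pattern adopted by
 * zio_walk_parents()/zio_walk_children(). All names here are hypothetical.
 */
typedef struct node {
	int		n_value;
	struct node	*n_next;
} node_t;

/* Return the next element; *cursor == NULL means "start at the head". */
static node_t *
walk_next(node_t *head, node_t **cursor)
{
	*cursor = (*cursor == NULL) ? head : (*cursor)->n_next;
	return (*cursor);
}

int
main(void)
{
	node_t c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	node_t *cursor = NULL;	/* each walk owns its own state ... */
	node_t *n;

	while ((n = walk_next(&a, &cursor)) != NULL)
		printf("%d\n", n->n_value);

	/*
	 * ... so a second, independent walk can start at any time, and a
	 * walk may be abandoned midway without leaving the list in an
	 * indeterminate state for the next caller, which is the property
	 * the removed io_walk_link comment warned about.
	 */
	return (0);
}
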