Diffstat (limited to 'module/zfs/metaslab.c')
-rw-r--r-- | module/zfs/metaslab.c | 976 | +++++++++++++++++++++++++-------
1 file changed, 770 insertions(+), 206 deletions(-)
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5da929b48..0b22aa875 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -56,12 +56,21 @@ unsigned long metaslab_aliquot = 512 << 10; unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered - * around the disk. + * around the disk. So a sane default for the space map block size + * is 8~16K. */ -int zfs_metaslab_sm_blksz = (1 << 12); +int zfs_metaslab_sm_blksz_no_log = (1 << 14); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. @@ -270,6 +279,7 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); +static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; #endif @@ -540,67 +550,6 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. - */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. 
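The two block-size tunables introduced at the top of this hunk are consumed at both places in the patch that create or truncate a metaslab space map. A minimal sketch of that selection, factored into a helper purely for illustration (the helper name is hypothetical; the patch itself open-codes the ternary at each call site):

/* Hypothetical helper; the patch open-codes this ternary at its call sites. */
static int
metaslab_sm_blksz(spa_t *spa)
{
	return (spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log);
}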
- */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); -} - /* * ========================================================================== * Metaslab groups @@ -689,6 +638,25 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mutex_exit(&mg->mg_lock); } +int +metaslab_sort_by_flushed(const void *va, const void *vb) +{ + const metaslab_t *a = va; + const metaslab_t *b = vb; + + int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + if (likely(cmp)) + return (cmp); + + uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; + uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; + cmp = AVL_CMP(a_vdev_id, b_vdev_id); + if (cmp) + return (cmp); + + return (AVL_CMP(a->ms_id, b->ms_id)); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { @@ -703,7 +671,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; @@ -900,7 +868,6 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) @@ -1454,6 +1421,101 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; * ========================================================================== */ +/* + * Wait for any in-progress metaslab loads to complete. + */ +void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +/* + * Wait for any in-progress flushing to complete. + */ +void +metaslab_flush_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_flushing) + cv_wait(&msp->ms_flush_cv, &msp->ms_lock); +} + +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +static void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocating = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + /* + * Even though the smp_alloc field can get negative, + * when it comes to a metaslab's space map, that should + * never be the case. 
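The relocated metaslab_verify_space() (its body resumes with the assertions just below) encodes the feature's core accounting identity: in-core allocated space equals on-disk allocated space, plus unflushed allocations, minus unflushed frees. A hypothetical worked example, with numbers invented for illustration:

/*
 * Hypothetical figures for the identity asserted below:
 *   space_map_allocated(ms_sm)            = 10M  (on-disk view)
 *   range_tree_space(ms_unflushed_allocs) =  3M  (only in the log)
 *   range_tree_space(ms_unflushed_frees)  =  1M  (only in the log)
 *   metaslab_allocated_space(msp)         = 10M + 3M - 1M = 12M
 * The unflushed frees can never exceed what the on-disk map still
 * holds as allocated, hence the >= comparison between the two.
 */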
+ */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + ASSERT3U(space_map_allocated(msp->ms_sm), >=, + range_tree_space(msp->ms_unflushed_frees)); + + ASSERT3U(metaslab_allocated_space(msp), ==, + space_map_allocated(msp->ms_sm) + + range_tree_space(msp->ms_unflushed_allocs) - + range_tree_space(msp->ms_unflushed_frees)); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + + /* + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocating += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + static void metaslab_aux_histograms_clear(metaslab_t *msp) { @@ -1651,20 +1713,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) VERIFY3U(msp->ms_weight, ==, weight); } -/* - * Wait for any in-progress metaslab loads to complete. - */ -static void -metaslab_load_wait(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); - } -} - static int metaslab_load_impl(metaslab_t *msp) { @@ -1679,13 +1727,19 @@ metaslab_load_impl(metaslab_t *msp) * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. + * If we are using the log space maps, metaslab_sync() can't write to + * the metaslab's space map while we are loading as we only write to + * it when we are flushing the metaslab, and that can't happen while + * we are loading it. + * + * If we are not using log space maps though, metaslab_sync() can + * append to the space map while we are loading. Therefore we load + * only entries that existed when we started the load. Additionally, + * metaslab_sync_done() has to wait for the load to complete because + * there are potential races like metaslab_load() loading parts of the + * space map that are currently being appended by metaslab_sync(). If + * we didn't, the ms_allocatable would have entries that + * metaslab_sync_done() would try to re-add later. 
* * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. */ @@ -1695,6 +1749,7 @@ metaslab_load_impl(metaslab_t *msp) uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); + hrtime_t load_start = gethrtime(); if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); @@ -1706,18 +1761,37 @@ */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); + + if (msp->ms_freed != NULL) { + /* + * If the ms_sm doesn't exist, this means that this + * metaslab hasn't gone through metaslab_sync() and + * thus has never been dirtied. So we shouldn't + * expect any unflushed allocs or frees from previous + * TXGs. + * + * Note: ms_freed and all the other trees except for + * the ms_allocatable, can be NULL at this point only + * if this is a new metaslab of a vdev that just got + * expanded. + */ + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. + * changing the ms_sm (or log_sm) and the metaslab's range trees + * while we are about to use them and populate the ms_allocatable. + * The ms_lock is insufficient for this because metaslab_sync() doesn't + * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); + ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); @@ -1728,10 +1802,60 @@ msp->ms_loaded = B_TRUE; /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in + * Apply all the unflushed changes to ms_allocatable right + * away so any manipulations we do below have a clear view + * of what is allocated and what is free. + */ + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_remove, msp->ms_allocatable); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_add, msp->ms_allocatable); + + msp->ms_loaded = B_TRUE; + + ASSERT3P(msp->ms_group, !=, NULL); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + if (spa_syncing_log_sm(spa) != NULL) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LOG_SPACEMAP)); + + /* + * If we use a log space map we add all the segments + * that are in ms_unflushed_frees so they are available + * for allocation. + * + * ms_allocatable needs to contain all free segments + * that are ready for allocations (thus not segments + * from ms_freeing, ms_freed, and the ms_defer trees). + * But if we grab the lock in this code path at a sync + * pass later than 1, then it also contains the + * segments of ms_freed (they were added to it earlier + * in this path through ms_unflushed_frees). So we + * need to remove all the segments that exist in + * ms_freed from ms_allocatable as they will be added + * later in metaslab_sync_done(). + * + * When there's no log space map, the ms_allocatable + * correctly doesn't contain any segments that exist + * in ms_freed [see ms_synced_length].
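To make the two range_tree_walk() calls above concrete, consider a hypothetical load of a 2M metaslab (all numbers invented for illustration):

/*
 * ms_sm on disk records [0, 1M) as free, so space_map_load_length()
 * seeds ms_allocatable with [0, 1M).
 *   ms_unflushed_allocs = { [0, 128K) }     -> walked with
 *       range_tree_remove, leaving [128K, 1M)
 *   ms_unflushed_frees  = { [1M, 1M+128K) } -> walked with
 *       range_tree_add, yielding [128K, 1M+128K)
 * The result is exactly the view a freshly flushed space map would
 * describe.
 */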
+ */ + range_tree_walk(msp->ms_freed, + range_tree_remove, msp->ms_allocatable); + } + + /* + * If we are not using the log space map, ms_allocatable + * contains the segments that exist in the ms_defer trees + * [see ms_synced_length]. Thus we need to remove them + * from ms_allocatable as they will be added again in * metaslab_sync_done(). + * + * If we are using the log space map, ms_allocatable still + * contains the segments that exist in the ms_defer trees. + * Not because it read them through the ms_sm though. But + * because these segments are part of ms_unflushed_frees + * whose segments we add to ms_allocatable earlier in this + * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], @@ -1756,10 +1880,26 @@ metaslab_load_impl(metaslab_t *msp) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_block_maxsize(msp); - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + hrtime_t load_end = gethrtime(); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, smp_length %llu, " + "unflushed_allocs %llu, unflushed_frees %llu, " + "freed %llu, defer %llu + %llu, " + "loading_time %lld ms", + spa_syncing_txg(spa), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + space_map_length(msp->ms_sm), + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + range_tree_space(msp->ms_freed), + range_tree_space(msp->ms_defer[0]), + range_tree_space(msp->ms_defer[1]), + (longlong_t)((load_end - load_start) / 1000000)); + } + metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); - return (0); } @@ -1778,8 +1918,32 @@ metaslab_load(metaslab_t *msp) VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); + /* + * We set the loading flag BEFORE potentially dropping the lock to + * wait for an ongoing flush (see ms_flushing below). This way other + * threads know that there is already a thread that is loading this + * metaslab. + */ msp->ms_loading = B_TRUE; + + /* + * Wait for any in-progress flushing to finish as we drop the ms_lock + * both here (during space_map_load()) and in metaslab_flush() (when + * we flush our changes to the ms_sm). + */ + if (msp->ms_flushing) + metaslab_flush_wait(msp); + + /* + * In the possibility that we were waiting for the metaslab to be + * flushed (where we temporarily dropped the ms_lock), ensure that + * no one else loaded the metaslab somehow. + */ + ASSERT(!msp->ms_loaded); + int error = metaslab_load_impl(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); @@ -1806,7 +1970,7 @@ metaslab_unload(metaslab_t *msp) * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not + * available in-core, whether it is loaded or not. 
* * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation @@ -1816,7 +1980,7 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } -static void +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -1830,8 +1994,8 @@ metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, } int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, + uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; @@ -1843,6 +2007,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1905,17 +2070,6 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_allocated_space(ms), 0, 0); } - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. - */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms)); - mutex_exit(&ms->ms_lock); - } - if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); @@ -1926,11 +2080,42 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, return (0); } +static void +metaslab_fini_flush_data(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (metaslab_unflushed_txg(msp) == 0) { + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), + ==, NULL); + return; + } + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); +} + +uint64_t +metaslab_unflushed_changes_memused(metaslab_t *ms) +{ + return ((range_tree_numsegs(ms->ms_unflushed_allocs) + + range_tree_numsegs(ms->ms_unflushed_frees)) * + sizeof (range_seg_t)); +} + void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + + metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); @@ -1940,13 +2125,22 @@ metaslab_fini(metaslab_t *msp) -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); + msp->ms_sm = NULL; metaslab_unload(msp); - range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); + for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } @@ -1966,6 +2160,7 @@ 
metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -2207,9 +2402,9 @@ metaslab_weight_from_range_tree(metaslab_t *msp) } /* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). + * Calculate the weight based on the on-disk histogram. Should be applied + * only to unloaded metaslabs (i.e. no incoming allocations) in order to + * give results consistent with the on-disk state. */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) @@ -2283,7 +2478,6 @@ metaslab_segment_weight(metaslab_t *msp) } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - return (weight); } @@ -2651,18 +2845,19 @@ metaslab_group_preload(metaslab_group_t *mg) } /* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. We would like to use the following criteria to make - * our decision: + * Determine if the space map's on-disk footprint is past our tolerance for + * inefficiency. We would like to use the following criteria to make our + * decision: * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. + * 1. Do not condense if the size of the space map object would dramatically + * increase as a result of writing out the free space range tree. * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). + * 2. Condense if the on-disk space map representation is at least + * zfs_condense_pct/100 times the size of the optimal representation + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * - * 3. The on-disk size of the space map should actually decrease. + * 3. Do not condense if the on-disk size of the space map does not actually + * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. @@ -2676,27 +2871,11 @@ metaslab_should_condense(metaslab_t *msp) space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass.
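Concretely, the object_size/optimal_size comparison that metaslab_should_condense() now performs (shown in the next hunk) could play out as follows; the numbers are hypothetical, and the default value of zfs_metaslab_condense_block_threshold is assumed here to be 4:

/*
 * Assume zfs_condense_pct = 110 and record_size = 128K:
 *   object_size  = 5M   (space_map_length() of the current ms_sm)
 *   optimal_size = 1M   (space_map_estimate_optimal_size())
 * Check 1: 5M >= 1M * 110 / 100 = 1.1M  -> true
 * Check 2: 5M > 4 * 128K = 512K         -> true
 * so metaslab_should_condense() returns B_TRUE and the metaslab is
 * condensed in this sync pass.
 */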
- */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; + ASSERT(sm != NULL); + ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for @@ -2706,97 +2885,343 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_wanted) return (B_TRUE); - uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); + uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. + * The minimized form consists of a small number of allocations followed + * by the entries of the free range tree (ms_allocatable). The condensed + * spacemap contains all the entries of previous TXGs (including those in + * the pool-wide log spacemaps; thus this is effectively a superset of + * metaslab_flush()), but this TXG's entries still need to be written. */ static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + ASSERT(msp->ms_sm != NULL); + /* + * In order to condense the space map, we need to change it so it + * only describes which segments are currently allocated and free. + * + * All the current free space resides in the ms_allocatable, all + * the ms_defer trees, and all the ms_allocating trees. We ignore + * ms_freed because it is empty because we're in sync pass 1. We + * ignore ms_freeing because these changes are not yet reflected + * in the spacemap (they will be written later this txg). + * + * So to truncate the space map to represent all the entries of + * previous TXGs we do the following: + * + * 1] We create a range tree (condense tree) that is 100% allocated. + * 2] We remove from it all segments found in the ms_defer trees + * as those segments are marked as free in the original space + * map. We do the same with the ms_allocating trees for the same + * reason. Removing these segments should be a relatively + * inexpensive operation since we expect these trees to have a + * small number of nodes. + * 3] We vacate any unflushed allocs as they should already exist + * in the condense tree. Then we vacate any unflushed frees as + * they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to remove all segments + * in the ms_allocatable tree from the condense tree. This way + * we would write all the entries of the condense tree as the + * condensed space map, which would only contain allocated + * segments with everything else assumed to be freed. + * + * Doing so can be prohibitively expensive as ms_allocatable can + * be large, and therefore computationally expensive to subtract + * from the condense_tree. 
Instead we first sync out the + * condense_tree and then the ms_allocatable, in the condensed + * space map. While this is not optimal, it is typically close to + * optimal and more importantly much cheaper to compute. + * + * 5] Finally, as both of the unflushed trees were written to our + * new and condensed metaslab space map, we basically flushed + * all the unflushed changes to disk, thus we call + * metaslab_flush_update(). + */ + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), + spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. - */ condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); - for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg + * We're about to drop the metaslab's lock thus allowing other + * consumers to change its content. Set the metaslab's ms_condensing + * flag to ensure that allocations on this metaslab do not occur + * while we're in the middle of committing it to disk. This is only + * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); + uint64_t object = space_map_object(msp->ms_sm); + space_map_truncate(sm, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? + zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); + + /* + * space_map_truncate() may have reallocated the spacemap object. + * If so, update the vdev_ms_array.
+ */ + if (space_map_object(msp->ms_sm) != object) { + object = space_map_object(msp->ms_sm); + dmu_write(spa->spa_meta_objset, + msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. + * Note: + * When the log space map feature is enabled, each space map will + * always have ALLOCS followed by FREES for each sync pass. This is + * typically true even when the log space map feature is disabled, + * except in the case where a metaslab goes through metaslab_sync() + * and gets condensed. In that case the metaslab's space map will have + * ALLOCS followed by FREES (due to condensing) followed by ALLOCS + * followed by FREES (due to space_map_write() in metaslab_sync()) for + * sync pass 1. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); + msp->ms_condensing = B_FALSE; + metaslab_flush_update(msp, tx); +} + +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc.) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op.
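The ordering note above can be read off directly from the writes in this hunk: for a metaslab that condenses in sync pass 1 of a pool without the log space map feature, its space map for that TXG ends up containing, in order (an illustration of the note, not additional code from the patch):

/*
 * ALLOC entries from condense_tree      (metaslab_condense())
 * FREE entries from ms_allocatable      (metaslab_condense())
 * ALLOC entries from the alloctree      (metaslab_sync())
 * FREE entries from ms_freeing          (metaslab_sync())
 */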
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); + + /* update metaslab's position in our flushing tree */ + uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + /* update metaslab counts of spa_log_sm_t nodes */ + spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_sm_increment_current_mscount(spa); + + /* cleanup obsolete logs if any */ + uint64_t log_blocks_before = spa_log_sm_nblocks(spa); + spa_cleanup_old_sm_logs(spa, tx); + uint64_t log_blocks_after = spa_log_sm_nblocks(spa); + VERIFY3U(log_blocks_after, <=, log_blocks_before); + + /* update log space map summary */ + uint64_t blocks_gone = log_blocks_before - log_blocks_after; + spa_log_summary_add_flushed_metaslab(spa); + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_summary_decrement_blkcount(spa, blocks_gone); +} + +boolean_t +metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); + + /* + * There is nothing wrong with flushing the same metaslab twice, as + * this codepath should work on that case. However, the current + * flushing scheme makes sure to avoid this situation as we would be + * making all these calls without having anything meaningful to write + * to disk. We assert this behavior here. + */ + ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); + + /* + * We can not flush while loading, because then we would + * not load the ms_unflushed_{allocs,frees}. + */ + if (msp->ms_loading) + return (B_FALSE); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + /* + * Metaslab condensing is effectively flushing. Therefore if the + * metaslab can be condensed we can just condense it instead of + * flushing it. + * + * Note that metaslab_condense() does call metaslab_flush_update() + * so we can just return immediately after condensing. We also + * don't need to care about setting ms_flushing or broadcasting + * ms_flush_cv, even if we temporarily drop the ms_lock in + * metaslab_condense(), as the metaslab is already loaded. + */ + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_group_t *mg = msp->ms_group; + + /* + * For all histogram operations below refer to the + * comments of metaslab_sync() where we follow a + * similar procedure. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + metaslab_condense(msp, tx); + + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + ASSERT(range_tree_is_empty(msp->ms_freed)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + + /* + * Since we recreated the histogram (and potentially + * the ms_sm too while condensing) ensure that the + * weight is updated too because we are not guaranteed + * that this metaslab is dirty and will go through + * metaslab_sync_done(). + */ + metaslab_recalculate_weight_and_sort(msp); + return (B_TRUE); + } + + msp->ms_flushing = B_TRUE; + uint64_t sm_len_before = space_map_length(msp->ms_sm); + + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + uint64_t sm_len_after = space_map_length(msp->ms_sm); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " + "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + (sm_len_after - sm_len_before)); + } + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + metaslab_flush_update(msp, tx); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + msp->ms_flushing = B_FALSE; + cv_broadcast(&msp->ms_flush_cv); + return (B_TRUE); } /* @@ -2811,7 +3236,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); @@ -2858,25 +3282,53 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - if (msp->ms_sm == NULL) { - uint64_t new_object; + /* + * Generate a log space map if one doesn't exist already. + */ + spa_generate_syncing_log_sm(spa, tx); - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); + if (msp->ms_sm == NULL) { + uint64_t new_object = space_map_alloc(mos, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 
+ zfs_metaslab_sm_blksz_with_log : + zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &new_object, tx); + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); - ASSERT(msp->ms_sm != NULL); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } + if (metaslab_unflushed_txg(msp) == 0 && + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT(spa_syncing_log_sm(spa) != NULL); + + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa); + + ASSERT(msp->ms_sm != NULL); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } + if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, @@ -2905,10 +3357,39 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); + if (spa->spa_sync_pass == 1 && msp->ms_loaded && + metaslab_should_condense(msp)) + metaslab_condense(msp, tx); + + /* + * We'll be going to disk to sync our space accounting, thus we + * drop the ms_lock during that time so allocations coming from + * open-context (ZIL) for future TXGs do not block. + */ + mutex_exit(&msp->ms_lock); + space_map_t *log_sm = spa_syncing_log_sm(spa); + if (log_sm != NULL) { + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(log_sm, alloctree, SM_ALLOC, + vd->vdev_id, tx); + space_map_write(log_sm, msp->ms_freeing, SM_FREE, + vd->vdev_id, tx); + mutex_enter(&msp->ms_lock); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_remove_xor_add(alloctree, + msp->ms_unflushed_frees, msp->ms_unflushed_allocs); + range_tree_remove_xor_add(msp->ms_freeing, + msp->ms_unflushed_allocs, msp->ms_unflushed_frees); + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(msp); } else { - mutex_exit(&msp->ms_lock); + ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, @@ -2928,7 +3409,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. + * ms_lock while writing to the checkpoint space map, for the + * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, @@ -2996,6 +3478,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * and instead will just swap the pointers for freeing and freed. 
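The two range_tree_remove_xor_add() calls in the log space map branch above fold this TXG's allocations and frees into the cumulative unflushed trees, cancelling ranges that appear on both sides. A hypothetical example (segments invented for illustration):

/*
 * Suppose ms_unflushed_frees holds [100K, 200K) and this TXG's
 * alloctree holds [120K, 150K). Folding the alloctree in removes
 * the overlap from ms_unflushed_frees (leaving [100K, 120K) and
 * [150K, 200K)) and adds nothing to ms_unflushed_allocs: an
 * unflushed free followed by an allocation of the same range is a
 * no-op as far as the on-disk space map is concerned. Only the
 * non-overlapping parts of the alloctree would be added to
 * ms_unflushed_allocs.
 */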
* We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. + * + * Keep in mind that even if we are currently using a log spacemap + * we want current frees to end up in the ms_allocatable (but not + * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); @@ -3015,11 +3501,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } + /* + * Verify that the space map object ID has been recorded in the + * vdev_ms_array. + */ + uint64_t object; + VERIFY0(dmu_read(mos, vd->vdev_ms_array, + msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); + VERIFY3U(object, ==, space_map_object(msp->ms_sm)); + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } @@ -3084,14 +3574,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - + ASSERT3P(msp->ms_defer[t], ==, NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_frees, ==, NULL); + msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); @@ -3108,21 +3602,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); + if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); + if (spa_syncing_log_sm(spa) == NULL) { + /* + * If there's a metaslab_load() in progress and we don't have + * a log space map, it means that we probably wrote to the + * metaslab's space map. If this is the case, we need to + * make sure that we wait for the load to complete so that we + * have a consistent view of the in-core side of the metaslab. + */ + metaslab_load_wait(msp); + } else { + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + } /* * When auto-trimming is enabled, free ranges which are added to @@ -3451,6 +3952,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; + ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); @@ -4864,12 +5366,23 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) offset, size); } - range_tree_verify_not_present(msp->ms_trim, offset, size); + /* + * Check all segments that currently exist in the freeing pipeline.
+ * + * It would intuitively make sense to also check the current allocating + * tree since metaslab_unalloc_dva() exists for extents that are + * allocated and freed in the same sync pass within the same txg. + * Unfortunately there are places (e.g. the ZIL) where we allocate a + * segment but then we free part of it within the same txg + * [see zil_sync()]. Thus, we don't call range_tree_verify_not_present() + * on the current allocating tree. + */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } @@ -4979,6 +5492,57 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) mutex_exit(&mg->mg_ms_disabled_lock); } +static void +metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) +{ + vdev_t *vd = ms->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + metaslab_unflushed_phys_t entry = { + .msp_unflushed_txg = metaslab_unflushed_txg(ms), + }; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object); + if (err == ENOENT) { + object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object, tx)); + } else { + VERIFY0(err); + } + + dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, + &entry, tx); +} + +void +metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = ms->ms_group->mg_vd->vdev_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ms->ms_unflushed_txg = txg; + metaslab_update_ondisk_flush_data(ms, tx); +} + +uint64_t +metaslab_unflushed_txg(metaslab_t *ms) +{ + return (ms->ms_unflushed_txg); +} + #if defined(_KERNEL) /* BEGIN CSTYLED */ module_param(metaslab_aliquot, ulong, 0644);
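On disk, each metaslab's unflushed txg therefore lives at a fixed offset (ms_id * sizeof (metaslab_unflushed_phys_t)) in a per-top-level-vdev object, keyed by VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS. A sketch of how that entry could be read back, mirroring metaslab_update_ondisk_flush_data() above; the reader function itself is hypothetical (the real import-time counterpart lives outside this file):

/* Hypothetical reader for the per-metaslab unflushed-txg entry. */
static int
metaslab_read_unflushed_txg(vdev_t *vd, uint64_t ms_id, uint64_t *txgp)
{
	objset_t *mos = spa_meta_objset(vd->vdev_spa);
	metaslab_unflushed_phys_t entry;
	uint64_t object;

	/* Same ZAP key that metaslab_update_ondisk_flush_data() uses. */
	int err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object);
	if (err != 0)
		return (err);

	err = dmu_read(mos, object, ms_id * sizeof (entry),
	    sizeof (entry), &entry, 0);
	if (err == 0)
		*txgp = entry.msp_unflushed_txg;
	return (err);
}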