diff options
author | Serapheim Dimitropoulos <[email protected]> | 2016-12-16 14:11:29 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-06-26 10:07:42 -0700 |
commit | d2734cce68cf740e015312314415f9034c67851c (patch) | |
tree | b7a140a3cf2a19bb7c88f2d277f3b5a33c121cea /module/zfs/metaslab.c | |
parent | 88eaf610d9c7056f0946e5090cba1e6288ff2b70 (diff) |
OpenZFS 9166 - zfs storage pool checkpoint
Details about the motivation of this feature and its usage can
be found in this blogpost:
https://sdimitro.github.io/post/zpool-checkpoint/
A lightning talk of this feature can be found here:
https://www.youtube.com/watch?v=fPQA8K40jAM
Implementation details can be found in the big block comment of
spa_checkpoint.c
Side-changes that are relevant to this commit but not explained
elsewhere:
* renames members of "struct metaslab" trees to be shorter without
losing meaning
* space_map_{alloc,truncate}() accept a block size as a
parameter. The reason is that in the current state all space
maps that we allocate through the DMU use a global tunable
(space_map_blksz) which defaults to 4KB. This is ok for metaslab
space maps in terms of bandwidth since they are scattered all
over the disk. But for other space maps this default is probably
not what we want. Examples are device removal's vdev_obsolete_sm
or vdev_checkpoint_sm from this review. Both of these have a
1:1 relationship with each vdev and could benefit from a bigger
block size.
Porting notes:
* The part of dsl_scan_sync() which handles async destroys has
been moved into the new dsl_process_async_destroys() function.
* Remove "VERIFY(!(flags & FWRITE))" in "kernel.c" so zhack can write
to block device backed pools.
* ZTS:
* Fix get_txg() in zpool_sync_001_pos due to "checkpoint_txg".
* Don't use large dd block sizes on /dev/urandom under Linux in
checkpoint_capacity.
* Adopt Delphix-OS's setting of 4 (spa_asize_inflation =
SPA_DVAS_PER_BP + 1) for the checkpoint_capacity test to speed
its attempts to fill the pool
* Create the base and nested pools with sync=disabled to speed up
the "setup" phase.
* Clear labels in test pool between checkpoint tests to avoid
duplicate pool issues.
* The import_rewind_device_replaced test has been marked as "known
to fail" for the reasons listed in its DISCLAIMER.
* New module parameters:
zfs_spa_discard_memory_limit,
zfs_remove_max_bytes_pause (not documented - debugging only)
vdev_max_ms_count (formerly metaslabs_per_vdev)
vdev_min_ms_count
Authored by: Serapheim Dimitropoulos <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: John Kennedy <[email protected]>
Reviewed by: Dan Kimmel <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Richard Lowe <[email protected]>
Ported-by: Tim Chase <[email protected]>
Signed-off-by: Tim Chase <[email protected]>
OpenZFS-issue: https://illumos.org/issues/9166
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7159fdb8
Closes #7570
Diffstat (limited to 'module/zfs/metaslab.c')
-rw-r--r-- | module/zfs/metaslab.c | 419 |
1 files changed, 275 insertions, 144 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index c11e459e0..76fa99e8b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -34,6 +34,7 @@ #include <sys/spa_impl.h> #include <sys/zfeature.h> #include <sys/vdev_indirect_mapping.h> +#include <sys/zap.h> #define WITH_DF_BLOCK_ALLOCATOR @@ -54,6 +55,14 @@ unsigned long metaslab_aliquot = 512 << 10; unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* + * Since we can touch multiple metaslabs (and their respective space maps) + * with each transaction group, we benefit from having a smaller space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. + */ +int zfs_metaslab_sm_blksz = (1 << 12); + +/* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. @@ -211,7 +220,7 @@ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); -static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t); +static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); #ifdef _METASLAB_TRACING @@ -484,11 +493,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocated += - range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]); + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_tree) + allocated + - msp->ms_deferspace + range_tree_space(msp->ms_freedtree); + msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } @@ -1021,7 +1030,7 @@ metaslab_rangesize_compare(const void *x1, const void *x2) 
uint64_t metaslab_block_maxsize(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_size_tree; + avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL || (rs = avl_last(t)) == NULL) @@ -1101,7 +1110,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_tree->rt_root; + avl_tree_t *t = &msp->ms_allocatable->rt_root; return (metaslab_block_picker(t, cursor, size, align)); } @@ -1134,13 +1143,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - range_tree_t *rt = msp->ms_tree; + range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &rt->rt_root; uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); @@ -1151,7 +1161,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ if (max_size < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_size_tree; + t = &msp->ms_allocatable_by_size; *cursor = 0; } @@ -1178,8 +1188,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { - range_tree_t *rt = msp->ms_tree; - avl_tree_t *t = &msp->ms_size_tree; + range_tree_t *rt = msp->ms_allocatable; + avl_tree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; @@ -1192,7 +1202,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - 
rs->rs_start) < size) return (-1ULL); @@ -1232,7 +1242,7 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_tree->rt_root; + avl_tree_t *t = &msp->ms_allocatable->rt_root; avl_index_t where; range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); @@ -1240,7 +1250,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) uint64_t max_size = metaslab_block_maxsize(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); @@ -1250,7 +1261,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rs = avl_find(t, &rsearch, &where); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { - t = &msp->ms_size_tree; + t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; rsearch.rs_end = MIN(max_size, @@ -1316,13 +1327,15 @@ metaslab_load(metaslab_t *msp) /* * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to the - * ms_tree. + * all the space in the metaslab as free and add it to ms_allocatable. */ - if (msp->ms_sm != NULL) - error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); - else - range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); + if (msp->ms_sm != NULL) { + error = space_map_load(msp->ms_sm, msp->ms_allocatable, + SM_FREE); + } else { + range_tree_add(msp->ms_allocatable, + msp->ms_start, msp->ms_size); + } success = (error == 0); @@ -1333,9 +1346,16 @@ metaslab_load(metaslab_t *msp) ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defertree[t], - range_tree_remove, msp->ms_tree); + /* + * If the metaslab already has a spacemap, then we need to + * remove all segments from the defer tree; otherwise, the + * metaslab is completely empty and we can skip this. 
+ */ + if (msp->ms_sm != NULL) { + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defer[t], + range_tree_remove, msp->ms_allocatable); + } } msp->ms_max_size = metaslab_block_maxsize(msp); } @@ -1347,7 +1367,7 @@ void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - range_tree_vacate(msp->ms_tree, NULL, NULL); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; @@ -1393,8 +1413,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. */ - ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree, - metaslab_rangesize_compare, 0); + ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, + &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); @@ -1446,20 +1466,21 @@ metaslab_fini(metaslab_t *msp) space_map_close(msp->ms_sm); metaslab_unload(msp); - range_tree_destroy(msp->ms_tree); - range_tree_destroy(msp->ms_freeingtree); - range_tree_destroy(msp->ms_freedtree); + range_tree_destroy(msp->ms_allocatable); + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_alloctree[t]); + range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defertree[t]); + range_tree_destroy(msp->ms_defer[t]); } - ASSERT0(msp->ms_deferspace); + range_tree_destroy(msp->ms_checkpointing); + mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); @@ -1679,7 +1700,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp) int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; - segments += msp->ms_tree->rt_histogram[i]; + segments += 
msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map @@ -1895,7 +1916,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || - range_tree_space(msp->ms_tree) == 0); + range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; @@ -2028,18 +2049,37 @@ metaslab_should_condense(metaslab_t *msp) range_seg_t *rs; uint64_t size, entries, segsz, object_size, optimal_size, record_size; dmu_object_info_t doi; - uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift; + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t vdev_blocksize = 1 << vd->vdev_ashift; + uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); /* - * Use the ms_size_tree range tree, which is ordered by size, to - * obtain the largest segment in the free tree. We always condense - * metaslabs that are empty and metaslabs for which a condense - * request has been made. + * Allocations and frees in early passes are generally more space + * efficient (in terms of blocks described in space map entries) + * than the ones in later passes (e.g. we don't compress after + * sync pass 5) and condensing a metaslab multiple times in a txg + * could degrade performance. + * + * Thus we prefer condensing each metaslab at most once every txg at + * the earliest sync pass possible. If a metaslab is eligible for + * condensing again after being considered for condensing within the + * same txg, it will hopefully be dirty in the next txg where it will + * be condensed at an earlier pass. + */ + if (msp->ms_condense_checked_txg == current_txg) + return (B_FALSE); + msp->ms_condense_checked_txg = current_txg; + + /* + * Use the ms_allocatable_by_size range tree, which is ordered by + * size, to obtain the largest segment in the free tree. 
We always + * condense metaslabs that are empty and metaslabs for which a + * condense request has been made. */ - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || msp->ms_condense_wanted) return (B_TRUE); @@ -2053,7 +2093,8 @@ metaslab_should_condense(metaslab_t *msp) entries = size / (MIN(size, SM_RUN_MAX)); segsz = entries * sizeof (uint64_t); - optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + optimal_size = + sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); object_size = space_map_length(msp->ms_sm); dmu_object_info_from_db(sm->sm_dbuf, &doi); @@ -2076,7 +2117,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) space_map_t *sm = msp->ms_sm; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1); ASSERT(msp->ms_loaded); @@ -2084,7 +2124,8 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), + space_map_length(msp->ms_sm), + avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; @@ -2099,20 +2140,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - /* - * Remove what's been freed in this txg from the condense_tree. - * Since we're in sync_pass 1, we know that all the frees from - * this txg are in the freeingtree. 
- */ - range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree); + range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); + range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defertree[t], + range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], + range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } @@ -2122,13 +2159,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * metaslab's ms_condensing flag to ensure that * allocations on this metaslab do not occur while we're * in the middle of committing it to disk. This is only critical - * for the ms_tree as all other range trees use per txg + * for ms_allocatable as all other range trees use per txg * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, tx); + space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); /* * While we would ideally like to create a space map representation @@ -2144,7 +2181,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_tree, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2159,7 +2196,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; + range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; uint64_t object = space_map_object(msp->ms_sm); @@ -2168,23 +2205,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * This metaslab has just been added so there's no work to do 
now. */ - if (msp->ms_freeingtree == NULL) { + if (msp->ms_freeing == NULL) { ASSERT3P(alloctree, ==, NULL); return; } ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeingtree, !=, NULL); - ASSERT3P(msp->ms_freedtree, !=, NULL); + ASSERT3P(msp->ms_freeing, !=, NULL); + ASSERT3P(msp->ms_freed, !=, NULL); + ASSERT3P(msp->ms_checkpointing, !=, NULL); /* - * Normally, we don't want to process a metaslab if there - * are no allocations or frees to perform. However, if the metaslab - * is being forced to condense and it's loaded, we need to let it - * through. + * Normally, we don't want to process a metaslab if there are no + * allocations or frees to perform. However, if the metaslab is being + * forced to condense and it's loaded, we need to let it through. */ - if (range_tree_space(alloctree) == 0 && - range_tree_space(msp->ms_freeingtree) == 0 && + if (range_tree_is_empty(alloctree) && + range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted)) return; @@ -2193,10 +2231,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_tree. No other thread can - * be modifying this txg's alloctree, freeingtree, freedtree, or - * space_map_phys_t. We drop ms_lock whenever we could call - * into the DMU, because the DMU can call down to us + * metaslab_sync() is the metaslab's ms_allocatable. No other + * thread can be modifying this txg's alloc, freeing, + * freed, or space_map_phys_t. We drop ms_lock whenever we + * could call into the DMU, because the DMU can call down to us * (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state @@ -2204,13 +2242,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * that the ms_lock is insufficient for this, because it is dropped * by space_map_write(). 
*/ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (msp->ms_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, tx); + new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, @@ -2218,6 +2255,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT(msp->ms_sm != NULL); } + if (!range_tree_is_empty(msp->ms_checkpointing) && + vd->vdev_checkpoint_sm == NULL) { + ASSERT(spa_has_checkpoint(spa)); + + uint64_t new_object = space_map_alloc(mos, + vdev_standard_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, + mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * We save the space map object as an entry in vdev_top_zap + * so it can be retrieved when the pool is reopened after an + * export or through zdb. + */ + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (new_object), 1, &new_object, tx)); + } + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); @@ -2230,16 +2289,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && spa_sync_pass(spa) == 1 && - metaslab_should_condense(msp)) { + if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); mutex_enter(&msp->ms_lock); } + if (!range_tree_is_empty(msp->ms_checkpointing)) { + ASSERT(spa_has_checkpoint(spa)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * Since we are doing writes to disk and the ms_checkpointing + * tree won't be changing during that time, we drop the + * 
ms_lock while writing to the checkpoint space map. + */ + mutex_exit(&msp->ms_lock); + space_map_write(vd->vdev_checkpoint_sm, + msp->ms_checkpointing, SM_FREE, tx); + mutex_enter(&msp->ms_lock); + space_map_update(vd->vdev_checkpoint_sm); + + spa->spa_checkpoint_info.sci_dspace += + range_tree_space(msp->ms_checkpointing); + vd->vdev_stat.vs_checkpoint_space += + range_tree_space(msp->ms_checkpointing); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, + -vd->vdev_checkpoint_sm->sm_alloc); + + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + } + if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate @@ -2248,7 +2331,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); - space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back @@ -2257,7 +2340,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ - space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been @@ -2268,7 +2351,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, - msp->ms_defertree[t], tx); + msp->ms_defer[t], tx); } } @@ -2279,7 +2362,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * then we will lose some accuracy but will correct it the next * time we load the space map. 
*/ - space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); @@ -2287,21 +2370,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeingtree and - * freedtree. We can safely do this since the freed_tree is + * and instead will just swap the pointers for freeing and + * freed. We can safely do this since the freed_tree is * guaranteed to be empty on the initial pass. */ if (spa_sync_pass(spa) == 1) { - range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree); + range_tree_swap(&msp->ms_freeing, &msp->ms_freed); } else { - range_tree_vacate(msp->ms_freeingtree, - range_tree_add, msp->ms_freedtree); + range_tree_vacate(msp->ms_freeing, + range_tree_add, msp->ms_freed); } range_tree_vacate(alloctree, NULL, NULL); - ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeingtree)); + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) + & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); @@ -2336,29 +2421,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * If this metaslab is just becoming available, initialize its * range trees and add its capacity to the vdev. 
*/ - if (msp->ms_freedtree == NULL) { + if (msp->ms_freed == NULL) { for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_alloctree[t] == NULL); + ASSERT(msp->ms_allocating[t] == NULL); - msp->ms_alloctree[t] = range_tree_create(NULL, NULL); + msp->ms_allocating[t] = range_tree_create(NULL, NULL); } - ASSERT3P(msp->ms_freeingtree, ==, NULL); - msp->ms_freeingtree = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_freeing, ==, NULL); + msp->ms_freeing = range_tree_create(NULL, NULL); - ASSERT3P(msp->ms_freedtree, ==, NULL); - msp->ms_freedtree = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_freed, ==, NULL); + msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defertree[t] == NULL); + ASSERT(msp->ms_defer[t] == NULL); - msp->ms_defertree[t] = range_tree_create(NULL, NULL); + msp->ms_defer[t] = range_tree_create(NULL, NULL); } + ASSERT3P(msp->ms_checkpointing, ==, NULL); + msp->ms_checkpointing = range_tree_create(NULL, NULL); + vdev_space_update(vd, 0, 0, msp->ms_size); } + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); - defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; + defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); @@ -2369,7 +2459,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = space_map_alloc_delta(msp->ms_sm); if (defer_allowed) { - defer_delta = range_tree_space(msp->ms_freedtree) - + defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); @@ -2385,19 +2475,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * Move the frees from the defer_tree back to the free - * range tree (if it's loaded). 
Swap the freed_tree and the - * defer_tree -- this is safe to do because we've just emptied out - * the defer_tree. + * range tree (if it's loaded). Swap the freed_tree and + * the defer_tree -- this is safe to do because we've + * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { - range_tree_swap(&msp->ms_freedtree, defer_tree); + range_tree_swap(&msp->ms_freed, defer_tree); } else { - range_tree_vacate(msp->ms_freedtree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + range_tree_vacate(msp->ms_freed, + msp->ms_loaded ? range_tree_add : NULL, + msp->ms_allocatable); } - space_map_update(msp->ms_sm); msp->ms_deferspace += defer_delta; @@ -2426,16 +2516,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( - msp->ms_alloctree[(txg + t) & TXG_MASK])); + msp->ms_allocating[(txg + t) & TXG_MASK])); } if (!metaslab_debug_unload) metaslab_unload(msp); } - ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeingtree)); - ASSERT0(range_tree_space(msp->ms_freedtree)); + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); } @@ -2666,7 +2757,7 @@ static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; - range_tree_t *rt = msp->ms_tree; + range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); @@ -2681,10 +2772,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); - if 
(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); /* Track the last successful allocation */ msp->ms_alloc_txg = txg; @@ -3183,12 +3274,11 @@ next: void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, - uint64_t txg) + boolean_t checkpoint) { metaslab_t *msp; - ASSERTV(spa_t *spa = vd->vdev_spa); + spa_t *spa = vd->vdev_spa; - ASSERT3U(txg, ==, spa->spa_syncing_txg); ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); @@ -3202,11 +3292,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); + mutex_enter(&msp->ms_lock); - if (range_tree_space(msp->ms_freeingtree) == 0) { - vdev_dirty(vd, VDD_METASLAB, msp, txg); + if (range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing)) { + vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); + } + + if (checkpoint) { + ASSERT(spa_has_checkpoint(spa)); + range_tree_add(msp->ms_checkpointing, offset, asize); + } else { + range_tree_add(msp->ms_freeing, offset, asize); } - range_tree_add(msp->ms_freeingtree, offset, asize); mutex_exit(&msp->ms_lock); } @@ -3215,23 +3313,25 @@ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { - uint64_t *txgp = arg; + boolean_t *checkpoint = arg; + + ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) - vdev_indirect_mark_obsolete(vd, offset, size, *txgp); + vdev_indirect_mark_obsolete(vd, offset, size); else - metaslab_free_impl(vd, offset, size, *txgp); + metaslab_free_impl(vd, offset, size, *checkpoint); } 
static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) + boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - if (txg > spa_freeze_txg(spa)) + if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && @@ -3243,13 +3343,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ - free_from_removing_vdev(vd, offset, size, txg); + free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { - vdev_indirect_mark_obsolete(vd, offset, size, txg); + vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &txg); + metaslab_free_impl_cb, &checkpoint); } else { - metaslab_free_concrete(vd, offset, size, txg); + metaslab_free_concrete(vd, offset, size, checkpoint); } } @@ -3426,26 +3526,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], + range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, + VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_tree, offset, size); + range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* - * Free the block represented by DVA in the context of the specified - * transaction group. + * Free the block represented by the given DVA. 
*/ void -metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); @@ -3459,7 +3558,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); } - metaslab_free_impl(vd, offset, size, txg); + metaslab_free_impl(vd, offset, size, checkpoint); } /* @@ -3529,7 +3628,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) + if (error == 0 && + !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ @@ -3540,13 +3640,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); - range_tree_remove(msp->ms_tree, offset, size); + VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, + msp->ms_size); + range_tree_remove(msp->ms_allocatable, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], + offset, size); } mutex_exit(&msp->ms_lock); @@ -3691,13 +3793,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); + /* + * If we have a checkpoint for the pool we need to 
make sure that + * the blocks that we free that are part of the checkpoint won't be + * reused until the checkpoint is discarded or we revert to it. + * + * The checkpoint flag is passed down the metaslab_free code path + * and is set whenever we want to add a block to the checkpoint's + * accounting. That is, we "checkpoint" blocks that existed at the + * time the checkpoint was created and are therefore referenced by + * the checkpointed uberblock. + * + * Note that, we don't checkpoint any blocks if the current + * syncing txg <= spa_checkpoint_txg. We want these frees to sync + * normally as they will be referenced by the checkpointed uberblock. + */ + boolean_t checkpoint = B_FALSE; + if (bp->blk_birth <= spa->spa_checkpoint_txg && + spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { + /* + * At this point, if the block is part of the checkpoint + * there is no way it was created in the current txg. + */ + ASSERT(!now); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + checkpoint = B_TRUE; + } + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { - metaslab_free_dva(spa, &dva[d], txg); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + metaslab_free_dva(spa, &dva[d], checkpoint); } } @@ -3818,12 +3948,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) mutex_enter(&msp->ms_lock); if (msp->ms_loaded) - range_tree_verify(msp->ms_tree, offset, size); + range_tree_verify(msp->ms_allocatable, offset, size); - range_tree_verify(msp->ms_freeingtree, offset, size); - range_tree_verify(msp->ms_freedtree, offset, size); + range_tree_verify(msp->ms_freeing, offset, size); + range_tree_verify(msp->ms_checkpointing, offset, size); + range_tree_verify(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defertree[j], offset, size); + range_tree_verify(msp->ms_defer[j], offset, size); mutex_exit(&msp->ms_lock); } |