diff options
author | Serapheim Dimitropoulos <[email protected]> | 2021-03-19 22:36:02 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2021-03-19 22:36:02 -0700 |
commit | 793c958f6f168ebbf71db9b1ae342d8b7d73ce5a (patch) | |
tree | 860980ab3fab797be341b2b6e4276f486206bda3 /module | |
parent | ffd6978ef59cfe2773e984bf03de2f0b93b03f5c (diff) |
Initialize metaslab range trees in metaslab_init
= Motivation
We've noticed several zloop crashes within Delphix generated
due to the following sequence of events:
- A device gets expanded and new metaslabas are allocated for
it. These metaslabs go through `metaslab_init()` but haven't
gone through `metaslab_sync_done()` yet. This meas that the
only range tree that's actually set is the `ms_allocatable`.
All the others are NULL.
- A vdev_initialization is issues and `vdev_initialize_thread`
starts processing one of these new metaslabs of the expanded
vdev.
- As part of `vdev_initialize_calculate_progress()` we call
into `metaslab_load()` and `metaslab_load_impl()` which
in turn tries to dereference the metaslabs trees that
are still NULL and therefore we crash.
The same failure can come up from the `vdev_trim` code paths.
= This Patch
We considered the following solutions to deal with this issue:
[A] Add logic to `vdev_initialize/trim` to skip those new
metaslabs. We decided against this as it would be good
to avoid exposing this lower-level detail to higer-level
operations.
[B] Have `metaslab_load_impl()` return early for new metaslabs
and thus never touch those range_trees that are NULL at
that time. This seemed more of a work-around for the bug
and not a clear-cut solution.
[C] Refactor our logic so all metaslabs have their range_trees
created at the time of their creatin in `metaslab_init()`.
In this patch we decided to go with [C] because:
(1) It doesn't expose more metaslab details to higher level
operations such as vdev initialize and trim.
(2) The current behavior of creating the range trees lazily
in `metaslab_sync_done()` is unnecessarily complicated.
(3) Always initializing the metaslab range_trees makes other
parts of the codebase cleaner. For example, we used to
use `ms_freed` as the reference value for knowing whether
all the range_trees have been initialized. Now we no
longer need to do that check in most places (and in the
few that we do we use the `ms_new` boolean field now
which is more readable).
= Side Changes
Probably due to a mismerge we set `ms_loaded` to `B_TRUE` twice
in `metasloab_load_impl()`. In this patch we remove the extraneous
assignment.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Serapheim Dimitropoulos <[email protected]>
Closes #11737
Diffstat (limited to 'module')
-rw-r--r-- | module/zfs/metaslab.c | 149 |
1 files changed, 55 insertions, 94 deletions
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index bc4f007b6..463806c60 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2316,18 +2316,13 @@ metaslab_load_impl(metaslab_t *msp) range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); - if (msp->ms_freed != NULL) { + if (msp->ms_new) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. - * - * Note: ms_freed and all the other trees except for - * the ms_allocatable, can be NULL at this point only - * if this is a new metaslab of a vdev that just got - * expanded. */ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); @@ -2365,8 +2360,6 @@ metaslab_load_impl(metaslab_t *msp) range_tree_walk(msp->ms_unflushed_frees, range_tree_add, msp->ms_allocatable); - msp->ms_loaded = B_TRUE; - ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { @@ -2680,19 +2673,31 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } - range_seg_type_t type; uint64_t shift, start; - type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, ms, &start, &shift); - /* - * We create the ms_allocatable here, but we don't create the - * other range trees until metaslab_sync_done(). This serves - * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that - * we'd data fault on any attempt to use this metaslab before - * it's ready. - */ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); + for (int t = 0; t < TXG_SIZE; t++) { + ms->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); + } + ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); + ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + ms->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); + } + ms->ms_checkpointing = + range_tree_create(NULL, type, NULL, start, shift); + ms->ms_unflushed_allocs = + range_tree_create(NULL, type, NULL, start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &ms->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); @@ -2765,13 +2770,13 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); + /* - * If the range trees haven't been allocated, this metaslab hasn't - * been through metaslab_sync_done() for the first time yet, so its + * If this metaslab hasn't been through metaslab_sync_done() yet its * space hasn't been accounted for in its vdev and doesn't need to be * subtracted. */ - if (msp->ms_freed != NULL) { + if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); @@ -2782,27 +2787,24 @@ metaslab_fini(metaslab_t *msp) metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); - if (msp->ms_freed != NULL) { - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_destroy(msp->ms_checkpointing); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); - ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, - metaslab_unflushed_changes_memused(msp)); - spa->spa_unflushed_stats.sus_memused -= - metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_allocs); - range_tree_destroy(msp->ms_checkpointing); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_frees); - - for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); - } - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); - } + for (int t = 0; t < TXG_SIZE; t++) { + range_tree_destroy(msp->ms_allocating[t]); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); @@ -3926,17 +3928,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * This metaslab has just been added so there's no work to do now. */ - if (msp->ms_freeing == NULL) { - ASSERT3P(alloctree, ==, NULL); + if (msp->ms_new) { + ASSERT0(range_tree_space(alloctree)); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + ASSERT0(range_tree_space(msp->ms_trim)); return; } - ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeing, !=, NULL); - ASSERT3P(msp->ms_freed, !=, NULL); - ASSERT3P(msp->ms_checkpointing, !=, NULL); - ASSERT3P(msp->ms_trim, !=, NULL); - /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being @@ -4240,54 +4240,15 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); - /* - * If this metaslab is just becoming available, initialize its - * range trees and add its capacity to the vdev. - */ - if (msp->ms_freed == NULL) { - range_seg_type_t type; - uint64_t shift, start; - type = metaslab_calculate_range_tree_type(vd, msp, &start, - &shift); - - for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_allocating[t] == NULL); - - msp->ms_allocating[t] = range_tree_create(NULL, type, - NULL, start, shift); - } - - ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, type, NULL, start, - shift); - - ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, type, NULL, start, - shift); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT3P(msp->ms_defer[t], ==, NULL); - msp->ms_defer[t] = range_tree_create(NULL, type, NULL, - start, shift); - } - - ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, type, NULL, - start, shift); - - ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); - msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, - start, shift); - - metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); - mrap->mra_bt = &msp->ms_unflushed_frees_by_size; - mrap->mra_floor_shift = metaslab_by_size_min_shift; - ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, - type, mrap, start, shift); - + if (msp->ms_new) { + /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); + + /* there should be no allocations nor frees at this point */ + VERIFY0(msp->ms_allocated_this_txg); + VERIFY0(range_tree_space(msp->ms_freed)); } + ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); |