Diffstat (limited to 'module/zfs/metaslab.c')
-rw-r--r--  module/zfs/metaslab.c  976
1 file changed, 770 insertions(+), 206 deletions(-)
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 5da929b48..0b22aa875 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -56,12 +56,21 @@ unsigned long metaslab_aliquot = 512 << 10;
unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
/*
- * Since we can touch multiple metaslabs (and their respective space maps)
- * with each transaction group, we benefit from having a smaller space map
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
* block size since it allows us to issue more I/O operations scattered
- * around the disk.
+ * around the disk. So a sane default for the space map block size
+ * is 8-16K.
*/
-int zfs_metaslab_sm_blksz = (1 << 12);
+int zfs_metaslab_sm_blksz_no_log = (1 << 14);
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+int zfs_metaslab_sm_blksz_with_log = (1 << 17);
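As a rough standalone illustration of how these two tunables are intended to be used: the helper below is hypothetical (in the patch itself the choice is made inline with a ternary on SPA_FEATURE_LOG_SPACEMAP, see metaslab_condense() and metaslab_sync() further down).

#include <stdio.h>

static int zfs_metaslab_sm_blksz_no_log = (1 << 14);   /* 16K */
static int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* 128K */

/* hypothetical helper: pick the space map block size from the feature state */
static int
metaslab_sm_blksz(int log_spacemap_enabled)
{
	return (log_spacemap_enabled ?
	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log);
}

int
main(void)
{
	printf("no log: %d bytes, with log: %d bytes\n",
	    metaslab_sm_blksz(0), metaslab_sm_blksz(1));
	return (0);
}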
/*
* The in-core space map representation is more compact than its on-disk form.
@@ -270,6 +279,7 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
+static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
#endif
@@ -540,67 +550,6 @@ metaslab_compare(const void *x1, const void *x2)
return (AVL_CMP(m1->ms_start, m2->ms_start));
}
-uint64_t
-metaslab_allocated_space(metaslab_t *msp)
-{
- return (msp->ms_allocated_space);
-}
-
-/*
- * Verify that the space accounting on disk matches the in-core range_trees.
- */
-static void
-metaslab_verify_space(metaslab_t *msp, uint64_t txg)
-{
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t allocating = 0;
- uint64_t sm_free_space, msp_free_space;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(!msp->ms_condensing);
-
- if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
- return;
-
- /*
- * We can only verify the metaslab space when we're called
- * from syncing context with a loaded metaslab that has an
- * allocated space map. Calling this in non-syncing context
- * does not provide a consistent view of the metaslab since
- * we're performing allocations in the future.
- */
- if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
- !msp->ms_loaded)
- return;
-
- /*
- * Even though the smp_alloc field can get negative (e.g.
- * see vdev_checkpoint_sm), that should never be the case
- * when it come's to a metaslab's space map.
- */
- ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
-
- sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
-
- /*
- * Account for future allocations since we would have
- * already deducted that space from the ms_allocatable.
- */
- for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
- allocating +=
- range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
- }
-
- ASSERT3U(msp->ms_deferspace, ==,
- range_tree_space(msp->ms_defer[0]) +
- range_tree_space(msp->ms_defer[1]));
-
- msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
- msp->ms_deferspace + range_tree_space(msp->ms_freed);
-
- VERIFY3U(sm_free_space, ==, msp_free_space);
-}
-
/*
* ==========================================================================
* Metaslab groups
@@ -689,6 +638,25 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mutex_exit(&mg->mg_lock);
}
+int
+metaslab_sort_by_flushed(const void *va, const void *vb)
+{
+ const metaslab_t *a = va;
+ const metaslab_t *b = vb;
+
+ int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
+ if (likely(cmp))
+ return (cmp);
+
+ uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
+ uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
+ cmp = AVL_CMP(a_vdev_id, b_vdev_id);
+ if (cmp)
+ return (cmp);
+
+ return (AVL_CMP(a->ms_id, b->ms_id));
+}
+
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
@@ -703,7 +671,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
- sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+ sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
mg->mg_vd = vd;
mg->mg_class = mc;
mg->mg_activation_count = 0;
@@ -900,7 +868,6 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
- ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@@ -1454,6 +1421,101 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
* ==========================================================================
*/
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_loading) {
+ ASSERT(!msp->ms_loaded);
+ cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+ }
+}
+
+/*
+ * Wait for any in-progress flushing to complete.
+ */
+void
+metaslab_flush_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_flushing)
+ cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
+}
+
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
+/*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+static void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t allocating = 0;
+ uint64_t sm_free_space, msp_free_space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can only verify the metaslab space when we're called
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
+ */
+ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+ !msp->ms_loaded)
+ return;
+
+ /*
+ * Even though the smp_alloc field can get negative,
+ * when it comes to a metaslab's space map, that should
+ * never be the case.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ ASSERT3U(space_map_allocated(msp->ms_sm), >=,
+ range_tree_space(msp->ms_unflushed_frees));
+
+ ASSERT3U(metaslab_allocated_space(msp), ==,
+ space_map_allocated(msp->ms_sm) +
+ range_tree_space(msp->ms_unflushed_allocs) -
+ range_tree_space(msp->ms_unflushed_frees));
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
+
+ /*
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
+ */
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ allocating +=
+ range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
+ }
+
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
+ msp->ms_deferspace + range_tree_space(msp->ms_freed);
+
+ VERIFY3U(sm_free_space, ==, msp_free_space);
+}
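A worked example of the accounting that the two new assertions and the final VERIFY3U() above check, using made-up byte counts and no ZFS types (standalone sketch only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ms_size = 1ULL << 30;           /* 1 GiB metaslab */
	uint64_t sm_allocated = 400ULL << 20;    /* on-disk space map: 400 MiB allocated */
	uint64_t unflushed_allocs = 50ULL << 20; /* allocs still only in the log */
	uint64_t unflushed_frees = 20ULL << 20;  /* frees still only in the log */

	/* ms_allocated_space folds the unflushed changes into the on-disk value */
	uint64_t allocated = sm_allocated + unflushed_allocs - unflushed_frees;

	uint64_t allocatable = 560ULL << 20;     /* in-core free tree */
	uint64_t allocating = 10ULL << 20;       /* allocations of future TXGs */
	uint64_t deferspace = 20ULL << 20;       /* ms_defer[0] + ms_defer[1] */
	uint64_t freed = 4ULL << 20;             /* freed this TXG */

	/* the two views of free space must agree, as the VERIFY3U() asserts */
	assert(ms_size - allocated ==
	    allocatable + allocating + deferspace + freed);
	printf("free space: %llu bytes\n",
	    (unsigned long long)(ms_size - allocated));
	return (0);
}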
+
static void
metaslab_aux_histograms_clear(metaslab_t *msp)
{
@@ -1651,20 +1713,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
VERIFY3U(msp->ms_weight, ==, weight);
}
-/*
- * Wait for any in-progress metaslab loads to complete.
- */
-static void
-metaslab_load_wait(metaslab_t *msp)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- while (msp->ms_loading) {
- ASSERT(!msp->ms_loaded);
- cv_wait(&msp->ms_load_cv, &msp->ms_lock);
- }
-}
-
static int
metaslab_load_impl(metaslab_t *msp)
{
@@ -1679,13 +1727,19 @@ metaslab_load_impl(metaslab_t *msp)
* are reading the space map. Therefore, metaslab_sync() and
* metaslab_sync_done() can run at the same time as we do.
*
- * metaslab_sync() can append to the space map while we are loading.
- * Therefore we load only entries that existed when we started the
- * load. Additionally, metaslab_sync_done() has to wait for the load
- * to complete because there are potential races like metaslab_load()
- * loading parts of the space map that are currently being appended
- * by metaslab_sync(). If we didn't, the ms_allocatable would have
- * entries that metaslab_sync_done() would try to re-add later.
+ * If we are using the log space maps, metaslab_sync() can't write to
+ * the metaslab's space map while we are loading as we only write to
+ * it when we are flushing the metaslab, and that can't happen while
+ * we are loading it.
+ *
+ * If we are not using log space maps though, metaslab_sync() can
+ * append to the space map while we are loading. Therefore we load
+ * only entries that existed when we started the load. Additionally,
+ * metaslab_sync_done() has to wait for the load to complete because
+ * there are potential races like metaslab_load() loading parts of the
+ * space map that are currently being appended by metaslab_sync(). If
+ * we didn't, the ms_allocatable would have entries that
+ * metaslab_sync_done() would try to re-add later.
*
* That's why before dropping the lock we remember the synced length
* of the metaslab and read up to that point of the space map,
@@ -1695,6 +1749,7 @@ metaslab_load_impl(metaslab_t *msp)
uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
+ hrtime_t load_start = gethrtime();
if (msp->ms_sm != NULL) {
error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
SM_FREE, length);
@@ -1706,18 +1761,37 @@ metaslab_load_impl(metaslab_t *msp)
*/
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
+
+ if (msp->ms_freed != NULL) {
+ /*
+ * If the ms_sm doesn't exist, this means that this
+ * metaslab hasn't gone through metaslab_sync() and
+ * thus has never been dirtied. So we shouldn't
+ * expect any unflushed allocs or frees from previous
+ * TXGs.
+ *
+ * Note: ms_freed and all the other trees except for
+ * the ms_allocatable, can be NULL at this point only
+ * if this is a new metaslab of a vdev that just got
+ * expanded.
+ */
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
}
/*
* We need to grab the ms_sync_lock to prevent metaslab_sync() from
- * changing the ms_sm and the metaslab's range trees while we are
- * about to use them and populate the ms_allocatable. The ms_lock
- * is insufficient for this because metaslab_sync() doesn't hold
- * the ms_lock while writing the ms_checkpointing tree to disk.
+ * changing the ms_sm (or log_sm) and the metaslab's range trees
+ * while we are about to use them and populate the ms_allocatable.
+ * The ms_lock is insufficient for this because metaslab_sync() doesn't
+ * hold the ms_lock while writing the ms_checkpointing tree to disk.
*/
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
+
ASSERT(!msp->ms_condensing);
+ ASSERT(!msp->ms_flushing);
if (error != 0) {
mutex_exit(&msp->ms_sync_lock);
@@ -1728,10 +1802,60 @@ metaslab_load_impl(metaslab_t *msp)
msp->ms_loaded = B_TRUE;
/*
- * The ms_allocatable contains the segments that exist in the
- * ms_defer trees [see ms_synced_length]. Thus we need to remove
- * them from ms_allocatable as they will be added again in
+ * Apply all the unflushed changes to ms_allocatable right
+ * away so any manipulations we do below have a clear view
+ * of what is allocated and what is free.
+ */
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_remove, msp->ms_allocatable);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_add, msp->ms_allocatable);
+
+ msp->ms_loaded = B_TRUE;
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ if (spa_syncing_log_sm(spa) != NULL) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LOG_SPACEMAP));
+
+ /*
+ * If we use a log space map we add all the segments
+ * that are in ms_unflushed_frees so they are available
+ * for allocation.
+ *
+ * ms_allocatable needs to contain all free segments
+ * that are ready for allocations (thus not segments
+ * from ms_freeing, ms_freed, and the ms_defer trees).
+ * But if we grab the lock in this code path at a sync
+ * pass later than 1, then it also contains the
+ * segments of ms_freed (they were added to it earlier
+ * in this path through ms_unflushed_frees). So we
+ * need to remove all the segments that exist in
+ * ms_freed from ms_allocatable as they will be added
+ * later in metaslab_sync_done().
+ *
+ * When there's no log space map, the ms_allocatable
+ * correctly doesn't contain any segments that exist
+ * in ms_freed [see ms_synced_length].
+ */
+ range_tree_walk(msp->ms_freed,
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * If we are not using the log space map, ms_allocatable
+ * contains the segments that exist in the ms_defer trees
+ * [see ms_synced_length]. Thus we need to remove them
+ * from ms_allocatable as they will be added again in
* metaslab_sync_done().
+ *
+ * If we are using the log space map, ms_allocatable still
+ * contains the segments that exist in the ms_defer trees,
+ * not because it read them through the ms_sm, but because
+ * these segments are part of ms_unflushed_frees, whose
+ * segments we added to ms_allocatable earlier in this
+ * code path.
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
@@ -1756,10 +1880,26 @@ metaslab_load_impl(metaslab_t *msp)
ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ hrtime_t load_end = gethrtime();
+ if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
+ zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, smp_length %llu, "
+ "unflushed_allocs %llu, unflushed_frees %llu, "
+ "freed %llu, defer %llu + %llu, "
+ "loading_time %lld ms",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ space_map_length(msp->ms_sm),
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ range_tree_space(msp->ms_freed),
+ range_tree_space(msp->ms_defer[0]),
+ range_tree_space(msp->ms_defer[1]),
+ (longlong_t)((load_end - load_start) / 1000000));
+ }
+
metaslab_verify_space(msp, spa_syncing_txg(spa));
mutex_exit(&msp->ms_sync_lock);
-
return (0);
}
@@ -1778,8 +1918,32 @@ metaslab_load(metaslab_t *msp)
VERIFY(!msp->ms_loading);
ASSERT(!msp->ms_condensing);
+ /*
+ * We set the loading flag BEFORE potentially dropping the lock to
+ * wait for an ongoing flush (see ms_flushing below). This way other
+ * threads know that there is already a thread that is loading this
+ * metaslab.
+ */
msp->ms_loading = B_TRUE;
+
+ /*
+ * Wait for any in-progress flushing to finish as we drop the ms_lock
+ * both here (during space_map_load()) and in metaslab_flush() (when
+ * we flush our changes to the ms_sm).
+ */
+ if (msp->ms_flushing)
+ metaslab_flush_wait(msp);
+
+ /*
+ * In the event that we were waiting for the metaslab to be
+ * flushed (where we temporarily dropped the ms_lock), ensure that
+ * no one else loaded the metaslab somehow.
+ */
+ ASSERT(!msp->ms_loaded);
+
int error = metaslab_load_impl(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
msp->ms_loading = B_FALSE;
cv_broadcast(&msp->ms_load_cv);
@@ -1806,7 +1970,7 @@ metaslab_unload(metaslab_t *msp)
* have their weights calculated from the space map histograms, while
* loaded ones have it calculated from their in-core range tree
* [see metaslab_load()]. This way, the weight reflects the information
- * available in-core, whether it is loaded or not
+ * available in-core, whether it is loaded or not.
*
* If ms_group == NULL means that we came here from metaslab_fini(),
* at which point it doesn't make sense for us to do the recalculation
@@ -1816,7 +1980,7 @@ metaslab_unload(metaslab_t *msp)
metaslab_recalculate_weight_and_sort(msp);
}
-static void
+void
metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta)
{
@@ -1830,8 +1994,8 @@ metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
}
int
-metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
- metaslab_t **msp)
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
+ uint64_t txg, metaslab_t **msp)
{
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
@@ -1843,6 +2007,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
@@ -1905,17 +2070,6 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
metaslab_allocated_space(ms), 0, 0);
}
- /*
- * If metaslab_debug_load is set and we're initializing a metaslab
- * that has an allocated space map object then load the space map
- * so that we can verify frees.
- */
- if (metaslab_debug_load && ms->ms_sm != NULL) {
- mutex_enter(&ms->ms_lock);
- VERIFY0(metaslab_load(ms));
- mutex_exit(&ms->ms_lock);
- }
-
if (txg != 0) {
vdev_dirty(vd, 0, NULL, txg);
vdev_dirty(vd, VDD_METASLAB, ms, txg);
@@ -1926,11 +2080,42 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
return (0);
}
+static void
+metaslab_fini_flush_data(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (metaslab_unflushed_txg(msp) == 0) {
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
+ ==, NULL);
+ return;
+ }
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+ spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+}
+
+uint64_t
+metaslab_unflushed_changes_memused(metaslab_t *ms)
+{
+ return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
+ range_tree_numsegs(ms->ms_unflushed_frees)) *
+ sizeof (range_seg_t));
+}
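For a sense of scale, the sketch below plugs sample segment counts into the formula above; the struct is a stand-in, since the real range_seg_t layout is not reproduced here.

#include <stdint.h>
#include <stdio.h>

/* stand-in segment node (not the real range_seg_t) */
typedef struct toy_range_seg {
	uint64_t rs_start;
	uint64_t rs_end;
	uint64_t rs_fill;
} toy_range_seg_t;

int
main(void)
{
	uint64_t nsegs_allocs = 1000, nsegs_frees = 700;
	uint64_t memused =
	    (nsegs_allocs + nsegs_frees) * sizeof (toy_range_seg_t);

	printf("unflushed changes pin roughly %llu bytes in core\n",
	    (unsigned long long)memused);
	return (0);
}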
+
void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+
+ metaslab_fini_flush_data(msp);
metaslab_group_remove(mg, msp);
@@ -1940,13 +2125,22 @@ metaslab_fini(metaslab_t *msp)
-metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
+ msp->ms_sm = NULL;
metaslab_unload(msp);
-
range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_allocs);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_frees);
+
for (int t = 0; t < TXG_SIZE; t++) {
range_tree_destroy(msp->ms_allocating[t]);
}
@@ -1966,6 +2160,7 @@ metaslab_fini(metaslab_t *msp)
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
+ cv_destroy(&msp->ms_flush_cv);
mutex_destroy(&msp->ms_lock);
mutex_destroy(&msp->ms_sync_lock);
ASSERT3U(msp->ms_allocator, ==, -1);
@@ -2207,9 +2402,9 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
}
/*
- * Calculate the weight based on the on-disk histogram. This should only
- * be called after a sync pass has completely finished since the on-disk
- * information is updated in metaslab_sync().
+ * Calculate the weight based on the on-disk histogram. Should be applied
+ * only to unloaded metaslabs (i.e. no incoming allocations) in order to
+ * give results consistent with the on-disk state.
*/
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
@@ -2283,7 +2478,6 @@ metaslab_segment_weight(metaslab_t *msp)
}
WEIGHT_SET_ACTIVE(weight, 0);
ASSERT(!WEIGHT_IS_SPACEBASED(weight));
-
return (weight);
}
@@ -2651,18 +2845,19 @@ metaslab_group_preload(metaslab_group_t *mg)
}
/*
- * Determine if the space map's on-disk footprint is past our tolerance
- * for inefficiency. We would like to use the following criteria to make
- * our decision:
+ * Determine if the space map's on-disk footprint is past our tolerance for
+ * inefficiency. We would like to use the following criteria to make our
+ * decision:
*
- * 1. The size of the space map object should not dramatically increase as a
- * result of writing out the free space range tree.
+ * 1. Do not condense if the size of the space map object would dramatically
+ * increase as a result of writing out the free space range tree.
*
- * 2. The minimal on-disk space map representation is zfs_condense_pct/100
- * times the size than the free space range tree representation
- * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
+ * 2. Condense if the on-disk space map representation is at least
+ * zfs_condense_pct/100 times the size of the optimal representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
*
- * 3. The on-disk size of the space map should actually decrease.
+ * 3. Do not condense if the on-disk size of the space map does not actually
+ * decrease.
*
* Unfortunately, we cannot compute the on-disk size of the space map in this
* context because we cannot accurately compute the effects of compression, etc.
@@ -2676,27 +2871,11 @@ metaslab_should_condense(metaslab_t *msp)
space_map_t *sm = msp->ms_sm;
vdev_t *vd = msp->ms_group->mg_vd;
uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
- uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
-
- /*
- * Allocations and frees in early passes are generally more space
- * efficient (in terms of blocks described in space map entries)
- * than the ones in later passes (e.g. we don't compress after
- * sync pass 5) and condensing a metaslab multiple times in a txg
- * could degrade performance.
- *
- * Thus we prefer condensing each metaslab at most once every txg at
- * the earliest sync pass possible. If a metaslab is eligible for
- * condensing again after being considered for condensing within the
- * same txg, it will hopefully be dirty in the next txg where it will
- * be condensed at an earlier pass.
- */
- if (msp->ms_condense_checked_txg == current_txg)
- return (B_FALSE);
- msp->ms_condense_checked_txg = current_txg;
+ ASSERT(sm != NULL);
+ ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
/*
* We always condense metaslabs that are empty and metaslabs for
@@ -2706,97 +2885,343 @@ metaslab_should_condense(metaslab_t *msp)
msp->ms_condense_wanted)
return (B_TRUE);
- uint64_t object_size = space_map_length(msp->ms_sm);
+ uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
+ uint64_t object_size = space_map_length(sm);
uint64_t optimal_size = space_map_estimate_optimal_size(sm,
msp->ms_allocatable, SM_NO_VDEVID);
- dmu_object_info_t doi;
- dmu_object_info_from_db(sm->sm_dbuf, &doi);
- uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
-
return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
object_size > zfs_metaslab_condense_block_threshold * record_size);
}
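Plugging hypothetical sizes into the predicate above makes the two conditions concrete; the tunable values below match what I believe are the defaults (zfs_condense_pct = 200, zfs_metaslab_condense_block_threshold = 4), and the space map sizes are invented.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t zfs_condense_pct = 200;
	uint64_t zfs_metaslab_condense_block_threshold = 4;

	uint64_t record_size = 1ULL << 17;   /* MAX(sm_blksz, vdev_blocksize) */
	uint64_t object_size = 3ULL << 20;   /* current on-disk space map: 3 MiB */
	uint64_t optimal_size = 1ULL << 20;  /* estimated optimal size: 1 MiB */

	int should_condense =
	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
	    object_size > zfs_metaslab_condense_block_threshold * record_size;

	/* 3 MiB >= 2 MiB and 3 MiB > 512 KiB, so this metaslab would condense */
	printf("condense? %s\n", should_condense ? "yes" : "no");
	return (0);
}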
/*
* Condense the on-disk space map representation to its minimized form.
- * The minimized form consists of a small number of allocations followed by
- * the entries of the free range tree.
+ * The minimized form consists of a small number of allocations followed
+ * by the entries of the free range tree (ms_allocatable). The condensed
+ * spacemap contains all the entries of previous TXGs (including those in
+ * the pool-wide log spacemaps; thus this is effectively a superset of
+ * metaslab_flush()), but this TXG's entries still need to be written.
*/
static void
-metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
{
range_tree_t *condense_tree;
space_map_t *sm = msp->ms_sm;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_sm != NULL);
+ /*
+ * In order to condense the space map, we need to change it so it
+ * only describes which segments are currently allocated and free.
+ *
+ * All the current free space resides in the ms_allocatable, all
+ * the ms_defer trees, and all the ms_allocating trees. We ignore
+ * ms_freed because it is empty because we're in sync pass 1. We
+ * ignore ms_freeing because these changes are not yet reflected
+ * in the spacemap (they will be written later this txg).
+ *
+ * So to truncate the space map to represent all the entries of
+ * previous TXGs we do the following:
+ *
+ * 1] We create a range tree (condense tree) that is 100% allocated.
+ * 2] We remove from it all segments found in the ms_defer trees
+ * as those segments are marked as free in the original space
+ * map. We do the same with the ms_allocating trees for the same
+ * reason. Removing these segments should be a relatively
+ * inexpensive operation since we expect these trees to have a
+ * small number of nodes.
+ * 3] We vacate any unflushed allocs as they should already exist
+ * in the condense tree. Then we vacate any unflushed frees as
+ * they should already be part of ms_allocatable.
+ * 4] At this point, we would ideally like to remove all segments
+ * in the ms_allocatable tree from the condense tree. This way
+ * we would write all the entries of the condense tree as the
+ * condensed space map, which would only contain allocated
+ * segments with everything else assumed to be freed.
+ *
+ * Doing so can be prohibitively expensive as ms_allocatable can
+ * be large, and therefore computationally expensive to subtract
+ * from the condense_tree. Instead we first sync out the
+ * condense_tree and then the ms_allocatable, in the condensed
+ * space map. While this is not optimal, it is typically close to
+ * optimal and more importantly much cheaper to compute.
+ *
+ * 5] Finally, as both of the unflushed trees were written to our
+ * new and condensed metaslab space map, we basically flushed
+ * all the unflushed changes to disk, thus we call
+ * metaslab_flush_update().
+ */
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
- msp->ms_group->mg_vd->vdev_spa->spa_name,
- space_map_length(msp->ms_sm),
+ spa->spa_name, space_map_length(msp->ms_sm),
avl_numnodes(&msp->ms_allocatable->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
- /*
- * Create an range tree that is 100% allocated. We remove segments
- * that have been freed in this txg, any deferred frees that exist,
- * and any allocation in the future. Removing segments should be
- * a relatively inexpensive operation since we expect these trees to
- * have a small number of nodes.
- */
condense_tree = range_tree_create(NULL, NULL);
range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
- range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
- range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
-
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, condense_tree);
}
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
range_tree_remove, condense_tree);
}
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
/*
- * We're about to drop the metaslab's lock thus allowing
- * other consumers to change it's content. Set the
- * metaslab's ms_condensing flag to ensure that
- * allocations on this metaslab do not occur while we're
- * in the middle of committing it to disk. This is only critical
- * for ms_allocatable as all other range trees use per txg
+ * We're about to drop the metaslab's lock thus allowing other
+ * consumers to change its content. Set the metaslab's ms_condensing
+ * flag to ensure that allocations on this metaslab do not occur
+ * while we're in the middle of committing it to disk. This is only
+ * critical for ms_allocatable as all other range trees use per TXG
* views of their content.
*/
msp->ms_condensing = B_TRUE;
mutex_exit(&msp->ms_lock);
- space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
+ uint64_t object = space_map_object(msp->ms_sm);
+ space_map_truncate(sm,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
+
+ /*
+ * space_map_truncate() may have reallocated the spacemap object.
+ * If so, update the vdev_ms_array.
+ */
+ if (space_map_object(msp->ms_sm) != object) {
+ object = space_map_object(msp->ms_sm);
+ dmu_write(spa->spa_meta_objset,
+ msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &object, tx);
+ }
/*
- * While we would ideally like to create a space map representation
- * that consists only of allocation records, doing so can be
- * prohibitively expensive because the in-core free tree can be
- * large, and therefore computationally expensive to subtract
- * from the condense_tree. Instead we sync out two trees, a cheap
- * allocation only tree followed by the in-core free tree. While not
- * optimal, this is typically close to optimal, and much cheaper to
- * compute.
+ * Note:
+ * When the log space map feature is enabled, each space map will
+ * always have ALLOCS followed by FREES for each sync pass. This is
+ * typically true even when the log space map feature is disabled,
+ * except for the case where a metaslab goes through metaslab_sync()
+ * and gets condensed. In that case the metaslab's space map will have
+ * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
+ * followed by FREES (due to space_map_write() in metaslab_sync()) for
+ * sync pass 1.
*/
space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
-
- space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
+
msp->ms_condensing = B_FALSE;
+ metaslab_flush_update(msp, tx);
+}
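The scheme described in the comment block near the top of metaslab_condense() can be mimicked with a toy bitmap over a 16-block metaslab; this is purely illustrative and uses none of the real range tree machinery.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* 1] the condense tree starts out 100% allocated */
	uint16_t condense = 0xffff;

	/* 2] carve out what the ms_defer and ms_allocating trees hold */
	uint16_t defer = 0x00f0;       /* blocks 4-7 are deferred frees */
	uint16_t allocating = 0x0300;  /* blocks 8-9 allocated in future TXGs */
	condense &= ~(defer | allocating);

	/* in-core free tree, with unflushed changes already applied */
	uint16_t allocatable = 0x03f3;

	/*
	 * 4] Rather than subtracting allocatable from condense (expensive
	 * for real range trees), write ALLOC entries for condense followed
	 * by FREE entries for allocatable; the later FREE records override
	 * the earlier ALLOC records when the space map is replayed.
	 */
	printf("ALLOC records: 0x%04x\n", (unsigned)condense);
	printf("FREE  records: 0x%04x\n", (unsigned)allocatable);
	printf("net allocated: 0x%04x\n", (unsigned)(condense & ~allocatable));
	return (0);
}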
+
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc..) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+
+ /*
+ * Just because a metaslab got flushed doesn't mean that
+ * it will pass through metaslab_sync_done(). Thus, make sure to
+ * update ms_synced_length here in case it doesn't.
+ */
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ /*
+ * We may end up here from metaslab_condense() without the
+ * feature being active. In that case this is a no-op.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
+
+ /* update metaslab's position in our flushing tree */
+ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ /* update metaslab counts of spa_log_sm_t nodes */
+ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_sm_increment_current_mscount(spa);
+
+ /* cleanup obsolete logs if any */
+ uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
+ spa_cleanup_old_sm_logs(spa, tx);
+ uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
+ VERIFY3U(log_blocks_after, <=, log_blocks_before);
+
+ /* update log space map summary */
+ uint64_t blocks_gone = log_blocks_before - log_blocks_after;
+ spa_log_summary_add_flushed_metaslab(spa);
+ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_summary_decrement_blkcount(spa, blocks_gone);
+}
+
+boolean_t
+metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
+
+ /*
+ * There is nothing wrong with flushing the same metaslab twice, as
+ * this codepath should work in that case. However, the current
+ * flushing scheme makes sure to avoid this situation as we would be
+ * making all these calls without having anything meaningful to write
+ * to disk. We assert this behavior here.
+ */
+ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
+
+ /*
+ * We cannot flush while loading, because then we would
+ * not load the ms_unflushed_{allocs,frees}.
+ */
+ if (msp->ms_loading)
+ return (B_FALSE);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ /*
+ * Metaslab condensing is effectively flushing. Therefore if the
+ * metaslab can be condensed we can just condense it instead of
+ * flushing it.
+ *
+ * Note that metaslab_condense() does call metaslab_flush_update()
+ * so we can just return immediately after condensing. We also
+ * don't need to care about setting ms_flushing or broadcasting
+ * ms_flush_cv, even if we temporarily drop the ms_lock in
+ * metaslab_condense(), as the metaslab is already loaded.
+ */
+ if (msp->ms_loaded && metaslab_should_condense(msp)) {
+ metaslab_group_t *mg = msp->ms_group;
+
+ /*
+ * For all histogram operations below refer to the
+ * comments of metaslab_sync() where we follow a
+ * similar procedure.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ metaslab_condense(msp, tx);
+
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
+ ASSERT(range_tree_is_empty(msp->ms_freed));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defer[t], tx);
+ }
+ metaslab_aux_histograms_update(msp);
+
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+
+ /*
+ * Since we recreated the histogram (and potentially
+ * the ms_sm too while condensing) ensure that the
+ * weight is updated too because we are not guaranteed
+ * that this metaslab is dirty and will go through
+ * metaslab_sync_done().
+ */
+ metaslab_recalculate_weight_and_sort(msp);
+ return (B_TRUE);
+ }
+
+ msp->ms_flushing = B_TRUE;
+ uint64_t sm_len_before = space_map_length(msp->ms_sm);
+
+ mutex_exit(&msp->ms_lock);
+ space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
+ SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t sm_len_after = space_map_length(msp->ms_sm);
+ if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
+ zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
+ "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ (sm_len_after - sm_len_before));
+ }
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ metaslab_flush_update(msp, tx);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ msp->ms_flushing = B_FALSE;
+ cv_broadcast(&msp->ms_flush_cv);
+ return (B_TRUE);
}
/*
@@ -2811,7 +3236,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
objset_t *mos = spa_meta_objset(spa);
range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
dmu_tx_t *tx;
- uint64_t object = space_map_object(msp->ms_sm);
ASSERT(!vd->vdev_ishole);
@@ -2858,25 +3282,53 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
- if (msp->ms_sm == NULL) {
- uint64_t new_object;
+ /*
+ * Generate a log space map if one doesn't exist already.
+ */
+ spa_generate_syncing_log_sm(spa, tx);
- new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
+ if (msp->ms_sm == NULL) {
+ uint64_t new_object = space_map_alloc(mos,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log :
+ zfs_metaslab_sm_blksz_no_log, tx);
VERIFY3U(new_object, !=, 0);
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &new_object, tx);
+
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
msp->ms_start, msp->ms_size, vd->vdev_ashift));
-
ASSERT(msp->ms_sm != NULL);
+
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
ASSERT0(metaslab_allocated_space(msp));
}
+ if (metaslab_unflushed_txg(msp) == 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ spa_log_sm_increment_current_mscount(spa);
+ spa_log_summary_add_flushed_metaslab(spa);
+
+ ASSERT(msp->ms_sm != NULL);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
+
if (!range_tree_is_empty(msp->ms_checkpointing) &&
vd->vdev_checkpoint_sm == NULL) {
ASSERT(spa_has_checkpoint(spa));
uint64_t new_object = space_map_alloc(mos,
- vdev_standard_sm_blksz, tx);
+ zfs_vdev_standard_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
@@ -2905,10 +3357,39 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
- if (msp->ms_loaded && metaslab_should_condense(msp)) {
- metaslab_condense(msp, txg, tx);
+ if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
+ metaslab_should_condense(msp))
+ metaslab_condense(msp, tx);
+
+ /*
+ * We'll be going to disk to sync our space accounting, thus we
+ * drop the ms_lock during that time so allocations coming from
+ * open-context (ZIL) for future TXGs do not block.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_t *log_sm = spa_syncing_log_sm(spa);
+ if (log_sm != NULL) {
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ space_map_write(log_sm, alloctree, SM_ALLOC,
+ vd->vdev_id, tx);
+ space_map_write(log_sm, msp->ms_freeing, SM_FREE,
+ vd->vdev_id, tx);
+ mutex_enter(&msp->ms_lock);
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_remove_xor_add(alloctree,
+ msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
+ range_tree_remove_xor_add(msp->ms_freeing,
+ msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
+ spa->spa_unflushed_stats.sus_memused +=
+ metaslab_unflushed_changes_memused(msp);
} else {
- mutex_exit(&msp->ms_lock);
+ ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+
space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
SM_NO_VDEVID, tx);
space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
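In the log space map branch above, the two range_tree_remove_xor_add() calls fold this TXG's allocs and frees into the unflushed trees: a change that has an opposite pending change cancels it, and whatever is left over becomes newly pending. A toy bitmap version of that bookkeeping (stand-in types, not the ZFS API):

#include <stdint.h>
#include <stdio.h>

static void
remove_xor_add(uint16_t batch, uint16_t *removefrom, uint16_t *addto)
{
	uint16_t cancelled = batch & *removefrom;

	*removefrom &= ~cancelled;       /* opposite pending change is dropped */
	*addto |= (batch & ~cancelled);  /* the rest becomes newly pending */
}

int
main(void)
{
	uint16_t unflushed_allocs = 0x00f0, unflushed_frees = 0x0f00;
	uint16_t this_txg_allocs = 0x0300, this_txg_frees = 0x0030;

	/* allocs first cancel pending frees, the rest become pending allocs */
	remove_xor_add(this_txg_allocs, &unflushed_frees, &unflushed_allocs);
	/* frees first cancel pending allocs, the rest become pending frees */
	remove_xor_add(this_txg_frees, &unflushed_allocs, &unflushed_frees);

	printf("unflushed allocs 0x%04x, unflushed frees 0x%04x\n",
	    (unsigned)unflushed_allocs, (unsigned)unflushed_frees);
	return (0);
}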
@@ -2928,7 +3409,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* Since we are doing writes to disk and the ms_checkpointing
* tree won't be changing during that time, we drop the
- * ms_lock while writing to the checkpoint space map.
+ * ms_lock while writing to the checkpoint space map, for the
+ * same reason mentioned above.
*/
mutex_exit(&msp->ms_lock);
space_map_write(vd->vdev_checkpoint_sm,
@@ -2996,6 +3478,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* and instead will just swap the pointers for freeing and freed.
* We can safely do this since the freed_tree is guaranteed to be
* empty on the initial pass.
+ *
+ * Keep in mind that even if we are currently using a log spacemap
+ * we want current frees to end up in the ms_allocatable (but not
+ * get appended to the ms_sm) so their ranges can be reused as usual.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
@@ -3015,11 +3501,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
mutex_exit(&msp->ms_lock);
- if (object != space_map_object(msp->ms_sm)) {
- object = space_map_object(msp->ms_sm);
- dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
- msp->ms_id, sizeof (uint64_t), &object, tx);
- }
+ /*
+ * Verify that the space map object ID has been recorded in the
+ * vdev_ms_array.
+ */
+ uint64_t object;
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array,
+ msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
+ VERIFY3U(object, ==, space_map_object(msp->ms_sm));
+
mutex_exit(&msp->ms_sync_lock);
dmu_tx_commit(tx);
}
@@ -3084,14 +3574,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
msp->ms_freed = range_tree_create(NULL, NULL);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ASSERT(msp->ms_defer[t] == NULL);
-
+ ASSERT3P(msp->ms_defer[t], ==, NULL);
msp->ms_defer[t] = range_tree_create(NULL, NULL);
}
ASSERT3P(msp->ms_checkpointing, ==, NULL);
msp->ms_checkpointing = range_tree_create(NULL, NULL);
+ ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
+ msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
+ ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
+ msp->ms_unflushed_frees = range_tree_create(NULL, NULL);
+
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
ASSERT0(range_tree_space(msp->ms_freeing));
@@ -3108,21 +3602,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
defer_delta = 0;
alloc_delta = msp->ms_allocated_this_txg -
range_tree_space(msp->ms_freed);
+
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
} else {
defer_delta -= range_tree_space(*defer_tree);
}
-
metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
defer_delta, 0);
- /*
- * If there's a metaslab_load() in progress, wait for it to complete
- * so that we have a consistent view of the in-core space map.
- */
- metaslab_load_wait(msp);
+ if (spa_syncing_log_sm(spa) == NULL) {
+ /*
+ * If there's a metaslab_load() in progress and we don't have
+ * a log space map, it means that we probably wrote to the
+ * metaslab's space map. If this is the case, we need to
+ * make sure that we wait for the load to complete so that we
+ * have a consistent view of the in-core side of the metaslab.
+ */
+ metaslab_load_wait(msp);
+ } else {
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+ }
/*
* When auto-trimming is enabled, free ranges which are added to
@@ -3451,6 +3952,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
range_tree_t *rt = msp->ms_allocatable;
metaslab_class_t *mc = msp->ms_group->mg_class;
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
VERIFY(!msp->ms_condensing);
VERIFY0(msp->ms_disabled);
@@ -4864,12 +5366,23 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
offset, size);
}
- range_tree_verify_not_present(msp->ms_trim, offset, size);
+ /*
+ * Check all segments that currently exist in the freeing pipeline.
+ *
+ * It would intuitively make sense to also check the current allocating
+ * tree since metaslab_unalloc_dva() exists for extents that are
+ * allocated and freed in the same sync pass within the same txg.
+ * Unfortunately there are places (e.g. the ZIL) where we allocate a
+ * segment but then we free part of it within the same txg
+ * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
+ * current allocating tree.
+ */
range_tree_verify_not_present(msp->ms_freeing, offset, size);
range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
range_tree_verify_not_present(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
range_tree_verify_not_present(msp->ms_defer[j], offset, size);
+ range_tree_verify_not_present(msp->ms_trim, offset, size);
mutex_exit(&msp->ms_lock);
}
@@ -4979,6 +5492,57 @@ metaslab_enable(metaslab_t *msp, boolean_t sync)
mutex_exit(&mg->mg_ms_disabled_lock);
}
+static void
+metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
+{
+ vdev_t *vd = ms->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ metaslab_unflushed_phys_t entry = {
+ .msp_unflushed_txg = metaslab_unflushed_txg(ms),
+ };
+ uint64_t entry_size = sizeof (entry);
+ uint64_t entry_offset = ms->ms_id * entry_size;
+
+ uint64_t object = 0;
+ int err = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
+ &object);
+ if (err == ENOENT) {
+ object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
+ &object, tx));
+ } else {
+ VERIFY0(err);
+ }
+
+ dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
+ &entry, tx);
+}
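The on-disk layout used above is a simple fixed-slot array: metaslab N's record lives at offset N * sizeof (metaslab_unflushed_phys_t) within the single object referenced from the vdev's top-level ZAP. A minimal sketch of that offset computation, with a stand-in struct mirroring only the one field the patch writes:

#include <stdint.h>
#include <stdio.h>

/* stand-in for the one field this patch persists per metaslab */
typedef struct toy_metaslab_unflushed_phys {
	uint64_t msp_unflushed_txg;
} toy_metaslab_unflushed_phys_t;

int
main(void)
{
	uint64_t ms_id = 42;
	uint64_t entry_size = sizeof (toy_metaslab_unflushed_phys_t);
	uint64_t entry_offset = ms_id * entry_size;

	printf("metaslab %llu -> offset %llu (record size %llu bytes)\n",
	    (unsigned long long)ms_id, (unsigned long long)entry_offset,
	    (unsigned long long)entry_size);
	return (0);
}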
+
+void
+metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
+{
+ spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ms->ms_unflushed_txg = txg;
+ metaslab_update_ondisk_flush_data(ms, tx);
+}
+
+uint64_t
+metaslab_unflushed_txg(metaslab_t *ms)
+{
+ return (ms->ms_unflushed_txg);
+}
+
#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);