Diffstat (limited to 'module/zfs/metaslab.c')
-rw-r--r-- | module/zfs/metaslab.c | 976 | +++++++++++++++++++++++++-------
1 file changed, 770 insertions(+), 206 deletions(-)
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5da929b48..0b22aa875 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -56,12 +56,21 @@ unsigned long metaslab_aliquot = 512 << 10; unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered - * around the disk. + * around the disk. So a sane default for the space map block size + * is 8~16K. */ -int zfs_metaslab_sm_blksz = (1 << 12); +int zfs_metaslab_sm_blksz_no_log = (1 << 14); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. @@ -270,6 +279,7 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); +static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; #endif @@ -540,67 +550,6 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. - */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. 
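The two block-size tunables introduced at the top of this hunk are consumed at both places in the patch that create or truncate a metaslab space map. A minimal sketch of that selection, factored into a helper purely for illustration (the helper name is hypothetical; the patch itself open-codes the ternary at each call site):

/* Hypothetical helper; the patch open-codes this ternary at its call sites. */
static int
metaslab_sm_blksz(spa_t *spa)
{
	return (spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log);
}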
- */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); -} - /* * ========================================================================== * Metaslab groups @@ -689,6 +638,25 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mutex_exit(&mg->mg_lock); } +int +metaslab_sort_by_flushed(const void *va, const void *vb) +{ + const metaslab_t *a = va; + const metaslab_t *b = vb; + + int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + if (likely(cmp)) + return (cmp); + + uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; + uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; + cmp = AVL_CMP(a_vdev_id, b_vdev_id); + if (cmp) + return (cmp); + + return (AVL_CMP(a->ms_id, b->ms_id)); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { @@ -703,7 +671,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; @@ -900,7 +868,6 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) @@ -1454,6 +1421,101 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; * ========================================================================== */ +/* + * Wait for any in-progress metaslab loads to complete. + */ +void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +/* + * Wait for any in-progress flushing to complete. + */ +void +metaslab_flush_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_flushing) + cv_wait(&msp->ms_flush_cv, &msp->ms_lock); +} + +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +static void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocating = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + /* + * Even though the smp_alloc field can get negative, + * when it comes to a metaslab's space map, that should + * never be the case. 
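The relocated metaslab_verify_space() (its body resumes with the assertions just below) encodes the feature's core accounting identity: in-core allocated space equals on-disk allocated space, plus unflushed allocations, minus unflushed frees. A hypothetical worked example, with numbers invented for illustration:

/*
 * Hypothetical figures for the identity asserted below:
 *   space_map_allocated(ms_sm)            = 10M  (on-disk view)
 *   range_tree_space(ms_unflushed_allocs) =  3M  (only in the log)
 *   range_tree_space(ms_unflushed_frees)  =  1M  (only in the log)
 *   metaslab_allocated_space(msp)         = 10M + 3M - 1M = 12M
 * The unflushed frees can never exceed what the on-disk map still
 * holds as allocated, hence the >= comparison between the two.
 */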
+ */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + ASSERT3U(space_map_allocated(msp->ms_sm), >=, + range_tree_space(msp->ms_unflushed_frees)); + + ASSERT3U(metaslab_allocated_space(msp), ==, + space_map_allocated(msp->ms_sm) + + range_tree_space(msp->ms_unflushed_allocs) - + range_tree_space(msp->ms_unflushed_frees)); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + + /* + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocating += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + static void metaslab_aux_histograms_clear(metaslab_t *msp) { @@ -1651,20 +1713,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) VERIFY3U(msp->ms_weight, ==, weight); } -/* - * Wait for any in-progress metaslab loads to complete. - */ -static void -metaslab_load_wait(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); - } -} - static int metaslab_load_impl(metaslab_t *msp) { @@ -1679,13 +1727,19 @@ metaslab_load_impl(metaslab_t *msp) * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. + * If we are using the log space maps, metaslab_sync() can't write to + * the metaslab's space map while we are loading as we only write to + * it when we are flushing the metaslab, and that can't happen while + * we are loading it. + * + * If we are not using log space maps though, metaslab_sync() can + * append to the space map while we are loading. Therefore we load + * only entries that existed when we started the load. Additionally, + * metaslab_sync_done() has to wait for the load to complete because + * there are potential races like metaslab_load() loading parts of the + * space map that are currently being appended by metaslab_sync(). If + * we didn't, the ms_allocatable would have entries that + * metaslab_sync_done() would try to re-add later. 
* * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. */ @@ -1695,6 +1749,7 @@ metaslab_load_impl(metaslab_t *msp) uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); + hrtime_t load_start = gethrtime(); if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); @@ -1706,18 +1761,37 @@ */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); + + if (msp->ms_freed != NULL) { + /* + * If the ms_sm doesn't exist, this means that this + * metaslab hasn't gone through metaslab_sync() and + * thus has never been dirtied. So we shouldn't + * expect any unflushed allocs or frees from previous + * TXGs. + * + * Note: ms_freed and all the other trees except for + * the ms_allocatable, can be NULL at this point only + * if this is a new metaslab of a vdev that just got + * expanded. + */ + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. + * changing the ms_sm (or log_sm) and the metaslab's range trees + * while we are about to use them and populate the ms_allocatable. + * The ms_lock is insufficient for this because metaslab_sync() doesn't + * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); + ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); @@ -1728,10 +1802,60 @@ msp->ms_loaded = B_TRUE; /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in + * Apply all the unflushed changes to ms_allocatable right + * away so any manipulations we do below have a clear view + * of what is allocated and what is free. + */ + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_remove, msp->ms_allocatable); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_add, msp->ms_allocatable); + + msp->ms_loaded = B_TRUE; + + ASSERT3P(msp->ms_group, !=, NULL); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + if (spa_syncing_log_sm(spa) != NULL) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LOG_SPACEMAP)); + + /* + * If we use a log space map we add all the segments + * that are in ms_unflushed_frees so they are available + * for allocation. + * + * ms_allocatable needs to contain all free segments + * that are ready for allocations (thus not segments + * from ms_freeing, ms_freed, and the ms_defer trees). + * But if we grab the lock in this code path at a sync + * pass later than 1, then it also contains the + * segments of ms_freed (they were added to it earlier + * in this path through ms_unflushed_frees). So we + * need to remove all the segments that exist in + * ms_freed from ms_allocatable as they will be added + * later in metaslab_sync_done(). + * + * When there's no log space map, the ms_allocatable + * correctly doesn't contain any segments that exist + * in ms_freed [see ms_synced_length].
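To make the two range_tree_walk() calls above concrete, consider a hypothetical load of a 2M metaslab (all numbers invented for illustration):

/*
 * ms_sm on disk records [0, 1M) as free, so space_map_load_length()
 * seeds ms_allocatable with [0, 1M).
 *   ms_unflushed_allocs = { [0, 128K) }     -> walked with
 *       range_tree_remove, leaving [128K, 1M)
 *   ms_unflushed_frees  = { [1M, 1M+128K) } -> walked with
 *       range_tree_add, yielding [128K, 1M+128K)
 * The result is exactly the view a freshly flushed space map would
 * describe.
 */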
+ */ + range_tree_walk(msp->ms_freed, + range_tree_remove, msp->ms_allocatable); + } + + /* + * If we are not using the log space map, ms_allocatable + * contains the segments that exist in the ms_defer trees + * [see ms_synced_length]. Thus we need to remove them + * from ms_allocatable as they will be added again in * metaslab_sync_done(). + * + * If we are using the log space map, ms_allocatable still + * contains the segments that exist in the ms_defer trees. + * Not because it read them through the ms_sm though. But + * because these segments are part of ms_unflushed_frees + * whose segments we add to ms_allocatable earlier in this + * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], @@ -1756,10 +1880,26 @@ metaslab_load_impl(metaslab_t *msp) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_block_maxsize(msp); - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + hrtime_t load_end = gethrtime(); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, smp_length %llu, " + "unflushed_allocs %llu, unflushed_frees %llu, " + "freed %llu, defer %llu + %llu, " + "loading_time %lld ms", + spa_syncing_txg(spa), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + space_map_length(msp->ms_sm), + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + range_tree_space(msp->ms_freed), + range_tree_space(msp->ms_defer[0]), + range_tree_space(msp->ms_defer[1]), + (longlong_t)((load_end - load_start) / 1000000)); + } + metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); - return (0); } @@ -1778,8 +1918,32 @@ metaslab_load(metaslab_t *msp) VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); + /* + * We set the loading flag BEFORE potentially dropping the lock to + * wait for an ongoing flush (see ms_flushing below). This way other + * threads know that there is already a thread that is loading this + * metaslab. + */ msp->ms_loading = B_TRUE; + + /* + * Wait for any in-progress flushing to finish as we drop the ms_lock + * both here (during space_map_load()) and in metaslab_flush() (when + * we flush our changes to the ms_sm). + */ + if (msp->ms_flushing) + metaslab_flush_wait(msp); + + /* + * In the possibility that we were waiting for the metaslab to be + * flushed (where we temporarily dropped the ms_lock), ensure that + * no one else loaded the metaslab somehow. + */ + ASSERT(!msp->ms_loaded); + int error = metaslab_load_impl(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); @@ -1806,7 +1970,7 @@ metaslab_unload(metaslab_t *msp) * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not + * available in-core, whether it is loaded or not. 
* * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation @@ -1816,7 +1980,7 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } -static void +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -1830,8 +1994,8 @@ metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, } int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, + uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; @@ -1843,6 +2007,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1905,17 +2070,6 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_allocated_space(ms), 0, 0); } - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. - */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms)); - mutex_exit(&ms->ms_lock); - } - if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); @@ -1926,11 +2080,42 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, return (0); } +static void +metaslab_fini_flush_data(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (metaslab_unflushed_txg(msp) == 0) { + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), + ==, NULL); + return; + } + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); +} + +uint64_t +metaslab_unflushed_changes_memused(metaslab_t *ms) +{ + return ((range_tree_numsegs(ms->ms_unflushed_allocs) + + range_tree_numsegs(ms->ms_unflushed_frees)) * + sizeof (range_seg_t)); +} + void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + + metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); @@ -1940,13 +2125,22 @@ metaslab_fini(metaslab_t *msp) -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); + msp->ms_sm = NULL; metaslab_unload(msp); - range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); + for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } @@ -1966,6 +2160,7 @@ 
metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -2207,9 +2402,9 @@ metaslab_weight_from_range_tree(metaslab_t *msp) } /* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). + * Calculate the weight based on the on-disk histogram. Should be applied + * only to unloaded metaslabs (i.e. no incoming allocations) in order to + * give results consistent with the on-disk state. */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) @@ -2283,7 +2478,6 @@ metaslab_segment_weight(metaslab_t *msp) } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - return (weight); } @@ -2651,18 +2845,19 @@ metaslab_group_preload(metaslab_group_t *mg) } /* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. We would like to use the following criteria to make - * our decision: + * Determine if the space map's on-disk footprint is past our tolerance for + * inefficiency. We would like to use the following criteria to make our + * decision: * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. + * 1. Do not condense if the size of the space map object would dramatically + * increase as a result of writing out the free space range tree. * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). + * 2. Condense if the on-disk space map representation is at least + * zfs_condense_pct/100 times the size of the optimal representation + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * - * 3. The on-disk size of the space map should actually decrease. + * 3. Do not condense if the on-disk size of the space map does not actually + * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. @@ -2676,27 +2871,11 @@ metaslab_should_condense(metaslab_t *msp) space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass.
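Concretely, the object_size/optimal_size comparison that metaslab_should_condense() now performs (shown in the next hunk) could play out as follows; the numbers are hypothetical, and the default value of zfs_metaslab_condense_block_threshold is assumed here to be 4:

/*
 * Assume zfs_condense_pct = 110 and record_size = 128K:
 *   object_size  = 5M   (space_map_length() of the current ms_sm)
 *   optimal_size = 1M   (space_map_estimate_optimal_size())
 * Check 1: 5M >= 1M * 110 / 100 = 1.1M  -> true
 * Check 2: 5M > 4 * 128K = 512K         -> true
 * so metaslab_should_condense() returns B_TRUE and the metaslab is
 * condensed in this sync pass.
 */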
- */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; + ASSERT(sm != NULL); + ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for @@ -2706,97 +2885,343 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_wanted) return (B_TRUE); - uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); + uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. + * The minimized form consists of a small number of allocations followed + * by the entries of the free range tree (ms_allocatable). The condensed + * spacemap contains all the entries of previous TXGs (including those in + * the pool-wide log spacemaps; thus this is effectively a superset of + * metaslab_flush()), but this TXG's entries still need to be written. */ static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + ASSERT(msp->ms_sm != NULL); + /* + * In order to condense the space map, we need to change it so it + * only describes which segments are currently allocated and free. + * + * All the current free space resides in the ms_allocatable, all + * the ms_defer trees, and all the ms_allocating trees. We ignore + * ms_freed because it is empty because we're in sync pass 1. We + * ignore ms_freeing because these changes are not yet reflected + * in the spacemap (they will be written later this txg). + * + * So to truncate the space map to represent all the entries of + * previous TXGs we do the following: + * + * 1] We create a range tree (condense tree) that is 100% allocated. + * 2] We remove from it all segments found in the ms_defer trees + * as those segments are marked as free in the original space + * map. We do the same with the ms_allocating trees for the same + * reason. Removing these segments should be a relatively + * inexpensive operation since we expect these trees to have a + * small number of nodes. + * 3] We vacate any unflushed allocs as they should already exist + * in the condense tree. Then we vacate any unflushed frees as + * they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to remove all segments + * in the ms_allocatable tree from the condense tree. This way + * we would write all the entries of the condense tree as the + * condensed space map, which would only contain allocated + * segments with everything else assumed to be freed. + * + * Doing so can be prohibitively expensive as ms_allocatable can + * be large, and therefore computationally expensive to subtract + * from the condense_tree. 
Instead we first sync out the + * condense_tree and then the ms_allocatable, in the condensed + * space map. While this is not optimal, it is typically close to + * optimal and more importantly much cheaper to compute. + * + * 5] Finally, as both of the unflushed trees were written to our + * new and condensed metaslab space map, we basically flushed + * all the unflushed changes to disk, thus we call + * metaslab_flush_update(). + */ + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), + spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. - */ condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); - for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg + * We're about to drop the metaslab's lock thus allowing other + * consumers to change its content. Set the metaslab's ms_condensing + * flag to ensure that allocations on this metaslab do not occur + * while we're in the middle of committing it to disk. This is only + * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); + uint64_t object = space_map_object(msp->ms_sm); + space_map_truncate(sm, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? + zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); + + /* + * space_map_truncate() may have reallocated the spacemap object. + * If so, update the vdev_ms_array.
+ */ + if (space_map_object(msp->ms_sm) != object) { + object = space_map_object(msp->ms_sm); + dmu_write(spa->spa_meta_objset, + msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. + * Note: + * When the log space map feature is enabled, each space map will + * always have ALLOCS followed by FREES for each sync pass. This is + * typically true even when the log space map feature is disabled, + * except in the case where a metaslab goes through metaslab_sync() + * and gets condensed. In that case the metaslab's space map will have + * ALLOCS followed by FREES (due to condensing) followed by ALLOCS + * followed by FREES (due to space_map_write() in metaslab_sync()) for + * sync pass 1. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); + msp->ms_condensing = B_FALSE; + metaslab_flush_update(msp, tx); +} + +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc.) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op.
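The ordering note above can be read off directly from the writes in this hunk: for a metaslab that condenses in sync pass 1 of a pool without the log space map feature, its space map for that TXG ends up containing, in order (an illustration of the note, not additional code from the patch):

/*
 * ALLOC entries from condense_tree      (metaslab_condense())
 * FREE entries from ms_allocatable      (metaslab_condense())
 * ALLOC entries from the alloctree      (metaslab_sync())
 * FREE entries from ms_freeing          (metaslab_sync())
 */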
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); + + /* update metaslab's position in our flushing tree */ + uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + /* update metaslab counts of spa_log_sm_t nodes */ + spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_sm_increment_current_mscount(spa); + + /* cleanup obsolete logs if any */ + uint64_t log_blocks_before = spa_log_sm_nblocks(spa); + spa_cleanup_old_sm_logs(spa, tx); + uint64_t log_blocks_after = spa_log_sm_nblocks(spa); + VERIFY3U(log_blocks_after, <=, log_blocks_before); + + /* update log space map summary */ + uint64_t blocks_gone = log_blocks_before - log_blocks_after; + spa_log_summary_add_flushed_metaslab(spa); + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_summary_decrement_blkcount(spa, blocks_gone); +} + +boolean_t +metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); + + /* + * There is nothing wrong with flushing the same metaslab twice, as + * this codepath should work on that case. However, the current + * flushing scheme makes sure to avoid this situation as we would be + * making all these calls without having anything meaningful to write + * to disk. We assert this behavior here. + */ + ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); + + /* + * We can not flush while loading, because then we would + * not load the ms_unflushed_{allocs,frees}. + */ + if (msp->ms_loading) + return (B_FALSE); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + /* + * Metaslab condensing is effectively flushing. Therefore if the + * metaslab can be condensed we can just condense it instead of + * flushing it. + * + * Note that metaslab_condense() does call metaslab_flush_update() + * so we can just return immediately after condensing. We also + * don't need to care about setting ms_flushing or broadcasting + * ms_flush_cv, even if we temporarily drop the ms_lock in + * metaslab_condense(), as the metaslab is already loaded. + */ + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_group_t *mg = msp->ms_group; + + /* + * For all histogram operations below refer to the + * comments of metaslab_sync() where we follow a + * similar procedure. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + metaslab_condense(msp, tx); + + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + ASSERT(range_tree_is_empty(msp->ms_freed)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + + /* + * Since we recreated the histogram (and potentially + * the ms_sm too while condensing) ensure that the + * weight is updated too because we are not guaranteed + * that this metaslab is dirty and will go through + * metaslab_sync_done(). + */ + metaslab_recalculate_weight_and_sort(msp); + return (B_TRUE); + } + + msp->ms_flushing = B_TRUE; + uint64_t sm_len_before = space_map_length(msp->ms_sm); + + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + uint64_t sm_len_after = space_map_length(msp->ms_sm); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " + "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + (sm_len_after - sm_len_before)); + } + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + metaslab_flush_update(msp, tx); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + msp->ms_flushing = B_FALSE; + cv_broadcast(&msp->ms_flush_cv); + return (B_TRUE); } /* @@ -2811,7 +3236,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); @@ -2858,25 +3282,53 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - if (msp->ms_sm == NULL) { - uint64_t new_object; + /* + * Generate a log space map if one doesn't exist already. + */ + spa_generate_syncing_log_sm(spa, tx); - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); + if (msp->ms_sm == NULL) { + uint64_t new_object = space_map_alloc(mos, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 
+ zfs_metaslab_sm_blksz_with_log : + zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &new_object, tx); + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); - ASSERT(msp->ms_sm != NULL); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } + if (metaslab_unflushed_txg(msp) == 0 && + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT(spa_syncing_log_sm(spa) != NULL); + + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa); + + ASSERT(msp->ms_sm != NULL); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } + if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, @@ -2905,10 +3357,39 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); + if (spa->spa_sync_pass == 1 && msp->ms_loaded && + metaslab_should_condense(msp)) + metaslab_condense(msp, tx); + + /* + * We'll be going to disk to sync our space accounting, thus we + * drop the ms_lock during that time so allocations coming from + * open-context (ZIL) for future TXGs do not block. + */ + mutex_exit(&msp->ms_lock); + space_map_t *log_sm = spa_syncing_log_sm(spa); + if (log_sm != NULL) { + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(log_sm, alloctree, SM_ALLOC, + vd->vdev_id, tx); + space_map_write(log_sm, msp->ms_freeing, SM_FREE, + vd->vdev_id, tx); + mutex_enter(&msp->ms_lock); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_remove_xor_add(alloctree, + msp->ms_unflushed_frees, msp->ms_unflushed_allocs); + range_tree_remove_xor_add(msp->ms_freeing, + msp->ms_unflushed_allocs, msp->ms_unflushed_frees); + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(msp); } else { - mutex_exit(&msp->ms_lock); + ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, @@ -2928,7 +3409,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. + * ms_lock while writing to the checkpoint space map, for the + * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, @@ -2996,6 +3478,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * and instead will just swap the pointers for freeing and freed. 
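The two range_tree_remove_xor_add() calls in the log space map branch above fold this TXG's allocations and frees into the cumulative unflushed trees, cancelling ranges that appear on both sides. A hypothetical example (segments invented for illustration):

/*
 * Suppose ms_unflushed_frees holds [100K, 200K) and this TXG's
 * alloctree holds [120K, 150K). Folding the alloctree in removes
 * the overlap from ms_unflushed_frees (leaving [100K, 120K) and
 * [150K, 200K)) and adds nothing to ms_unflushed_allocs: an
 * unflushed free followed by an allocation of the same range is a
 * no-op as far as the on-disk space map is concerned. Only the
 * non-overlapping parts of the alloctree would be added to
 * ms_unflushed_allocs.
 */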
* We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. + * + * Keep in mind that even if we are currently using a log spacemap + * we want current frees to end up in the ms_allocatable (but not + * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); @@ -3015,11 +3501,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } + /* + * Verify that the space map object ID has been recorded in the + * vdev_ms_array. + */ + uint64_t object; + VERIFY0(dmu_read(mos, vd->vdev_ms_array, + msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); + VERIFY3U(object, ==, space_map_object(msp->ms_sm)); + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } @@ -3084,14 +3574,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - + ASSERT3P(msp->ms_defer[t], ==, NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_frees, ==, NULL); + msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); @@ -3108,21 +3602,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); + if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); + if (spa_syncing_log_sm(spa) == NULL) { + /* + * If there's a metaslab_load() in progress and we don't have + * a log space map, it means that we probably wrote to the + * metaslab's space map. If this is the case, we need to + * make sure that we wait for the load to complete so that we + * have a consistent view of the in-core side of the metaslab. + */ + metaslab_load_wait(msp); + } else { + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + } /* * When auto-trimming is enabled, free ranges which are added to @@ -3451,6 +3952,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; + ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); @@ -4864,12 +5366,23 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) offset, size); } - range_tree_verify_not_present(msp->ms_trim, offset, size); + /* + * Check all segments that currently exist in the freeing pipeline.
+ * + * It would intuitively make sense to also check the current allocating + * tree since metaslab_unalloc_dva() exists for extents that are + * allocated and freed in the same sync pass within the same txg. + * Unfortunately there are places (e.g. the ZIL) where we allocate a + * segment but then we free part of it within the same txg + * [see zil_sync()]. Thus, we don't call range_tree_verify_not_present() + * on the current allocating tree. + */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } @@ -4979,6 +5492,57 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) mutex_exit(&mg->mg_ms_disabled_lock); } +static void +metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) +{ + vdev_t *vd = ms->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + metaslab_unflushed_phys_t entry = { + .msp_unflushed_txg = metaslab_unflushed_txg(ms), + }; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object); + if (err == ENOENT) { + object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object, tx)); + } else { + VERIFY0(err); + } + + dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, + &entry, tx); +} + +void +metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = ms->ms_group->mg_vd->vdev_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ms->ms_unflushed_txg = txg; + metaslab_update_ondisk_flush_data(ms, tx); +} + +uint64_t +metaslab_unflushed_txg(metaslab_t *ms) +{ + return (ms->ms_unflushed_txg); +} + #if defined(_KERNEL) /* BEGIN CSTYLED */ module_param(metaslab_aliquot, ulong, 0644);
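On disk, each metaslab's unflushed txg therefore lives at a fixed offset (ms_id * sizeof (metaslab_unflushed_phys_t)) in a per-top-level-vdev object, keyed by VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS. A sketch of how that entry could be read back, mirroring metaslab_update_ondisk_flush_data() above; the reader function itself is hypothetical (the real import-time counterpart lives outside this file):

/* Hypothetical reader for the per-metaslab unflushed-txg entry. */
static int
metaslab_read_unflushed_txg(vdev_t *vd, uint64_t ms_id, uint64_t *txgp)
{
	objset_t *mos = spa_meta_objset(vd->vdev_spa);
	metaslab_unflushed_phys_t entry;
	uint64_t object;

	/* Same ZAP key that metaslab_update_ondisk_flush_data() uses. */
	int err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object);
	if (err != 0)
		return (err);

	err = dmu_read(mos, object, ms_id * sizeof (entry),
	    sizeof (entry), &entry, 0);
	if (err == 0)
		*txgp = entry.msp_unflushed_txg;
	return (err);
}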