Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/spa.c           366
-rw-r--r--  module/zfs/vdev_removal.c    3
2 files changed, 200 insertions, 169 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c98daab49..bbe2f8962 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -7323,6 +7323,9 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
@@ -7718,10 +7721,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
-
- ASSERT(spa->spa_sync_pass == 1);
+ if (spa_sync_pass(spa) != 1)
+ return;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
@@ -7817,118 +7820,31 @@ vdev_indirect_state_sync_verify(vdev_t *vd)
}
/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
*/
-void
-spa_sync(spa_t *spa, uint64_t txg)
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- metaslab_class_t *normal = spa_normal_class(spa);
- metaslab_class_t *special = spa_special_class(spa);
- metaslab_class_t *dedup = spa_dedup_class(spa);
+ ASSERT(spa_writeable(spa));
+
vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
- VERIFY(spa_writeable(spa));
-
- /*
- * Wait for i/os issued in open context that need to complete
- * before this txg syncs.
- */
- (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
- }
-
- /*
- * If there are any pending vdev state changes, convert them
- * into config changes that go out with this transaction group.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while (list_head(&spa->spa_state_dirty_list) != NULL) {
- /*
- * We need the write lock here because, for aux vdevs,
- * calling vdev_config_dirty() modifies sav_config.
- * This is ugly and will become unnecessary when we
- * eliminate the aux vdev wart by integrating all vdevs
- * into the root vdev tree.
- */
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
- while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
- vdev_state_clean(vd);
- vdev_config_dirty(vd);
- }
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
- }
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- spa->spa_sync_starttime = gethrtime();
- taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
- spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
- NSEC_TO_TICK(spa->spa_deadman_synctime));
-
- /*
- * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * Set the top-level vdev's max queue depth. Evaluate each
- * top-level's async write queue depth in case it changed.
- * The max queue depth will not change in the middle of syncing
- * out this txg.
- */
uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
- metaslab_class_t *mc;
+ metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || !metaslab_group_initialized(mg))
continue;
- mc = mg->mg_class;
+ metaslab_class_t *mc = mg->mg_class;
if (mc != normal && mc != special && mc != dedup)
continue;
@@ -7960,7 +7876,14 @@ spa_sync(spa_t *spa, uint64_t txg)
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+ ASSERT(spa_writeable(spa));
+ vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
@@ -7970,10 +7893,16 @@ spa_sync(spa_t *spa, uint64_t txg)
break;
}
}
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ uint64_t txg = tx->tx_txg;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- /*
- * Iterate to convergence.
- */
do {
int pass = ++spa->spa_sync_pass;
@@ -7999,81 +7928,60 @@ spa_sync(spa_t *spa, uint64_t txg)
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
+ svr_sync(spa, tx);
+ spa_sync_upgrades(spa, tx);
- if (spa->spa_vdev_removal != NULL)
- svr_sync(spa, tx);
-
+ vdev_t *vd = NULL;
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
!= NULL)
vdev_sync(vd, txg);
- if (pass == 1) {
- spa_sync_upgrades(spa, tx);
- ASSERT3U(txg, >=,
- spa->spa_uberblock.ub_rootbp.blk_birth);
+ /*
+ * Note: We need to check if the MOS is dirty because we could
+ * have marked the MOS dirty without updating the uberblock
+ * (e.g. if we have sync tasks but no dirty user data). We need
+ * to check the uberblock's rootbp because it is updated if we
+ * have synced out dirty data (though in this case the MOS will
+ * most likely also be dirty due to second order effects, we
+ * don't want to rely on that here).
+ */
+ if (pass == 1 &&
+ spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
/*
- * Note: We need to check if the MOS is dirty
- * because we could have marked the MOS dirty
- * without updating the uberblock (e.g. if we
- * have sync tasks but no dirty user data). We
- * need to check the uberblock's rootbp because
- * it is updated if we have synced out dirty
- * data (though in this case the MOS will most
- * likely also be dirty due to second order
- * effects, we don't want to rely on that here).
+ * Nothing changed on the first pass, therefore this
+ * TXG is a no-op. Avoid syncing deferred frees, so
+ * that we can keep this TXG as a no-op.
*/
- if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
- !dmu_objset_is_dirty(mos, txg)) {
- /*
- * Nothing changed on the first pass,
- * therefore this TXG is a no-op. Avoid
- * syncing deferred frees, so that we
- * can keep this TXG as a no-op.
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
- txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
- ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
- txg));
- break;
- }
- spa_sync_deferred_frees(spa, tx);
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+ break;
}
+ spa_sync_deferred_frees(spa, tx);
} while (dmu_objset_is_dirty(mos, txg));
+}
-#ifdef ZFS_DEBUG
- if (!list_is_empty(&spa->spa_config_dirty_list)) {
- /*
- * Make sure that the number of ZAPs for all the vdevs matches
- * the number of ZAPs in the per-vdev ZAP list. This only gets
- * called if the config is dirty; otherwise there may be
- * outstanding AVZ operations that weren't completed in
- * spa_sync_config_object.
- */
- uint64_t all_vdev_zap_entry_count;
- ASSERT0(zap_count(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
- ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
- all_vdev_zap_entry_count);
- }
-#endif
-
- if (spa->spa_vdev_removal != NULL) {
- ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
- }
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg = tx->tx_txg;
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are no dirty vdevs, we sync the uberblock to a few
- * random top-level vdevs that are known to be visible in the
- * config cache (see spa_vdev_add() for a complete description).
- * If there *are* dirty vdevs, sync the uberblock to all vdevs.
- */
for (;;) {
+ int error = 0;
+
/*
* We hold SCL_STATE to prevent vdev open/close/etc.
* while we're attempting to write the vdev labels.
@@ -8087,13 +7995,15 @@ spa_sync(spa_t *spa, uint64_t txg)
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
+ vdev_t *vd =
+ rvd->vdev_child[(c0 + c) % children];
/* Stop when revisiting the first vdev */
if (c > 0 && svd[0] == vd)
break;
- if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
@@ -8117,6 +8027,124 @@ spa_sync(spa_t *spa, uint64_t txg)
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
zio_resume_wait(spa);
}
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ vdev_t *vd = NULL;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
+ spa->spa_sync_starttime = gethrtime();
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ int i;
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ spa_sync_adjust_vdev_max_queue_depth(spa);
+
+ spa_sync_condense_indirect(spa, tx);
+
+ spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+#endif
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx);
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
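
Taken together, the spa.c hunks above reduce spa_sync() to a short sequence of helper calls. As a minimal sketch (illustrative only, not part of the patch; the setup, config locking, deadman handling, debug checks, and teardown are elided, and only the helper names are taken from the diff), the restructured flow reads roughly as:

	void
	spa_sync(spa_t *spa, uint64_t txg)
	{
		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

		/* Recompute per-vdev queue depths and allocator throttle state. */
		spa_sync_adjust_vdev_max_queue_depth(spa);

		/* Verify indirect-vdev state and condense mappings if needed. */
		spa_sync_condense_indirect(spa, tx);

		/*
		 * Main convergence loop; svr_sync(), spa_sync_upgrades() and
		 * spa_sync_deferred_frees() now guard their own preconditions
		 * instead of being guarded by the caller.
		 */
		spa_sync_iterate_to_convergence(spa, tx);

		/* Write out the uberblock and vdev labels for this txg. */
		spa_sync_rewrite_vdev_config(spa, tx);

		dmu_tx_commit(tx);
	}
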
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 1896e824f..8d8900787 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1798,6 +1798,9 @@ svr_sync(spa_t *spa, dmu_tx_t *tx)
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ if (svr == NULL)
+ return;
+
/*
* This check is necessary so that we do not dirty the
* DIRECTORY_OBJECT via spa_sync_removing_state() when there