aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/spa.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/spa.c')
-rw-r--r--module/zfs/spa.c366
1 files changed, 197 insertions, 169 deletions
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c98daab49..bbe2f8962 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -7323,6 +7323,9 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
@@ -7718,10 +7721,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
-
- ASSERT(spa->spa_sync_pass == 1);
+ if (spa_sync_pass(spa) != 1)
+ return;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
@@ -7817,118 +7820,31 @@ vdev_indirect_state_sync_verify(vdev_t *vd)
}
/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
*/
-void
-spa_sync(spa_t *spa, uint64_t txg)
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- metaslab_class_t *normal = spa_normal_class(spa);
- metaslab_class_t *special = spa_special_class(spa);
- metaslab_class_t *dedup = spa_dedup_class(spa);
+ ASSERT(spa_writeable(spa));
+
vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
- VERIFY(spa_writeable(spa));
-
- /*
- * Wait for i/os issued in open context that need to complete
- * before this txg syncs.
- */
- (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
- }
-
- /*
- * If there are any pending vdev state changes, convert them
- * into config changes that go out with this transaction group.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while (list_head(&spa->spa_state_dirty_list) != NULL) {
- /*
- * We need the write lock here because, for aux vdevs,
- * calling vdev_config_dirty() modifies sav_config.
- * This is ugly and will become unnecessary when we
- * eliminate the aux vdev wart by integrating all vdevs
- * into the root vdev tree.
- */
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
- while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
- vdev_state_clean(vd);
- vdev_config_dirty(vd);
- }
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
- }
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- spa->spa_sync_starttime = gethrtime();
- taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
- spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
- NSEC_TO_TICK(spa->spa_deadman_synctime));
-
- /*
- * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * Set the top-level vdev's max queue depth. Evaluate each
- * top-level's async write queue depth in case it changed.
- * The max queue depth will not change in the middle of syncing
- * out this txg.
- */
uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
- metaslab_class_t *mc;
+ metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || !metaslab_group_initialized(mg))
continue;
- mc = mg->mg_class;
+ metaslab_class_t *mc = mg->mg_class;
if (mc != normal && mc != special && mc != dedup)
continue;
@@ -7960,7 +7876,14 @@ spa_sync(spa_t *spa, uint64_t txg)
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+ ASSERT(spa_writeable(spa));
+ vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
@@ -7970,10 +7893,16 @@ spa_sync(spa_t *spa, uint64_t txg)
break;
}
}
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ uint64_t txg = tx->tx_txg;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- /*
- * Iterate to convergence.
- */
do {
int pass = ++spa->spa_sync_pass;
@@ -7999,81 +7928,60 @@ spa_sync(spa_t *spa, uint64_t txg)
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
+ svr_sync(spa, tx);
+ spa_sync_upgrades(spa, tx);
- if (spa->spa_vdev_removal != NULL)
- svr_sync(spa, tx);
-
+ vdev_t *vd = NULL;
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
!= NULL)
vdev_sync(vd, txg);
- if (pass == 1) {
- spa_sync_upgrades(spa, tx);
- ASSERT3U(txg, >=,
- spa->spa_uberblock.ub_rootbp.blk_birth);
+ /*
+ * Note: We need to check if the MOS is dirty because we could
+ * have marked the MOS dirty without updating the uberblock
+ * (e.g. if we have sync tasks but no dirty user data). We need
+ * to check the uberblock's rootbp because it is updated if we
+ * have synced out dirty data (though in this case the MOS will
+ * most likely also be dirty due to second order effects, we
+ * don't want to rely on that here).
+ */
+ if (pass == 1 &&
+ spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
/*
- * Note: We need to check if the MOS is dirty
- * because we could have marked the MOS dirty
- * without updating the uberblock (e.g. if we
- * have sync tasks but no dirty user data). We
- * need to check the uberblock's rootbp because
- * it is updated if we have synced out dirty
- * data (though in this case the MOS will most
- * likely also be dirty due to second order
- * effects, we don't want to rely on that here).
+ * Nothing changed on the first pass, therefore this
+ * TXG is a no-op. Avoid syncing deferred frees, so
+ * that we can keep this TXG as a no-op.
*/
- if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
- !dmu_objset_is_dirty(mos, txg)) {
- /*
- * Nothing changed on the first pass,
- * therefore this TXG is a no-op. Avoid
- * syncing deferred frees, so that we
- * can keep this TXG as a no-op.
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
- txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
- ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
- txg));
- break;
- }
- spa_sync_deferred_frees(spa, tx);
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+ break;
}
+ spa_sync_deferred_frees(spa, tx);
} while (dmu_objset_is_dirty(mos, txg));
+}
-#ifdef ZFS_DEBUG
- if (!list_is_empty(&spa->spa_config_dirty_list)) {
- /*
- * Make sure that the number of ZAPs for all the vdevs matches
- * the number of ZAPs in the per-vdev ZAP list. This only gets
- * called if the config is dirty; otherwise there may be
- * outstanding AVZ operations that weren't completed in
- * spa_sync_config_object.
- */
- uint64_t all_vdev_zap_entry_count;
- ASSERT0(zap_count(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
- ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
- all_vdev_zap_entry_count);
- }
-#endif
-
- if (spa->spa_vdev_removal != NULL) {
- ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
- }
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg = tx->tx_txg;
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are no dirty vdevs, we sync the uberblock to a few
- * random top-level vdevs that are known to be visible in the
- * config cache (see spa_vdev_add() for a complete description).
- * If there *are* dirty vdevs, sync the uberblock to all vdevs.
- */
for (;;) {
+ int error = 0;
+
/*
* We hold SCL_STATE to prevent vdev open/close/etc.
* while we're attempting to write the vdev labels.
@@ -8087,13 +7995,15 @@ spa_sync(spa_t *spa, uint64_t txg)
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
+ vdev_t *vd =
+ rvd->vdev_child[(c0 + c) % children];
/* Stop when revisiting the first vdev */
if (c > 0 && svd[0] == vd)
break;
- if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
@@ -8117,6 +8027,124 @@ spa_sync(spa_t *spa, uint64_t txg)
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
zio_resume_wait(spa);
}
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ vdev_t *vd = NULL;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
+ spa->spa_sync_starttime = gethrtime();
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ int i;
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ spa_sync_adjust_vdev_max_queue_depth(spa);
+
+ spa_sync_condense_indirect(spa, tx);
+
+ spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+#endif
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx);
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);