Diffstat (limited to 'module')
-rw-r--r--  module/zcommon/zfeature_common.c      7
-rw-r--r--  module/zcommon/zpool_prop.c           4
-rw-r--r--  module/zfs/Makefile.in                1
-rw-r--r--  module/zfs/dmu_traverse.c            13
-rw-r--r--  module/zfs/dnode.c                    5
-rw-r--r--  module/zfs/dnode_sync.c               2
-rw-r--r--  module/zfs/dsl_dataset.c             12
-rw-r--r--  module/zfs/dsl_destroy.c              4
-rw-r--r--  module/zfs/dsl_dir.c                 37
-rw-r--r--  module/zfs/dsl_pool.c               113
-rw-r--r--  module/zfs/dsl_scan.c               178
-rw-r--r--  module/zfs/dsl_synctask.c           124
-rw-r--r--  module/zfs/dsl_userhold.c             5
-rw-r--r--  module/zfs/metaslab.c               419
-rw-r--r--  module/zfs/range_tree.c               2
-rw-r--r--  module/zfs/spa.c                    451
-rw-r--r--  module/zfs/spa_checkpoint.c         638
-rw-r--r--  module/zfs/spa_misc.c                82
-rw-r--r--  module/zfs/space_map.c              149
-rw-r--r--  module/zfs/uberblock.c                3
-rw-r--r--  module/zfs/vdev.c                   187
-rw-r--r--  module/zfs/vdev_indirect.c           17
-rw-r--r--  module/zfs/vdev_label.c              79
-rw-r--r--  module/zfs/vdev_removal.c            85
-rw-r--r--  module/zfs/zcp.c                      2
-rw-r--r--  module/zfs/zcp_synctask.c             9
-rw-r--r--  module/zfs/zfs_ioctl.c               32
-rw-r--r--  module/zfs/zil.c                    110
-rw-r--r--  module/zfs/zio.c                     17
-rw-r--r--  module/zfs/zthr.c                     2
30 files changed, 2328 insertions, 461 deletions
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index ea1bccf50..b010c8843 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
@@ -210,6 +210,11 @@ zpool_feature_init(void)
hole_birth_deps);
}
+ zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
+ "com.delphix:zpool_checkpoint", "zpool_checkpoint",
+ "Pool state can be checkpointed, allowing rewind later.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
"com.delphix:extensible_dataset", "extensible_dataset",
"Enhanced dataset functionality, used by other features.",
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index bc38eca7d..dc0bb59bc 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/zio.h>
@@ -79,6 +79,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING");
+ zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 1c2187c56..d8d1e3a23 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -68,6 +68,7 @@ $(MODULE)-objs += sha256.o
$(MODULE)-objs += skein_zfs.o
$(MODULE)-objs += spa.o
$(MODULE)-objs += spa_boot.o
+$(MODULE)-objs += spa_checkpoint.o
$(MODULE)-objs += spa_config.o
$(MODULE)-objs += spa_errlog.o
$(MODULE)-objs += spa_history.o
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index a5f468ac8..f0b535618 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -81,8 +81,8 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (BP_IS_HOLE(bp))
return (0);
- if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
- return (0);
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+ return (0);
SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
@@ -121,20 +121,17 @@ static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
uint64_t claim_txg = zh->zh_claim_txg;
- zilog_t *zilog;
/*
* We only want to visit blocks that have been claimed but not yet
- * replayed; plus, in read-only mode, blocks that are already stable.
+ * replayed; plus blocks that are already stable in read-only mode.
*/
if (claim_txg == 0 && spa_writeable(td->td_spa))
return;
- zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
-
+ zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
-
zil_free(zilog);
}
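For illustration, the ZIL gating above reduces to two predicates; the helpers below are hypothetical restatements of the hunks, not code from the patch. When the log has not been claimed, blocks born at or after spa_min_claim_txg() are ignored by the traversal.

	/* Sketch of the traversal gates; names mirror the hunks above. */
	static boolean_t
	example_should_walk_zil(spa_t *spa, uint64_t claim_txg)
	{
		/* Unclaimed ZIL on a writable pool: claim/replay code owns it. */
		return (claim_txg != 0 || !spa_writeable(spa));
	}

	static boolean_t
	example_should_visit_zil_block(spa_t *spa, const blkptr_t *bp,
	    uint64_t claim_txg)
	{
		if (BP_IS_HOLE(bp))
			return (B_FALSE);
		/* Without a claim, skip blocks newer than the minimum claim txg. */
		if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(spa))
			return (B_FALSE);
		return (B_TRUE);
	}
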
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index ab687f7cc..fddad607d 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1284,6 +1284,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
(spa_is_root(os->os_spa) &&
spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
+ ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
+
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
object == DMU_PROJECTUSED_OBJECT) {
if (object == DMU_USERUSED_OBJECT)
@@ -1519,7 +1521,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
mutex_exit(&dn->dn_mtx);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
- return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
+ return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
+ ENOENT : EEXIST));
}
if (refcount_add(&dn->dn_holds, tag) == 1)
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 96e7bccc9..044031e4f 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -644,7 +644,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_maxblkid == 0 || list_head(list) != NULL ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec ||
- range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
+ !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
dnp->dn_datablkszsec =
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
dn->dn_next_blksz[txgoff] = 0;
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 9db6d1e0b..bb9b4a1c7 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -46,6 +46,7 @@
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
@@ -208,7 +209,9 @@ int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
boolean_t async)
{
- int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ int used = bp_get_dsize_sync(spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
@@ -3821,7 +3824,8 @@ dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
ddsqra.ddsqra_value = refquota;
return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
- dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+ dsl_dataset_set_refquota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static int
@@ -3936,8 +3940,8 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
ddsqra.ddsqra_value = refreservation;
return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
- dsl_dataset_set_refreservation_sync, &ddsqra,
- 0, ZFS_SPACE_CHECK_NONE));
+ dsl_dataset_set_refreservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
/*
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index b3296ceee..b450073cc 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -1036,7 +1036,7 @@ dsl_destroy_head(const char *name)
error = dsl_sync_task(name, dsl_destroy_head_check,
dsl_destroy_head_begin_sync, &ddha,
- 0, ZFS_SPACE_CHECK_NONE);
+ 0, ZFS_SPACE_CHECK_DESTROY);
if (error != 0)
return (error);
@@ -1062,7 +1062,7 @@ dsl_destroy_head(const char *name)
}
return (dsl_sync_task(name, dsl_destroy_head_check,
- dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
+ dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
}
/*
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index 36abfe024..519c94b64 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -942,14 +942,14 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
if (pds) {
- VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
+ VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
name, sizeof (uint64_t), 1, &ddobj, tx));
} else {
/* it's the root dir */
- VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
}
- VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
ddphys = dbuf->db_data;
@@ -988,6 +988,12 @@ dsl_dir_get_used(dsl_dir_t *dd)
}
uint64_t
+dsl_dir_get_compressed(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_compressed_bytes);
+}
+
+uint64_t
dsl_dir_get_quota(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_quota);
@@ -1215,7 +1221,8 @@ dsl_dir_space_available(dsl_dir_t *dd,
used += dsl_dir_space_towrite(dd);
if (dd->dd_parent == NULL) {
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL);
quota = MIN(quota, poolsize);
}
@@ -1326,11 +1333,12 @@ top_of_function:
*/
uint64_t deferred = 0;
if (dd->dd_parent == NULL) {
- spa_t *spa = dd->dd_pool->dp_spa;
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
- deferred = metaslab_class_get_deferred(spa_normal_class(spa));
- if (poolsize - deferred < quota) {
- quota = poolsize - deferred;
+ uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
+ (netfree) ?
+ ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
+
+ if (avail < quota) {
+ quota = avail;
retval = ENOSPC;
}
}
@@ -1684,7 +1692,8 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
ddsqra.ddsqra_value = quota;
return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+ dsl_dir_set_quota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
int
@@ -1727,7 +1736,8 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
avail = dsl_dir_space_available(dd->dd_parent,
NULL, 0, FALSE);
} else {
- avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+ avail = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL) - used;
}
if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
@@ -1805,7 +1815,8 @@ dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
ddsqra.ddsqra_value = reservation;
return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+ dsl_dir_set_reservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static dsl_dir_t *
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 1bb49c13a..e8f519b18 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -43,6 +43,8 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
@@ -201,6 +203,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_dir_t, dd_dirty_link));
txg_list_create(&dp->dp_sync_tasks, spa,
offsetof(dsl_sync_task_t, dst_node));
+ txg_list_create(&dp->dp_early_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
@@ -385,6 +389,7 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_dirty_datasets);
txg_list_destroy(&dp->dp_dirty_zilogs);
txg_list_destroy(&dp->dp_sync_tasks);
+ txg_list_destroy(&dp->dp_early_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
taskq_destroy(dp->dp_zil_clean_taskq);
@@ -574,6 +579,29 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
cv_signal(&dp->dp_spaceavail_cv);
}
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+ spa_t *spa = dp->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ txg_list_t *tl = &vd->vdev_ms_list;
+ metaslab_t *ms;
+
+ for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+ ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+ VERIFY(range_tree_is_empty(ms->ms_freeing));
+ VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+ }
+ }
+
+ return (B_TRUE);
+}
+#endif
+
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
@@ -590,6 +618,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
tx = dmu_tx_create_assigned(dp, txg);
/*
+ * Run all early sync tasks before writing out any dirty blocks.
+ * For more info on early sync tasks see block comment in
+ * dsl_early_sync_task().
+ */
+ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst =
+ txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ dsl_sync_task_sync(dst, tx);
+ }
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ }
+
+ /*
* Write out all dirty blocks of dirty datasets.
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -744,22 +789,66 @@ dsl_pool_sync_context(dsl_pool_t *dp)
taskq_member(dp->dp_sync_taskq, curthread));
}
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g. similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
- uint64_t space, resv;
-
- /*
- * If we're trying to assess whether it's OK to do a free,
- * cut the reservation in half to allow forward progress
- * (e.g. make it possible to rm(1) files from a full pool).
- */
- space = spa_get_dspace(dp->dp_spa);
- resv = spa_get_slop_space(dp->dp_spa);
- if (netfree)
+ spa_t *spa = dp->dp_spa;
+ uint64_t space, resv, adjustedsize;
+ uint64_t spa_deferred_frees =
+ spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+ space = spa_get_dspace(spa)
+ - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+ resv = spa_get_slop_space(spa);
+
+ switch (slop_policy) {
+ case ZFS_SPACE_CHECK_NORMAL:
+ break;
+ case ZFS_SPACE_CHECK_RESERVED:
resv >>= 1;
+ break;
+ case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+ resv >>= 2;
+ break;
+ case ZFS_SPACE_CHECK_NONE:
+ resv = 0;
+ break;
+ default:
+ panic("invalid slop policy value: %d", slop_policy);
+ break;
+ }
+ adjustedsize = (space >= resv) ? (space - resv) : 0;
- return (space - resv);
+ return (adjustedsize);
+}
+
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+ uint64_t deferred =
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+ return (quota);
}
boolean_t
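For illustration, the slop policy described in the comment above comes down to how much of the reservation is subtracted; the helper and numbers below are a sketch with made-up figures, not part of the patch.

	/* Sketch: each policy only scales the slop that gets subtracted. */
	static uint64_t
	example_adjustedsize(uint64_t space, uint64_t resv, zfs_space_check_t policy)
	{
		switch (policy) {
		case ZFS_SPACE_CHECK_RESERVED:
			resv >>= 1;		/* half the slop */
			break;
		case ZFS_SPACE_CHECK_EXTRA_RESERVED:
			resv >>= 2;		/* a quarter of the slop */
			break;
		case ZFS_SPACE_CHECK_NONE:
			resv = 0;
			break;
		default:			/* ZFS_SPACE_CHECK_NORMAL */
			break;
		}
		return (space >= resv ? space - resv : 0);
	}

	/*
	 * Worked example: with 100 GiB allocatable (already net of checkpoint
	 * and deferred-free space) and 3.2 GiB of slop:
	 *   NORMAL -> 96.8 GiB, RESERVED -> 98.4 GiB,
	 *   EXTRA_RESERVED -> 99.2 GiB, NONE -> 100 GiB.
	 * dsl_pool_unreserved_space() then also subtracts the normal class's
	 * deferred space, clamping at zero.
	 */
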
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 2c3494746..986dccdea 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -733,7 +733,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
}
return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
- dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
/* ARGSUSED */
@@ -810,13 +810,23 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
* If the scrub/resilver completed, update all DTLs to
* reflect this. Whether it succeeded or not, vacate
* all temporary scrub DTLs.
+ *
+ * As the scrub does not currently support traversing
+ * data that have been freed but are part of a checkpoint,
+ * we don't mark the scrub as done in the DTLs as faults
+ * may still exist in those vdevs.
*/
- vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
- complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
- if (complete) {
+ if (complete &&
+ !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ scn->scn_phys.scn_max_txg, B_TRUE);
+
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ } else {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ 0, B_TRUE);
}
spa_errlog_rotate(spa);
@@ -1217,7 +1227,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
* (on-disk) even if it hasn't been claimed (even though for
* scrub there's nothing to do to it).
*/
- if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1268,11 +1278,13 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
zil_scan_arg_t zsa = { dp, zh };
zilog_t *zilog;
+ ASSERT(spa_writeable(dp->dp_spa));
+
/*
* We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed).
*/
- if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ if (claim_txg == 0)
return;
zilog = zil_alloc(dp->dp_meta_objset, zh);
@@ -3004,79 +3016,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
return (B_TRUE);
}
-/*
- * This is the primary entry point for scans that is called from syncing
- * context. Scans must happen entirely during syncing context so that we
- * cna guarantee that blocks we are currently scanning will not change out
- * from under us. While a scan is active, this function controls how quickly
- * transaction groups proceed, instead of the normal handling provided by
- * txg_sync_thread().
- */
-void
-dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
{
- int err = 0;
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- state_sync_type_t sync_type = SYNC_OPTIONAL;
-
- /*
- * Check for scn_restart_txg before checking spa_load_state, so
- * that we can restart an old-style scan while the pool is being
- * imported (see dsl_scan_init).
- */
- if (dsl_scan_restarting(scn, tx)) {
- pool_scan_func_t func = POOL_SCAN_SCRUB;
- dsl_scan_done(scn, B_FALSE, tx);
- if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
- func = POOL_SCAN_RESILVER;
- zfs_dbgmsg("restarting scan func=%u txg=%llu",
- func, (longlong_t)tx->tx_txg);
- dsl_scan_setup_sync(&func, tx);
- }
-
- /*
- * Only process scans in sync pass 1.
- */
- if (spa_sync_pass(spa) > 1)
- return;
-
- /*
- * If the spa is shutting down, then stop scanning. This will
- * ensure that the scan does not dirty any new data during the
- * shutdown phase.
- */
- if (spa_shutting_down(spa))
- return;
-
- /*
- * If the scan is inactive due to a stalled async destroy, try again.
- */
- if (!scn->scn_async_stalled && !dsl_scan_active(scn))
- return;
+ int err = 0;
- /* reset scan statistics */
- scn->scn_visited_this_txg = 0;
- scn->scn_holes_this_txg = 0;
- scn->scn_lt_min_this_txg = 0;
- scn->scn_gt_max_this_txg = 0;
- scn->scn_ddt_contained_this_txg = 0;
- scn->scn_objsets_visited_this_txg = 0;
- scn->scn_avg_seg_size_this_txg = 0;
- scn->scn_segs_this_txg = 0;
- scn->scn_avg_zio_size_this_txg = 0;
- scn->scn_zios_this_txg = 0;
- scn->scn_suspending = B_FALSE;
- scn->scn_sync_start_time = gethrtime();
- spa->spa_scrub_active = B_TRUE;
+ if (spa_suspend_async_destroy(spa))
+ return (0);
- /*
- * First process the async destroys. If we suspend, don't do
- * any scrubbing or resilvering. This ensures that there are no
- * async destroys while we are scanning, so the scan code doesn't
- * have to worry about traversing it. It is also faster to free the
- * blocks than to scrub them.
- */
if (zfs_free_bpobj_enabled &&
spa_version(spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
@@ -3152,7 +3101,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
ddt_sync(spa, tx->tx_txg);
}
if (err != 0)
- return;
+ return (err);
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
zfs_free_leak_on_eio &&
(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
@@ -3205,6 +3154,85 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
dsl_pool_destroy_obsolete_bpobj(dp, tx);
}
+ return (0);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * cna guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ int err = 0;
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init).
+ */
+ if (dsl_scan_restarting(scn, tx)) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, (longlong_t)tx->tx_txg);
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
+ /* reset scan statistics */
+ scn->scn_visited_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ scn->scn_suspending = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the async destroys. If we suspend, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
+ */
+ err = dsl_process_async_destroys(dp, tx);
+ if (err != 0)
+ return;
if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
return;
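For illustration, the DTL decision explained in the dsl_scan_done() comment above, consolidated into one readable block (a restatement of that hunk, not new behavior): the scrub is only recorded as complete when no checkpoint is active, because blocks freed while a checkpoint exists are not traversed.

	/* Sketch consolidating the +/- lines of the dsl_scan_done() hunk. */
	static void
	example_scan_done_dtl(spa_t *spa, dsl_scan_t *scn, dmu_tx_t *tx,
	    boolean_t complete)
	{
		if (complete &&
		    !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
			/* Mark the scanned txg range as examined in the DTLs. */
			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
			    scn->scn_phys.scn_max_txg, B_TRUE);
		} else {
			/* Vacate temporary scrub DTLs without claiming completion. */
			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 0, B_TRUE);
		}
	}
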
diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c
index d8eb10d37..b63ce5cad 100644
--- a/module/zfs/dsl_synctask.c
+++ b/module/zfs/dsl_synctask.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@@ -39,33 +39,10 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
return (0);
}
-/*
- * Called from open context to perform a callback in syncing context. Waits
- * for the operation to complete.
- *
- * The checkfunc will be called from open context as a preliminary check
- * which can quickly fail. If it succeeds, it will be called again from
- * syncing context. The checkfunc should generally be designed to work
- * properly in either context, but if necessary it can check
- * dmu_tx_is_syncing(tx).
- *
- * The synctask infrastructure enforces proper locking strategy with respect
- * to the dp_config_rwlock -- the lock will always be held when the callbacks
- * are called. It will be held for read during the open-context (preliminary)
- * call to the checkfunc, and then held for write from syncing context during
- * the calls to the check and sync funcs.
- *
- * A dataset or pool name can be passed as the first argument. Typically,
- * the check func will hold, check the return value of the hold, and then
- * release the dataset. The sync func will VERIFYO(hold()) the dataset.
- * This is safe because no changes can be made between the check and sync funcs,
- * and the sync func will only be called if the check func successfully opened
- * the dataset.
- */
-int
-dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+static int
+dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check)
+ int blocks_modified, zfs_space_check_t space_check, boolean_t early)
{
spa_t *spa;
dmu_tx_t *tx;
@@ -102,7 +79,9 @@ top:
return (err);
}
- VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg));
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
dmu_tx_commit(tx);
@@ -117,9 +96,64 @@ top:
return (dst.dst_error);
}
-void
-dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+/*
+ * Called from open context to perform a callback in syncing context. Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail. If it succeeds, it will be called again from
+ * syncing context. The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called. It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument. Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset. The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
+int
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
+ blocks_modified, space_check, B_FALSE));
+}
+
+/*
+ * An early synctask works exactly as a standard synctask with one important
+ * difference on the way it is handled during syncing context. Standard
+ * synctasks run after we've written out all the dirty blocks of dirty
+ * datasets. Early synctasks are executed before writing out any dirty data,
+ * and thus before standard synctasks.
+ *
+ * For that reason, early synctasks can affect the process of writing dirty
+ * changes to disk for the txg that they run and should be used with caution.
+ * In addition, early synctasks should not dirty any metaslabs as this would
+ * invalidate the precondition/invariant for subsequent early synctasks.
+ * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
+ */
+int
+dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
+ blocks_modified, space_check, B_TRUE));
+}
+
+static void
+dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
+ boolean_t early)
{
dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
@@ -133,7 +167,25 @@ dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
dst->dst_error = 0;
dst->dst_nowaiter = B_TRUE;
- VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg));
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
+}
+
+void
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg,
+ blocks_modified, space_check, tx, B_FALSE);
+}
+
+void
+dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg,
+ blocks_modified, space_check, tx, B_TRUE);
}
/*
@@ -160,12 +212,12 @@ dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
* (arc_tempreserve, dsl_pool_tempreserve).
*/
if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
- uint64_t quota = dsl_pool_adjustedsize(dp,
- dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
- metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ dst->dst_space_check);
uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+
/* MOS space is triple-dittoed, so we multiply by 3. */
- if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+ if (used + dst->dst_space * 3 > quota) {
dst->dst_error = SET_ERROR(ENOSPC);
if (dst->dst_nowaiter)
kmem_free(dst, sizeof (*dst));
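For illustration, a minimal caller of the two entry points documented above; the argument struct and callbacks are hypothetical, and only the dsl_sync_task()/dsl_early_sync_task() signatures come from this file.

	typedef struct example_arg {
		const char *ea_prop;
		uint64_t ea_value;
	} example_arg_t;

	static int
	example_check(void *arg, dmu_tx_t *tx)
	{
		/* Runs in open context first, then again in syncing context. */
		example_arg_t *ea = arg;
		return (ea->ea_value == 0 ? SET_ERROR(EINVAL) : 0);
	}

	static void
	example_sync(void *arg, dmu_tx_t *tx)
	{
		/* Syncing context, dp_config_rwlock held for write. */
	}

	static int
	example_do_task(const char *pool, example_arg_t *ea, boolean_t early)
	{
		if (early) {
			/* Must not dirty metaslabs; runs before dirty data is synced. */
			return (dsl_early_sync_task(pool, example_check, example_sync,
			    ea, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
		}
		return (dsl_sync_task(pool, example_check, example_sync,
		    ea, 0, ZFS_SPACE_CHECK_NORMAL));
	}
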
diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c
index b5a684f0b..c80b35d48 100644
--- a/module/zfs/dsl_userhold.c
+++ b/module/zfs/dsl_userhold.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
@@ -604,7 +604,8 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
KM_SLEEP));
error = dsl_sync_task(pool, dsl_dataset_user_release_check,
- dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
+ dsl_dataset_user_release_sync, &ddura, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index c11e459e0..76fa99e8b 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -34,6 +34,7 @@
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
#define WITH_DF_BLOCK_ALLOCATOR
@@ -54,6 +55,14 @@ unsigned long metaslab_aliquot = 512 << 10;
unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
/*
+ * Since we can touch multiple metaslabs (and their respective space maps)
+ * with each transaction group, we benefit from having a smaller space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk.
+ */
+int zfs_metaslab_sm_blksz = (1 << 12);
+
+/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
* space map representation must be before we compact it on-disk.
@@ -211,7 +220,7 @@ uint64_t metaslab_trace_max_entries = 5000;
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
-static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
#ifdef _METASLAB_TRACING
@@ -484,11 +493,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
- range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+ range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
- msp_free_space = range_tree_space(msp->ms_tree) + allocated +
- msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+ msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
}
@@ -1021,7 +1030,7 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
- avl_tree_t *t = &msp->ms_size_tree;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
if (t == NULL || (rs = avl_last(t)) == NULL)
@@ -1101,7 +1110,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- avl_tree_t *t = &msp->ms_tree->rt_root;
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
return (metaslab_block_picker(t, cursor, size, align));
}
@@ -1134,13 +1143,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- range_tree_t *rt = msp->ms_tree;
+ range_tree_t *rt = msp->ms_allocatable;
avl_tree_t *t = &rt->rt_root;
uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+ ASSERT3U(avl_numnodes(t), ==,
+ avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
@@ -1151,7 +1161,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
- t = &msp->ms_size_tree;
+ t = &msp->ms_allocatable_by_size;
*cursor = 0;
}
@@ -1178,8 +1188,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
- range_tree_t *rt = msp->ms_tree;
- avl_tree_t *t = &msp->ms_size_tree;
+ range_tree_t *rt = msp->ms_allocatable;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
uint64_t *cursor = &msp->ms_lbas[0];
uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
@@ -1192,7 +1202,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
if ((*cursor + size) > *cursor_end) {
range_seg_t *rs;
- rs = avl_last(&msp->ms_size_tree);
+ rs = avl_last(&msp->ms_allocatable_by_size);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
return (-1ULL);
@@ -1232,7 +1242,7 @@ uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &msp->ms_tree->rt_root;
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
avl_index_t where;
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
@@ -1240,7 +1250,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
uint64_t max_size = metaslab_block_maxsize(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+ ASSERT3U(avl_numnodes(t), ==,
+ avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
@@ -1250,7 +1261,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
rs = avl_find(t, &rsearch, &where);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
- t = &msp->ms_size_tree;
+ t = &msp->ms_allocatable_by_size;
rsearch.rs_start = 0;
rsearch.rs_end = MIN(max_size,
@@ -1316,13 +1327,15 @@ metaslab_load(metaslab_t *msp)
/*
* If the space map has not been allocated yet, then treat
- * all the space in the metaslab as free and add it to the
- * ms_tree.
+ * all the space in the metaslab as free and add it to ms_allocatable.
*/
- if (msp->ms_sm != NULL)
- error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
- else
- range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+ if (msp->ms_sm != NULL) {
+ error = space_map_load(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE);
+ } else {
+ range_tree_add(msp->ms_allocatable,
+ msp->ms_start, msp->ms_size);
+ }
success = (error == 0);
@@ -1333,9 +1346,16 @@ metaslab_load(metaslab_t *msp)
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defertree[t],
- range_tree_remove, msp->ms_tree);
+ /*
+ * If the metaslab already has a spacemap, then we need to
+ * remove all segments from the defer tree; otherwise, the
+ * metaslab is completely empty and we can skip this.
+ */
+ if (msp->ms_sm != NULL) {
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
}
msp->ms_max_size = metaslab_block_maxsize(msp);
}
@@ -1347,7 +1367,7 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
- range_tree_vacate(msp->ms_tree, NULL, NULL);
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
@@ -1393,8 +1413,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
- ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
- metaslab_rangesize_compare, 0);
+ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
+ &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);
@@ -1446,20 +1466,21 @@ metaslab_fini(metaslab_t *msp)
space_map_close(msp->ms_sm);
metaslab_unload(msp);
- range_tree_destroy(msp->ms_tree);
- range_tree_destroy(msp->ms_freeingtree);
- range_tree_destroy(msp->ms_freedtree);
+ range_tree_destroy(msp->ms_allocatable);
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
for (int t = 0; t < TXG_SIZE; t++) {
- range_tree_destroy(msp->ms_alloctree[t]);
+ range_tree_destroy(msp->ms_allocating[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_destroy(msp->ms_defertree[t]);
+ range_tree_destroy(msp->ms_defer[t]);
}
-
ASSERT0(msp->ms_deferspace);
+ range_tree_destroy(msp->ms_checkpointing);
+
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@@ -1679,7 +1700,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
segments <<= 1;
- segments += msp->ms_tree->rt_histogram[i];
+ segments += msp->ms_allocatable->rt_histogram[i];
/*
* The range tree provides more precision than the space map
@@ -1895,7 +1916,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
*/
ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
size >= SPA_MINBLOCKSIZE ||
- range_tree_space(msp->ms_tree) == 0);
+ range_tree_space(msp->ms_allocatable) == 0);
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
@@ -2028,18 +2049,37 @@ metaslab_should_condense(metaslab_t *msp)
range_seg_t *rs;
uint64_t size, entries, segsz, object_size, optimal_size, record_size;
dmu_object_info_t doi;
- uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+ uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
/*
- * Use the ms_size_tree range tree, which is ordered by size, to
- * obtain the largest segment in the free tree. We always condense
- * metaslabs that are empty and metaslabs for which a condense
- * request has been made.
+ * Allocations and frees in early passes are generally more space
+ * efficient (in terms of blocks described in space map entries)
+ * than the ones in later passes (e.g. we don't compress after
+ * sync pass 5) and condensing a metaslab multiple times in a txg
+ * could degrade performance.
+ *
+ * Thus we prefer condensing each metaslab at most once every txg at
+ * the earliest sync pass possible. If a metaslab is eligible for
+ * condensing again after being considered for condensing within the
+ * same txg, it will hopefully be dirty in the next txg where it will
+ * be condensed at an earlier pass.
+ */
+ if (msp->ms_condense_checked_txg == current_txg)
+ return (B_FALSE);
+ msp->ms_condense_checked_txg = current_txg;
+
+ /*
+ * Use the ms_allocatable_by_size range tree, which is ordered by
+ * size, to obtain the largest segment in the free tree. We always
+ * condense metaslabs that are empty and metaslabs for which a
+ * condense request has been made.
*/
- rs = avl_last(&msp->ms_size_tree);
+ rs = avl_last(&msp->ms_allocatable_by_size);
if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE);
@@ -2053,7 +2093,8 @@ metaslab_should_condense(metaslab_t *msp)
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
- optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+ optimal_size =
+ sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
object_size = space_map_length(msp->ms_sm);
dmu_object_info_from_db(sm->sm_dbuf, &doi);
@@ -2076,7 +2117,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
space_map_t *sm = msp->ms_sm;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1);
ASSERT(msp->ms_loaded);
@@ -2084,7 +2124,8 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
msp->ms_group->mg_vd->vdev_spa->spa_name,
- space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+ space_map_length(msp->ms_sm),
+ avl_numnodes(&msp->ms_allocatable->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
@@ -2099,20 +2140,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
condense_tree = range_tree_create(NULL, NULL);
range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
- /*
- * Remove what's been freed in this txg from the condense_tree.
- * Since we're in sync_pass 1, we know that all the frees from
- * this txg are in the freeingtree.
- */
- range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
+ range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
+ range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defertree[t],
+ range_tree_walk(msp->ms_defer[t],
range_tree_remove, condense_tree);
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
range_tree_remove, condense_tree);
}
@@ -2122,13 +2159,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
* metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
- * for the ms_tree as all other range trees use per txg
+ * for ms_allocatable as all other range trees use per txg
* views of their content.
*/
msp->ms_condensing = B_TRUE;
mutex_exit(&msp->ms_lock);
- space_map_truncate(sm, tx);
+ space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
/*
* While we would ideally like to create a space map representation
@@ -2144,7 +2181,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
- space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
}
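For illustration, the once-per-txg condense throttle introduced in the hunks above, isolated as a predicate. The helper is hypothetical; ms_condense_checked_txg is the field this patch adds, and the metaslab space maps themselves are now allocated and truncated with the smaller zfs_metaslab_sm_blksz defined near the top of the file.

	static boolean_t
	example_condense_throttle(metaslab_t *msp, uint64_t current_txg)
	{
		/* Consider each metaslab for condensing at most once per txg. */
		if (msp->ms_condense_checked_txg == current_txg)
			return (B_FALSE);
		msp->ms_condense_checked_txg = current_txg;
		return (B_TRUE);	/* eligible; size heuristics decide the rest */
	}
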
@@ -2159,7 +2196,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa_meta_objset(spa);
- range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+ range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
dmu_tx_t *tx;
uint64_t object = space_map_object(msp->ms_sm);
@@ -2168,23 +2205,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* This metaslab has just been added so there's no work to do now.
*/
- if (msp->ms_freeingtree == NULL) {
+ if (msp->ms_freeing == NULL) {
ASSERT3P(alloctree, ==, NULL);
return;
}
ASSERT3P(alloctree, !=, NULL);
- ASSERT3P(msp->ms_freeingtree, !=, NULL);
- ASSERT3P(msp->ms_freedtree, !=, NULL);
+ ASSERT3P(msp->ms_freeing, !=, NULL);
+ ASSERT3P(msp->ms_freed, !=, NULL);
+ ASSERT3P(msp->ms_checkpointing, !=, NULL);
/*
- * Normally, we don't want to process a metaslab if there
- * are no allocations or frees to perform. However, if the metaslab
- * is being forced to condense and it's loaded, we need to let it
- * through.
+ * Normally, we don't want to process a metaslab if there are no
+ * allocations or frees to perform. However, if the metaslab is being
+ * forced to condense and it's loaded, we need to let it through.
*/
- if (range_tree_space(alloctree) == 0 &&
- range_tree_space(msp->ms_freeingtree) == 0 &&
+ if (range_tree_is_empty(alloctree) &&
+ range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing) &&
!(msp->ms_loaded && msp->ms_condense_wanted))
return;
@@ -2193,10 +2231,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_tree. No other thread can
- * be modifying this txg's alloctree, freeingtree, freedtree, or
- * space_map_phys_t. We drop ms_lock whenever we could call
- * into the DMU, because the DMU can call down to us
+ * metaslab_sync() is the metaslab's ms_allocatable. No other
+ * thread can be modifying this txg's alloc, freeing,
+ * freed, or space_map_phys_t. We drop ms_lock whenever we
+ * could call into the DMU, because the DMU can call down to us
* (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
@@ -2204,13 +2242,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* that the ms_lock is insufficient for this, because it is dropped
* by space_map_write().
*/
-
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (msp->ms_sm == NULL) {
uint64_t new_object;
- new_object = space_map_alloc(mos, tx);
+ new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
@@ -2218,6 +2255,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT(msp->ms_sm != NULL);
}
+ if (!range_tree_is_empty(msp->ms_checkpointing) &&
+ vd->vdev_checkpoint_sm == NULL) {
+ ASSERT(spa_has_checkpoint(spa));
+
+ uint64_t new_object = space_map_alloc(mos,
+ vdev_standard_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+ mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * We save the space map object as an entry in vdev_top_zap
+ * so it can be retrieved when the pool is reopened after an
+ * export or through zdb.
+ */
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (new_object), 1, &new_object, tx));
+ }
+
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
@@ -2230,16 +2289,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
- if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
- metaslab_should_condense(msp)) {
+ if (msp->ms_loaded && metaslab_should_condense(msp)) {
metaslab_condense(msp, txg, tx);
} else {
mutex_exit(&msp->ms_lock);
space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
- space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
+ space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
}
+ if (!range_tree_is_empty(msp->ms_checkpointing)) {
+ ASSERT(spa_has_checkpoint(spa));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since we are doing writes to disk and the ms_checkpointing
+ * tree won't be changing during that time, we drop the
+ * ms_lock while writing to the checkpoint space map.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_write(vd->vdev_checkpoint_sm,
+ msp->ms_checkpointing, SM_FREE, tx);
+ mutex_enter(&msp->ms_lock);
+ space_map_update(vd->vdev_checkpoint_sm);
+
+ spa->spa_checkpoint_info.sci_dspace +=
+ range_tree_space(msp->ms_checkpointing);
+ vd->vdev_stat.vs_checkpoint_space +=
+ range_tree_space(msp->ms_checkpointing);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+ -vd->vdev_checkpoint_sm->sm_alloc);
+
+ range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+ }
+
if (msp->ms_loaded) {
/*
* When the space map is loaded, we have an accurate
@@ -2248,7 +2331,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* it first before updating it.
*/
space_map_histogram_clear(msp->ms_sm);
- space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
/*
* Since we've cleared the histogram we need to add back
@@ -2257,7 +2340,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* to accurately reflect all free space even if some space
* is not yet available for allocation (i.e. deferred).
*/
- space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
+ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
/*
* Add back any deferred free space that has not been
@@ -2268,7 +2351,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
space_map_histogram_add(msp->ms_sm,
- msp->ms_defertree[t], tx);
+ msp->ms_defer[t], tx);
}
}
@@ -2279,7 +2362,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* then we will lose some accuracy but will correct it the next
* time we load the space map.
*/
- space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
+ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@@ -2287,21 +2370,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* For sync pass 1, we avoid traversing this txg's free range tree
- * and instead will just swap the pointers for freeingtree and
- * freedtree. We can safely do this since the freed_tree is
+ * and instead will just swap the pointers for freeing and
+ * freed. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
- range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
+ range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
} else {
- range_tree_vacate(msp->ms_freeingtree,
- range_tree_add, msp->ms_freedtree);
+ range_tree_vacate(msp->ms_freeing,
+ range_tree_add, msp->ms_freed);
}
range_tree_vacate(alloctree, NULL, NULL);
- ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_freeingtree));
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+ & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
mutex_exit(&msp->ms_lock);
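For illustration, the read-back counterpart to the zap_add() above: on a later import, the per-vdev checkpoint space map object can be looked up from vdev_top_zap and reopened. The helper below is a hypothetical sketch; the actual load path is not part of this excerpt.

	static int
	example_open_checkpoint_sm(vdev_t *vd)
	{
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		uint64_t object = 0;
		int err;

		err = zap_lookup(mos, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (object), 1, &object);
		if (err != 0)
			return (err);	/* e.g. ENOENT: no checkpointed space here */

		return (space_map_open(&vd->vdev_checkpoint_sm, mos, object,
		    0, vd->vdev_asize, vd->vdev_ashift));
	}
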
@@ -2336,29 +2421,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* If this metaslab is just becoming available, initialize its
* range trees and add its capacity to the vdev.
*/
- if (msp->ms_freedtree == NULL) {
+ if (msp->ms_freed == NULL) {
for (int t = 0; t < TXG_SIZE; t++) {
- ASSERT(msp->ms_alloctree[t] == NULL);
+ ASSERT(msp->ms_allocating[t] == NULL);
- msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
+ msp->ms_allocating[t] = range_tree_create(NULL, NULL);
}
- ASSERT3P(msp->ms_freeingtree, ==, NULL);
- msp->ms_freeingtree = range_tree_create(NULL, NULL);
+ ASSERT3P(msp->ms_freeing, ==, NULL);
+ msp->ms_freeing = range_tree_create(NULL, NULL);
- ASSERT3P(msp->ms_freedtree, ==, NULL);
- msp->ms_freedtree = range_tree_create(NULL, NULL);
+ ASSERT3P(msp->ms_freed, ==, NULL);
+ msp->ms_freed = range_tree_create(NULL, NULL);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ASSERT(msp->ms_defertree[t] == NULL);
+ ASSERT(msp->ms_defer[t] == NULL);
- msp->ms_defertree[t] = range_tree_create(NULL, NULL);
+ msp->ms_defer[t] = range_tree_create(NULL, NULL);
}
+ ASSERT3P(msp->ms_checkpointing, ==, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, NULL);
+
vdev_space_update(vd, 0, 0, msp->ms_size);
}
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
- defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
+ defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
metaslab_class_get_alloc(spa_normal_class(spa));
@@ -2369,7 +2459,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
if (defer_allowed) {
- defer_delta = range_tree_space(msp->ms_freedtree) -
+ defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
} else {
defer_delta -= range_tree_space(*defer_tree);
@@ -2385,19 +2475,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
/*
* Move the frees from the defer_tree back to the free
- * range tree (if it's loaded). Swap the freed_tree and the
- * defer_tree -- this is safe to do because we've just emptied out
- * the defer_tree.
+ * range tree (if it's loaded). Swap the freed_tree and
+ * the defer_tree -- this is safe to do because we've
+ * just emptied out the defer_tree.
*/
range_tree_vacate(*defer_tree,
- msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
if (defer_allowed) {
- range_tree_swap(&msp->ms_freedtree, defer_tree);
+ range_tree_swap(&msp->ms_freed, defer_tree);
} else {
- range_tree_vacate(msp->ms_freedtree,
- msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+ range_tree_vacate(msp->ms_freed,
+ msp->ms_loaded ? range_tree_add : NULL,
+ msp->ms_allocatable);
}
-
space_map_update(msp->ms_sm);
msp->ms_deferspace += defer_delta;
@@ -2426,16 +2516,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
- msp->ms_alloctree[(txg + t) & TXG_MASK]));
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
}
if (!metaslab_debug_unload)
metaslab_unload(msp);
}
- ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_freeingtree));
- ASSERT0(range_tree_space(msp->ms_freedtree));
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
mutex_exit(&msp->ms_lock);
}
@@ -2666,7 +2757,7 @@ static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
uint64_t start;
- range_tree_t *rt = msp->ms_tree;
+ range_tree_t *rt = msp->ms_allocatable;
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
@@ -2681,10 +2772,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
range_tree_remove(rt, start, size);
- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
/* Track the last successful allocation */
msp->ms_alloc_txg = txg;
@@ -3183,12 +3274,11 @@ next:
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
- uint64_t txg)
+ boolean_t checkpoint)
{
metaslab_t *msp;
- ASSERTV(spa_t *spa = vd->vdev_spa);
+ spa_t *spa = vd->vdev_spa;
- ASSERT3U(txg, ==, spa->spa_syncing_txg);
ASSERT(vdev_is_concrete(vd));
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
@@ -3202,11 +3292,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
metaslab_check_free_impl(vd, offset, asize);
+
mutex_enter(&msp->ms_lock);
- if (range_tree_space(msp->ms_freeingtree) == 0) {
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ if (range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing)) {
+ vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+ }
+
+ if (checkpoint) {
+ ASSERT(spa_has_checkpoint(spa));
+ range_tree_add(msp->ms_checkpointing, offset, asize);
+ } else {
+ range_tree_add(msp->ms_freeing, offset, asize);
}
- range_tree_add(msp->ms_freeingtree, offset, asize);
mutex_exit(&msp->ms_lock);
}
@@ -3215,23 +3313,25 @@ void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
- uint64_t *txgp = arg;
+ boolean_t *checkpoint = arg;
+
+ ASSERT3P(checkpoint, !=, NULL);
if (vd->vdev_ops->vdev_op_remap != NULL)
- vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
+ vdev_indirect_mark_obsolete(vd, offset, size);
else
- metaslab_free_impl(vd, offset, size, *txgp);
+ metaslab_free_impl(vd, offset, size, *checkpoint);
}
static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
+ boolean_t checkpoint)
{
spa_t *spa = vd->vdev_spa;
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
- if (txg > spa_freeze_txg(spa))
+ if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
return;
if (spa->spa_vdev_removal != NULL &&
@@ -3243,13 +3343,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
* an indirect vdev (in open context), and then (in syncing
* context) clear spa_vdev_removal.
*/
- free_from_removing_vdev(vd, offset, size, txg);
+ free_from_removing_vdev(vd, offset, size);
} else if (vd->vdev_ops->vdev_op_remap != NULL) {
- vdev_indirect_mark_obsolete(vd, offset, size, txg);
+ vdev_indirect_mark_obsolete(vd, offset, size);
vd->vdev_ops->vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &txg);
+ metaslab_free_impl_cb, &checkpoint);
} else {
- metaslab_free_concrete(vd, offset, size, txg);
+ metaslab_free_concrete(vd, offset, size, checkpoint);
}
}
@@ -3426,26 +3526,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
- range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
+ range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
offset, size);
VERIFY(!msp->ms_condensing);
VERIFY3U(offset, >=, msp->ms_start);
VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
- VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+ VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
msp->ms_size);
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- range_tree_add(msp->ms_tree, offset, size);
+ range_tree_add(msp->ms_allocatable, offset, size);
mutex_exit(&msp->ms_lock);
}
/*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
+ * Free the block represented by the given DVA.
*/
void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
{
uint64_t vdev = DVA_GET_VDEV(dva);
uint64_t offset = DVA_GET_OFFSET(dva);
@@ -3459,7 +3558,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
}
- metaslab_free_impl(vd, offset, size, txg);
+ metaslab_free_impl(vd, offset, size, checkpoint);
}
/*
@@ -3529,7 +3628,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
+ if (error == 0 &&
+ !range_tree_contains(msp->ms_allocatable, offset, size))
error = SET_ERROR(ENOENT);
if (error || txg == 0) { /* txg == 0 indicates dry run */
@@ -3540,13 +3640,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
VERIFY(!msp->ms_condensing);
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
- range_tree_remove(msp->ms_tree, offset, size);
+ VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+ msp->ms_size);
+ range_tree_remove(msp->ms_allocatable, offset, size);
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
}
mutex_exit(&msp->ms_lock);
@@ -3691,13 +3793,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+ /*
+ * If we have a checkpoint for the pool we need to make sure that
+ * the blocks that we free that are part of the checkpoint won't be
+ * reused until the checkpoint is discarded or we revert to it.
+ *
+ * The checkpoint flag is passed down the metaslab_free code path
+ * and is set whenever we want to add a block to the checkpoint's
+ * accounting. That is, we "checkpoint" blocks that existed at the
+ * time the checkpoint was created and are therefore referenced by
+ * the checkpointed uberblock.
+ *
+	 * Note that we don't checkpoint any blocks if the current
+ * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+ * normally as they will be referenced by the checkpointed uberblock.
+ */
+ boolean_t checkpoint = B_FALSE;
+ if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+ spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+ /*
+ * At this point, if the block is part of the checkpoint
+ * there is no way it was created in the current txg.
+ */
+ ASSERT(!now);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ checkpoint = B_TRUE;
+ }
+
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
for (int d = 0; d < ndvas; d++) {
if (now) {
metaslab_unalloc_dva(spa, &dva[d], txg);
} else {
- metaslab_free_dva(spa, &dva[d], txg);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+ metaslab_free_dva(spa, &dva[d], checkpoint);
}
}
@@ -3818,12 +3948,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
mutex_enter(&msp->ms_lock);
if (msp->ms_loaded)
- range_tree_verify(msp->ms_tree, offset, size);
+ range_tree_verify(msp->ms_allocatable, offset, size);
- range_tree_verify(msp->ms_freeingtree, offset, size);
- range_tree_verify(msp->ms_freedtree, offset, size);
+ range_tree_verify(msp->ms_freeing, offset, size);
+ range_tree_verify(msp->ms_checkpointing, offset, size);
+ range_tree_verify(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
- range_tree_verify(msp->ms_defertree[j], offset, size);
+ range_tree_verify(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}
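
To make the renamed per-metaslab trees above easier to follow, here is a minimal standalone C sketch of the routing decision the patch adds to the free path: blocks born at or before the checkpoint txg and freed after it are accounted in a "checkpointing" bucket instead of the normal "freeing" bucket. The toy_* names and the simplified byte counters are invented for this illustration and are not part of the patch.

/* Illustrative sketch only -- not ZFS code. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

typedef struct toy_metaslab {
	uint64_t ms_freeing_bytes;		/* becomes reusable after sync */
	uint64_t ms_checkpointing_bytes;	/* preserved for the checkpoint */
} toy_metaslab_t;

/* Route one freed extent; checkpoint_txg == 0 means "no checkpoint". */
static void
toy_free(toy_metaslab_t *ms, uint64_t birth_txg, uint64_t syncing_txg,
    uint64_t checkpoint_txg, uint64_t asize)
{
	bool checkpointed = (checkpoint_txg != 0 &&
	    birth_txg <= checkpoint_txg && syncing_txg > checkpoint_txg);

	if (checkpointed)
		ms->ms_checkpointing_bytes += asize;	/* stays allocated on disk */
	else
		ms->ms_freeing_bytes += asize;		/* normal free */
}

int
main(void)
{
	toy_metaslab_t ms = { 0 };

	/* Block born before the checkpoint (txg 100), freed later (txg 120). */
	toy_free(&ms, 90, 120, 100, 4096);
	/* Block born after the checkpoint is a normal free. */
	toy_free(&ms, 110, 120, 100, 8192);

	printf("freeing=%llu checkpointing=%llu\n",
	    (unsigned long long)ms.ms_freeing_bytes,
	    (unsigned long long)ms.ms_checkpointing_bytes);
	return (0);
}
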
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 448d00c1e..2181a92df 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index cdc03e66c..8ab7c3428 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -153,8 +153,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
- boolean_t reloading);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
@@ -216,6 +215,7 @@ unsigned long zfs_max_missing_tvds = 0;
* and we get a chance to retrieve the trusted config.
*/
uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+
/*
* In the case where config was assembled by scanning device paths (/dev/dsks
* by default) we are less tolerant since all the existing devices should have
@@ -224,6 +224,11 @@ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
uint64_t zfs_max_missing_tvds_scan = 0;
/*
+ * Debugging aid that pauses spa_sync() towards the end.
+ */
+boolean_t zfs_pause_spa_sync = B_FALSE;
+
+/*
* ==========================================================================
* SPA properties routines
* ==========================================================================
@@ -274,6 +279,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+ spa->spa_checkpoint_info.sci_dspace, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
metaslab_class_fragmentation(mc), src);
@@ -811,6 +818,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx)
vdev_t *rvd = spa->spa_root_vdev;
uint64_t vdev_state;
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ int error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (SET_ERROR(error));
+ }
+
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_state = rvd->vdev_state;
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1452,6 +1465,12 @@ spa_unload(spa_t *spa)
spa->spa_condense_zthr = NULL;
}
+ if (spa->spa_checkpoint_discard_zthr != NULL) {
+ ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
+ zthr_destroy(spa->spa_checkpoint_discard_zthr);
+ spa->spa_checkpoint_discard_zthr = NULL;
+ }
+
spa_condense_fini(spa);
bpobj_close(&spa->spa_deferred_bpobj);
@@ -1535,6 +1554,18 @@ spa_load_spares(spa_t *spa)
int i;
vdev_t *vd, *tvd;
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As spare vdevs are shared among open pools, we skip loading
+ * them when we load the checkpointed state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
/*
@@ -1654,6 +1685,19 @@ spa_load_l2cache(spa_t *spa)
vdev_t *vd, **oldvdevs, **newvdevs;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As L2 caches are part of the ARC which is shared among open
+ * pools, we skip loading them when we load the checkpointed
+ * state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
oldvdevs = sav->sav_vdevs;
@@ -2206,6 +2250,11 @@ spa_spawn_aux_threads(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_start_indirect_condensing_thread(spa);
+
+ ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
+ spa->spa_checkpoint_discard_zthr =
+ zthr_create(spa_checkpoint_discard_thread_check,
+ spa_checkpoint_discard_thread, spa);
}
/*
@@ -2299,7 +2348,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
spa->spa_load_state = state;
gethrestime(&spa->spa_loaded_ts);
- error = spa_load_impl(spa, type, &ereport, B_FALSE);
+ error = spa_load_impl(spa, type, &ereport);
/*
* Don't count references from objsets that are already closed
@@ -2606,8 +2655,25 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
return (SET_ERROR(EINVAL));
}
- if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state ==
- SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) {
+ /*
+ * If we are doing an import, ensure that the pool is not already
+ * imported by checking if its pool guid already exists in the
+ * spa namespace.
+ *
+	 * The only case in which we allow an already imported pool to be
+	 * imported again is when the pool is checkpointed and we want to
+	 * look at its checkpointed state with userland tools like zdb.
+ */
+#ifdef _KERNEL
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+#else
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0) &&
+ !spa_importing_readonly_checkpoint(spa)) {
+#endif
spa_load_failed(spa, "a pool with guid %llu is already open",
(u_longlong_t)pool_guid);
return (SET_ERROR(EEXIST));
@@ -2766,6 +2832,19 @@ spa_ld_validate_vdevs(spa_t *spa)
return (0);
}
+static void
+spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
+{
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+}
+
static int
spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
{
@@ -2775,6 +2854,29 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
boolean_t activity_check = B_FALSE;
/*
+ * If we are opening the checkpointed state of the pool by
+ * rewinding to it, at this point we will have written the
+ * checkpointed uberblock to the vdev labels, so searching
+ * the labels will find the right uberblock. However, if
+ * we are opening the checkpointed state read-only, we have
+ * not modified the labels. Therefore, we must ignore the
+ * labels and continue using the spa_uberblock that was set
+ * by spa_ld_checkpoint_rewind.
+ *
+ * Note that it would be fine to ignore the labels when
+ * rewinding (opening writeable) as well. However, if we
+ * crash just after writing the labels, we will end up
+ * searching the labels. Doing so in the common case means
+ * that this code path gets exercised normally, rather than
+ * just in the edge case.
+ */
+ if (ub->ub_checkpoint_txg != 0 &&
+ spa_importing_readonly_checkpoint(spa)) {
+ spa_ld_select_uberblock_done(spa, ub);
+ return (0);
+ }
+
+ /*
* Find the best uberblock.
*/
vdev_uberblock_load(rvd, ub, &label);
@@ -2905,14 +3007,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
/*
* Initialize internal SPA structures.
*/
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
- TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
- spa->spa_first_txg = spa->spa_last_ubsync_txg ?
- spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
- spa->spa_claim_max_txg = spa->spa_first_txg;
- spa->spa_prev_software_version = ub->ub_software_version;
+ spa_ld_select_uberblock_done(spa, ub);
return (0);
}
@@ -2935,7 +3030,7 @@ spa_ld_open_rootbp(spa_t *spa)
}
static int
-spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type,
+spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
boolean_t reloading)
{
vdev_t *mrvd, *rvd = spa->spa_root_vdev;
@@ -3609,7 +3704,7 @@ spa_ld_claim_log_blocks(spa_t *spa)
static void
spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
- boolean_t reloading)
+ boolean_t update_config_cache)
{
vdev_t *rvd = spa->spa_root_vdev;
int need_update = B_FALSE;
@@ -3621,7 +3716,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
* If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
- if (reloading || config_cache_txg != spa->spa_config_txg ||
+ if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
spa->spa_load_state == SPA_LOAD_IMPORT ||
spa->spa_load_state == SPA_LOAD_RECOVER ||
(spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
@@ -3657,18 +3752,38 @@ spa_ld_prepare_for_reload(spa_t *spa)
spa->spa_async_suspended = async_suspended;
}
-/*
- * Load an existing storage pool, using the config provided. This config
- * describes which vdevs are part of the pool and is later validated against
- * partial configs present in each vdev's label and an entire copy of the
- * config stored in the MOS.
- */
static int
-spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
- boolean_t reloading)
+spa_ld_read_checkpoint_txg(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT0(spa->spa_checkpoint_txg);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(checkpoint.ub_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_timestamp, !=, 0);
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ return (0);
+}
+
+static int
+spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
{
int error = 0;
- boolean_t missing_feat_write = B_FALSE;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@@ -3684,11 +3799,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
if (type != SPA_IMPORT_ASSEMBLE)
spa->spa_trust_config = B_FALSE;
- if (reloading)
- spa_load_note(spa, "RELOADING");
- else
- spa_load_note(spa, "LOADING");
-
/*
* Parse the config provided to create a vdev tree.
*/
@@ -3721,11 +3831,11 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
}
/*
- * Read vdev labels to find the best uberblock (i.e. latest, unless
- * spa_load_max_txg is set) and store it in spa_uberblock. We get the
- * list of features required to read blkptrs in the MOS from the vdev
- * label with the best uberblock and verify that our version of zfs
- * supports them all.
+ * Read all vdev labels to find the best uberblock (i.e. latest,
+ * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+ * get the list of features required to read blkptrs in the MOS from
+ * the vdev label with the best uberblock and verify that our version
+ * of zfs supports them all.
*/
error = spa_ld_select_uberblock(spa, type);
if (error != 0)
@@ -3740,23 +3850,211 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
if (error != 0)
return (error);
+ return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checkpointed "
+ "uberblock from the MOS config [error=%d]", error);
+
+ if (error == ENOENT)
+ error = ZFS_ERR_NO_CHECKPOINT;
+
+ return (error);
+ }
+
+ ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+ ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+ /*
+ * We need to update the txg and timestamp of the checkpointed
+ * uberblock to be higher than the latest one. This ensures that
+ * the checkpointed uberblock is selected if we were to close and
+ * reopen the pool right after we've written it in the vdev labels.
+ * (also see block comment in vdev_uberblock_compare)
+ */
+ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+ checkpoint.ub_timestamp = gethrestime_sec();
+
+ /*
+ * Set current uberblock to be the checkpointed uberblock.
+ */
+ spa->spa_uberblock = checkpoint;
+
+ /*
+ * If we are doing a normal rewind, then the pool is open for
+ * writing and we sync the "updated" checkpointed uberblock to
+ * disk. Once this is done, we've basically rewound the whole
+ * pool and there is no way back.
+ *
+	 * There are cases when we don't want to attempt to sync the
+ * checkpointed uberblock to disk because we are opening a
+ * pool as read-only. Specifically, verifying the checkpointed
+ * state with zdb, and importing the checkpointed state to get
+ * a "preview" of its content.
+ */
+ if (spa_writeable(spa)) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "failed to write checkpointed "
+ "uberblock to the vdev labels [error=%d]", error);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t *update_config_cache)
+{
+ int error;
+
+ /*
+ * Parse the config for pool, open and validate vdevs,
+ * select an uberblock, and use that uberblock to open
+ * the MOS.
+ */
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
/*
* Retrieve the trusted config stored in the MOS and use it to create
* a new, exact version of the vdev tree, then reopen all vdevs.
*/
- error = spa_ld_load_trusted_config(spa, type, reloading);
+ error = spa_ld_trusted_config(spa, type, B_FALSE);
if (error == EAGAIN) {
- VERIFY(!reloading);
+ if (update_config_cache != NULL)
+ *update_config_cache = B_TRUE;
+
/*
* Redo the loading process with the trusted config if it is
* too different from the untrusted config.
*/
spa_ld_prepare_for_reload(spa);
- return (spa_load_impl(spa, type, ereport, B_TRUE));
+ spa_load_note(spa, "RELOADING");
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_trusted_config(spa, type, B_TRUE);
+ if (error != 0)
+ return (error);
+
} else if (error != 0) {
return (error);
}
+ return (0);
+}
+
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ int error = 0;
+ boolean_t missing_feat_write = B_FALSE;
+ boolean_t checkpoint_rewind =
+ (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ boolean_t update_config_cache = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ spa_load_note(spa, "LOADING");
+
+ error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If we are rewinding to the checkpoint then we need to repeat
+ * everything we've done so far in this function but this time
+ * selecting the checkpointed uberblock and using that to open
+ * the MOS.
+ */
+ if (checkpoint_rewind) {
+ /*
+ * If we are rewinding to the checkpoint update config cache
+ * anyway.
+ */
+ update_config_cache = B_TRUE;
+
+ /*
+ * Extract the checkpointed uberblock from the current MOS
+ * and use this as the pool's uberblock from now on. If the
+ * pool is imported as writeable we also write the checkpoint
+ * uberblock to the labels, making the rewind permanent.
+ */
+ error = spa_ld_checkpoint_rewind(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+		 * Redo the loading process again with the
+		 * checkpointed uberblock.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "LOADING checkpointed uberblock");
+ error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Retrieve the checkpoint txg if the pool has a checkpoint.
+ */
+ error = spa_ld_read_checkpoint_txg(spa);
+ if (error != 0)
+ return (error);
+
/*
* Retrieve the mapping of indirect vdevs. Those vdevs were removed
* from the pool and their contents were re-mapped to other vdevs. Note
@@ -3860,6 +4158,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
/*
+ * In case of a checkpoint rewind, log the original txg
+ * of the checkpointed uberblock.
+ */
+ if (checkpoint_rewind) {
+ spa_history_log_internal(spa, "checkpoint rewind",
+ NULL, "rewound state to txg=%llu",
+ (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+ }
+
+ /*
* Traverse the ZIL and claim all blocks.
*/
spa_ld_claim_log_blocks(spa);
@@ -3886,7 +4194,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
* and the cachefile (by default /etc/zfs/zpool.cache).
*/
spa_ld_check_for_config_update(spa, config_cache_txg,
- reloading);
+ update_config_cache);
/*
* Check all DTLs to see if anything needs resilvering.
@@ -3970,6 +4278,15 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
if (load_error == 0)
return (0);
+ if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+ /*
+ * When attempting checkpoint-rewind on a pool with no
+ * checkpoint, we should not attempt to load uberblocks
+ * from previous txgs when spa_load fails.
+ */
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ return (load_error);
+ }
if (spa->spa_root_vdev != NULL)
config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
@@ -5564,6 +5881,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
@@ -5776,6 +6100,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ /*
+	 * Besides being called directly from userland through the
+	 * ioctl interface, spa_vdev_detach() can potentially be called
+	 * at the end of spa_vdev_resilver_done().
+	 *
+	 * In the regular case, when we have a checkpoint this shouldn't
+	 * happen, as we never empty the DTLs of a vdev during the scrub
+	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+	 * should never get here when we have a checkpoint.
+	 *
+	 * That said, even in the case where we checkpoint the pool exactly
+	 * as spa_vdev_resilver_done() calls this function, everything
+	 * should be fine as the resilver will return right away.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
if (vd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
@@ -6014,6 +6359,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
txg = spa_vdev_enter(spa);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
/* clear the log and flush everything up to now */
activate_slog = spa_passivate_log(spa);
(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
@@ -6665,6 +7017,10 @@ spa_async_suspend(spa_t *spa)
zthr_t *condense_thread = spa->spa_condense_zthr;
if (condense_thread != NULL && zthr_isrunning(condense_thread))
VERIFY0(zthr_cancel(condense_thread));
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL && zthr_isrunning(discard_thread))
+ VERIFY0(zthr_cancel(discard_thread));
}
void
@@ -6679,6 +7035,10 @@ spa_async_resume(spa_t *spa)
zthr_t *condense_thread = spa->spa_condense_zthr;
if (condense_thread != NULL && !zthr_isrunning(condense_thread))
zthr_resume(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL && !zthr_isrunning(discard_thread))
+ zthr_resume(discard_thread);
}
static boolean_t
@@ -7454,6 +7814,8 @@ spa_sync(spa_t *spa, uint64_t txg)
txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
+ txg));
break;
}
spa_sync_deferred_frees(spa, tx);
@@ -7499,16 +7861,22 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
if (list_is_empty(&spa->spa_config_dirty_list)) {
- vdev_t *svd[SPA_SYNC_MIN_VDEVS];
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
+
svd[svdcount++] = vd;
if (svdcount == SPA_SYNC_MIN_VDEVS)
break;
@@ -7572,6 +7940,9 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+ while (zfs_pause_spa_sync)
+ delay(1);
+
spa->spa_sync_pass = 0;
/*
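
As a rough, standalone illustration of why spa_ld_checkpoint_rewind() bumps the checkpointed uberblock's txg and timestamp past the latest ones before writing it to the labels, the sketch below mimics best-uberblock selection by comparing (txg, timestamp). The toy_ub_t type and toy_ub_compare() are invented for the example and do not exist in ZFS.

/* Toy model only: "best" uberblock selection by (txg, timestamp). */
#include <stdio.h>
#include <stdint.h>

typedef struct toy_ub {
	uint64_t txg;
	uint64_t timestamp;
} toy_ub_t;

static int
toy_ub_compare(const toy_ub_t *a, const toy_ub_t *b)
{
	if (a->txg != b->txg)
		return (a->txg < b->txg ? -1 : 1);
	if (a->timestamp != b->timestamp)
		return (a->timestamp < b->timestamp ? -1 : 1);
	return (0);
}

int
main(void)
{
	toy_ub_t latest = { .txg = 500, .timestamp = 1000 };
	/* Checkpointed ub originally had txg 400; the rewind bumps it to 501. */
	toy_ub_t rewound = { .txg = 501, .timestamp = 1001 };

	printf("rewound ub wins: %s\n",
	    toy_ub_compare(&rewound, &latest) > 0 ? "yes" : "no");
	return (0);
}
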
diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c
new file mode 100644
index 000000000..544658821
--- /dev/null
+++ b/module/zfs/spa_checkpoint.c
@@ -0,0 +1,638 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Storage Pool Checkpoint
+ *
+ * A storage pool checkpoint can be thought of as a pool-wide snapshot or
+ * a stable version of extreme rewind that guarantees no blocks from the
+ * checkpointed state will have been overwritten. It remembers the entire
+ * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
+ * point that it was taken and the user can rewind back to that point even if
+ * they applied destructive operations on their datasets or even enabled new
+ * zpool on-disk features. If a pool has a checkpoint that is no longer
+ * needed, the user can discard it.
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * flag is set to active when we create the checkpoint and remains active
+ * until the checkpoint is fully discarded. The entry in the MOS config
+ * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
+ * references the state of the pool when we take the checkpoint. The entry
+ * remains populated until we start discarding the checkpoint or we rewind
+ * back to it.
+ *
+ * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
+ * which persists until the checkpoint is fully discarded. The space map
+ * contains entries that have been freed in the current state of the pool
+ * but we want to keep around in case we decide to rewind to the checkpoint.
+ * [see vdev_checkpoint_sm]
+ *
+ * - Each metaslab's ms_sm space map behaves the same as without the
+ * checkpoint, with the only exception being the scenario when we free
+ * blocks that belong to the checkpoint. In this case, these blocks remain
+ * ALLOCATED in the metaslab's space map and they are added as FREE in the
+ * vdev's checkpoint space map.
+ *
+ * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
+ * the uberblock was checkpointed. For normal uberblocks this field is 0.
+ *
+ * == Overview of operations ==
+ *
+ * - To create a checkpoint, we first wait for the current TXG to be synced,
+ * so we can use the most recently synced uberblock (spa_ubsync) as the
+ * checkpointed uberblock. Then we use an early synctask to place that
+ *   uberblock in the MOS config, increment the feature flag for the
+ *   checkpoint (marking it active), and set spa_checkpoint_txg (see its use
+ *   below) to the TXG of the checkpointed uberblock. We use an early
+ *   synctask for the aforementioned operations to ensure that no blocks
+ *   were dirtied between the current TXG and the TXG of the checkpointed
+ *   uberblock (i.e. the previous txg).
+ *
+ * - When a checkpoint exists, we need to ensure that the blocks that
+ * belong to the checkpoint are freed but never reused. This means that
+ * these blocks should never end up in the ms_allocatable or the ms_freeing
+ * trees of a metaslab. Therefore, whenever there is a checkpoint the new
+ * ms_checkpointing tree is used in addition to the aforementioned ones.
+ *
+ * Whenever a block is freed and we find out that it is referenced by the
+ * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
+ *   we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ * This way, we divide the blocks that are being freed into checkpointed
+ * and not-checkpointed blocks.
+ *
+ * In order to persist these frees, we write the extents from the
+ *   ms_freeing tree to the ms_sm as usual, and the extents from the
+ * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
+ * checkpointed extents will remain allocated in the metaslab's ms_sm space
+ * map, and therefore won't be reused [see metaslab_sync()]. In addition,
+ * when we discard the checkpoint, we can find the entries that have
+ * actually been freed in vdev_checkpoint_sm.
+ * [see spa_checkpoint_discard_thread_sync()]
+ *
+ * - To discard the checkpoint we use an early synctask to delete the
+ * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
+ *   and wake up the discarding zthr thread (an open-context async thread).
+ * We use an early synctask to ensure that the operation happens before any
+ * new data end up in the checkpoint's data structures.
+ *
+ * Once the synctask is done and the discarding zthr is awake, we discard
+ *   the checkpointed data over multiple TXGs by having the zthr prefetch
+ *   entries from vdev_checkpoint_sm and then start a synctask that places
+ *   them as free blocks into their respective ms_allocatable and ms_sm
+ * structures.
+ * [see spa_checkpoint_discard_thread()]
+ *
+ * When there are no entries left in the vdev_checkpoint_sm of all
+ * top-level vdevs, a final synctask runs that decrements the feature flag.
+ *
+ * - To rewind to the checkpoint, we first use the current uberblock and
+ * open the MOS so we can access the checkpointed uberblock from the MOS
+ * config. After we retrieve the checkpointed uberblock, we use it as the
+ * current uberblock for the pool by writing it to disk with an updated
+ * TXG, opening its version of the MOS, and moving on as usual from there.
+ * [see spa_ld_checkpoint_rewind()]
+ *
+ * An important note on rewinding to the checkpoint has to do with how we
+ * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
+ * blocks that have not been claimed by the time we took the checkpoint
+ * as they should no longer be valid.
+ * [see comment in zil_claim()]
+ *
+ * == Miscellaneous information ==
+ *
+ * - In the hypothetical event that we take a checkpoint, remove a vdev,
+ * and attempt to rewind, the rewind would fail as the checkpointed
+ * uberblock would reference data in the removed device. For this reason
+ * and others of similar nature, we disallow the following operations that
+ * can change the config:
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *
+ * - As most of the checkpoint logic is implemented in the SPA and doesn't
+ * distinguish datasets when it comes to space accounting, having a
+ * checkpoint can potentially break the boundaries set by dataset
+ * reservations.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+
+/*
+ * The following parameter limits the amount of memory to be used for the
+ * prefetching of the checkpoint space map done on each vdev while
+ * discarding the checkpoint.
+ *
+ * It exists because top-level vdevs with long checkpoint
+ * space maps can potentially take up a lot of memory depending on the
+ * amount of checkpointed data that has been freed within them while
+ * the pool had a checkpoint.
+ */
+unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
+
+int
+spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ bzero(pcs, sizeof (pool_checkpoint_stat_t));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
+ ASSERT(error == 0 || error == ENOENT);
+
+ if (error == ENOENT)
+ pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
+ else
+ pcs->pcs_state = CS_CHECKPOINT_EXISTS;
+
+ pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
+ pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
+
+ return (0);
+}
+
+static void
+spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+
+ spa->spa_checkpoint_info.sci_timestamp = 0;
+
+ spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "finished discarding checkpointed state from the pool");
+}
+
+typedef struct spa_checkpoint_discard_sync_callback_arg {
+ vdev_t *sdc_vd;
+ uint64_t sdc_txg;
+ uint64_t sdc_entry_limit;
+} spa_checkpoint_discard_sync_callback_arg_t;
+
+static int
+spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
+ vdev_t *vd = sdc->sdc_vd;
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ uint64_t end = offset + size;
+
+ if (sdc->sdc_entry_limit == 0)
+ return (EINTR);
+
+ /*
+ * Since the space map is not condensed, we know that
+	 * none of its entries crosses the boundaries of
+ * its respective metaslab.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(type, ==, SM_FREE);
+ VERIFY3U(offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * At this point we should not be processing any
+ * other frees concurrently, so the lock is technically
+ * unnecessary. We use the lock anyway though to
+ * potentially save ourselves from future headaches.
+ */
+ mutex_enter(&ms->ms_lock);
+ if (range_tree_is_empty(ms->ms_freeing))
+ vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
+ range_tree_add(ms->ms_freeing, offset, size);
+ mutex_exit(&ms->ms_lock);
+
+ ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
+
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
+ vd->vdev_stat.vs_checkpoint_space -= size;
+ sdc->sdc_entry_limit--;
+
+ return (0);
+}
+
+#ifdef ZFS_DEBUG
+static void
+spa_checkpoint_accounting_verify(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t ckpoint_sm_space_sum = 0;
+ uint64_t vs_ckpoint_space_sum = 0;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ckpoint_sm_space_sum +=
+ -vd->vdev_checkpoint_sm->sm_alloc;
+ vs_ckpoint_space_sum +=
+ vd->vdev_stat.vs_checkpoint_space;
+ ASSERT3U(ckpoint_sm_space_sum, ==,
+ vs_ckpoint_space_sum);
+ } else {
+ ASSERT0(vd->vdev_stat.vs_checkpoint_space);
+ }
+ }
+ ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
+}
+#endif
+
+static void
+spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd = arg;
+ int error;
+
+ /*
+ * The space map callback is applied only to non-debug entries.
+	 * Because the number of debug entries is less than or equal to the
+ * number of non-debug entries, we want to ensure that we only
+ * read what we prefetched from open-context.
+ *
+ * Thus, we set the maximum entries that the space map callback
+ * will be applied to be half the entries that could fit in the
+ * imposed memory limit.
+ */
+ uint64_t max_entry_limit =
+ (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
+
+ uint64_t entries_in_sm =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+ /*
+ * Iterate from the end of the space map towards the beginning,
+ * placing its entries on ms_freeing and removing them from the
+ * space map. The iteration stops if one of the following
+ * conditions is true:
+ *
+ * 1] We reached the beginning of the space map. At this point
+ * the space map should be completely empty and
+ * space_map_incremental_destroy should have returned 0.
+ * The next step would be to free and close the space map
+ * and remove its entry from its vdev's top zap. This allows
+ * spa_checkpoint_discard_thread() to move on to the next vdev.
+ *
+ * 2] We reached the memory limit (amount of memory used to hold
+ * space map entries in memory) and space_map_incremental_destroy
+ * returned EINTR. This means that there are entries remaining
+ * in the space map that will be cleared in a future invocation
+ * of this function by spa_checkpoint_discard_thread().
+ */
+ spa_checkpoint_discard_sync_callback_arg_t sdc;
+ sdc.sdc_vd = vd;
+ sdc.sdc_txg = tx->tx_txg;
+ sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
+
+ uint64_t entries_before = entries_in_sm;
+
+ error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
+ spa_checkpoint_discard_sync_callback, &sdc, tx);
+
+ uint64_t entries_after =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+#ifdef ZFS_DEBUG
+ spa_checkpoint_accounting_verify(vd->vdev_spa);
+#endif
+
+ zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
+ "deleted %llu entries - %llu entries are left",
+ tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
+ entries_after);
+
+ if (error != EINTR) {
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while incrementally destroying the checkpoint "
+ "space map of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+ ASSERT0(entries_after);
+ ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
+ ASSERT0(vd->vdev_checkpoint_sm->sm_length);
+
+ space_map_free(vd->vdev_checkpoint_sm, tx);
+ space_map_close(vd->vdev_checkpoint_sm);
+ vd->vdev_checkpoint_sm = NULL;
+
+ VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
+ }
+}
+
+static boolean_t
+spa_checkpoint_discard_is_done(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(!spa_has_checkpoint(spa));
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
+ return (B_FALSE);
+ ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
+ }
+
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+boolean_t
+spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (B_FALSE);
+
+ if (spa_has_checkpoint(spa))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+int
+spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ while (vd->vdev_checkpoint_sm != NULL) {
+ space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
+ int numbufs;
+ dmu_buf_t **dbp;
+
+ if (zthr_iscancelled(zthr))
+ return (0);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+
+ uint64_t size = MIN(space_map_length(checkpoint_sm),
+ zfs_spa_discard_memory_limit);
+ uint64_t offset =
+ space_map_length(checkpoint_sm) - size;
+
+ /*
+ * Ensure that the part of the space map that will
+			 * be destroyed by the synctask is prefetched in
+ * memory before the synctask runs.
+ */
+ int error = dmu_buf_hold_array_by_bonus(
+ checkpoint_sm->sm_dbuf, offset, size,
+ B_TRUE, FTAG, &numbufs, &dbp);
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while prefetching checkpoint space map "
+ "entries of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_thread_sync, vd,
+ 0, ZFS_SPACE_CHECK_NONE));
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ }
+
+ VERIFY(spa_checkpoint_discard_is_done(spa));
+ VERIFY0(spa->spa_checkpoint_info.sci_dspace);
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_complete_sync, spa,
+ 0, ZFS_SPACE_CHECK_NONE));
+
+ return (0);
+}
+
+
+/* ARGSUSED */
+static int
+spa_checkpoint_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_top_vdevs_spacemap_addressable(spa))
+ return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
+
+ if (spa->spa_vdev_removal != NULL)
+ return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+
+ if (spa->spa_checkpoint_txg != 0)
+ return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ uberblock_t checkpoint = spa->spa_ubsync;
+
+ /*
+ * At this point, there should not be a checkpoint in the MOS.
+ */
+ ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
+
+ ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
+ ASSERT0(spa->spa_checkpoint_info.sci_dspace);
+
+ /*
+ * Since the checkpointed uberblock is the one that just got synced
+	 * (we use spa_ubsync), its txg must be equal to the currently
+	 * syncing txg, minus 1.
+ */
+ ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
+
+ /*
+ * Once the checkpoint is in place, we need to ensure that none of
+	 * its blocks will be marked for reuse after they have been freed.
+ * When there is a checkpoint and a block is freed, we compare its
+ * birth txg to the txg of the checkpointed uberblock to see if the
+ * block is part of the checkpoint or not. Therefore, we have to set
+ * spa_checkpoint_txg before any frees happen in this txg (which is
+ * why this is done as an early_synctask as explained in the comment
+ * in spa_checkpoint()).
+ */
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
+ sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
+ &checkpoint, tx));
+
+ /*
+ * Increment the feature refcount and thus activate the feature.
+ * Note that the feature will be deactivated when we've
+ * completely discarded all checkpointed state (both vdev
+ * space maps and uberblock).
+ */
+ spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+ spa_history_log_internal(spa, "spa checkpoint", tx,
+ "checkpointed uberblock txg=%llu", checkpoint.ub_txg);
+}
+
+/*
+ * Create a checkpoint for the pool.
+ */
+int
+spa_checkpoint(const char *pool)
+{
+ int error;
+ spa_t *spa;
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+
+ /*
+ * Wait for current syncing txg to finish so the latest synced
+ * uberblock (spa_ubsync) has all the changes that we expect
+ * to see if we were to revert later to the checkpoint. In other
+ * words we want the checkpointed uberblock to include/reference
+ * all the changes that were pending at the time that we issued
+ * the checkpoint command.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+	 * As the checkpointed uberblock references blocks from the previous
+	 * txg (spa_ubsync), we want to ensure that we are not freeing any of
+ * these blocks in the same txg that the following synctask will
+ * run. Thus, we run it as an early synctask, so the dirty changes
+ * that are synced to disk afterwards during zios and other synctasks
+ * do not reuse checkpointed blocks.
+ */
+ error = dsl_early_sync_task(pool, spa_checkpoint_check,
+ spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
+
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ if (spa->spa_checkpoint_txg == 0)
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ VERIFY0(zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, tx));
+
+ spa->spa_checkpoint_txg = 0;
+
+ zthr_wakeup(spa->spa_checkpoint_discard_zthr);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "started discarding checkpointed state from the pool");
+}
+
+/*
+ * Discard the checkpoint from a pool.
+ */
+int
+spa_checkpoint_discard(const char *pool)
+{
+ /*
+ * Similarly to spa_checkpoint(), we want our synctask to run
+ * before any pending dirty data are written to disk so they
+ * won't end up in the checkpoint's data structures (e.g.
+ * ms_checkpointing and vdev_checkpoint_sm) and re-create any
+ * space maps that the discarding open-context thread has
+ * deleted.
+	 * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
+ */
+ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
+ spa_checkpoint_discard_sync, NULL, 0,
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(spa_checkpoint_get_stats);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
+
+/* BEGIN CSTYLED */
+module_param(zfs_spa_discard_memory_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_spa_discard_memory_limit,
+ "Maximum memory for prefetching checkpoint space "
+ "map per top-level vdev while discarding checkpoint");
+/* END CSTYLED */
+#endif
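
To make the entry-limit arithmetic in spa_checkpoint_discard_thread_sync() concrete, the standalone snippet below reproduces the calculation with the default zfs_spa_discard_memory_limit of 16 MiB. It illustrates only the math; it is not code from the module.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Default from this patch: 16 MiB of prefetched space map data. */
	uint64_t limit = 16ULL * 1024 * 1024;
	/* Each on-disk space map entry is a single 64-bit word. */
	uint64_t entries_prefetched = limit / sizeof (uint64_t);
	/*
	 * The callback only sees non-debug entries, so the per-synctask
	 * entry limit is half of what fits in the memory limit.
	 */
	uint64_t entry_limit = entries_prefetched >> 1;

	printf("prefetched entries: %llu, per-pass limit: %llu\n",
	    (unsigned long long)entries_prefetched,
	    (unsigned long long)entry_limit);
	return (0);
}
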
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 6c7e2f55c..9410fab07 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -357,12 +357,15 @@ int spa_asize_inflation = 24;
* These are the operations that call dsl_pool_adjustedsize() with the netfree
* argument set to TRUE.
*
+ * Operations that are almost guaranteed to free up space in the absence of
+ * a pool checkpoint can use up to three quarters of the slop space
+ * (e.g. zfs destroy).
+ *
* A very restricted set of operations are always permitted, regardless of
* the amount of free space. These are the operations that call
- * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
- * operations result in a net increase in the amount of space used,
- * it is possible to run the pool completely out of space, causing it to
- * be permanently read-only.
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
+ * increase in the amount of space used, it is possible to run the pool
+ * completely out of space, causing it to be permanently read-only.
*
* Note that on very small pools, the slop space will be larger than
* 3.2%, in an effort to have it be at least spa_min_slop (128MB),
@@ -1718,6 +1721,12 @@ spa_get_dspace(spa_t *spa)
return (spa->spa_dspace);
}
+uint64_t
+spa_get_checkpoint_space(spa_t *spa)
+{
+ return (spa->spa_checkpoint_info.sci_dspace);
+}
+
void
spa_update_dspace(spa_t *spa)
{
@@ -2065,7 +2074,8 @@ spa_writeable(spa_t *spa)
boolean_t
spa_has_pending_synctask(spa_t *spa)
{
- return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
+ return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
+ !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
}
int
@@ -2293,6 +2303,63 @@ spa_state_to_name(spa_t *spa)
return ("UNKNOWN");
}
+boolean_t
+spa_top_vdevs_spacemap_addressable(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+boolean_t
+spa_has_checkpoint(spa_t *spa)
+{
+ return (spa->spa_checkpoint_txg != 0);
+}
+
+boolean_t
+spa_importing_readonly_checkpoint(spa_t *spa)
+{
+ return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
+ spa->spa_mode == FREAD);
+}
+
+uint64_t
+spa_min_claim_txg(spa_t *spa)
+{
+ uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
+
+ if (checkpoint_txg != 0)
+ return (checkpoint_txg + 1);
+
+ return (spa->spa_first_txg);
+}
+
+/*
+ * If there is a checkpoint, async destroys may consume more space from
+ * the pool instead of freeing it. In an attempt to save the pool from
+ * getting suspended when it is about to run out of space, we stop
+ * processing async destroys.
+ */
+boolean_t
+spa_suspend_async_destroy(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ uint64_t unreserved = dsl_pool_unreserved_space(dp,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+ uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
+
+ if (spa_has_checkpoint(spa) && avail == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
#if defined(_KERNEL)
#include <linux/mod_compat.h>
@@ -2446,6 +2513,11 @@ EXPORT_SYMBOL(spa_trust_config);
EXPORT_SYMBOL(spa_missing_tvds_allowed);
EXPORT_SYMBOL(spa_set_missing_tvds);
EXPORT_SYMBOL(spa_state_to_name);
+EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
+EXPORT_SYMBOL(spa_min_claim_txg);
+EXPORT_SYMBOL(spa_suspend_async_destroy);
+EXPORT_SYMBOL(spa_has_checkpoint);
+EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
/* BEGIN CSTYLED */
module_param(zfs_flags, uint, 0644);
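
The avail calculation in spa_suspend_async_destroy() above clamps the difference between unreserved and used space at zero. The short standalone sketch below shows that clamp in isolation; toy_avail() is an invented name used only for this illustration.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the clamp used above: avail = (unreserved > used) ? diff : 0. */
static uint64_t
toy_avail(uint64_t unreserved, uint64_t used)
{
	return (unreserved > used ? unreserved - used : 0);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)toy_avail(100, 40));	/* 60 */
	printf("%llu\n", (unsigned long long)toy_avail(100, 140));	/* 0 */
	return (0);
}
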
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index d84dd7583..0e5a4b976 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -38,12 +38,13 @@
#include <sys/zfeature.h>
/*
+ * Note on space map block size:
+ *
* The data for a given space map can be kept on blocks of any size.
* Larger blocks entail fewer i/o operations, but they also cause the
* DMU to keep more data in-core, and also to waste more i/o bandwidth
* when only a few blocks have changed since the last transaction group.
*/
-int space_map_blksz = (1 << 12);
/*
* Iterate through the space map, invoking the callback on each (non-debug)
@@ -105,6 +106,137 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
return (error);
}
+/*
+ * Note: This function performs destructive actions - specifically
+ * it deletes entries from the end of the space map. Thus, callers
+ * should ensure that they are holding the appropriate locks for
+ * the space map that they provide.
+ */
+int
+space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx)
+{
+ uint64_t bufsize, len;
+ uint64_t *entry_map;
+ int error = 0;
+
+ len = space_map_length(sm);
+ bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+ entry_map = zio_buf_alloc(bufsize);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * Since we can't move the starting offset of the space map
+ * (e.g. there are on-disk references pointing to it), we destroy
+ * its entries incrementally starting from the end.
+ *
+ * The logic that follows is basically the same as the one used
+ * in space_map_iterate() but it traverses the space map
+ * backwards:
+ *
+ * 1] We figure out the size of the buffer that we want to use
+ * to read the on-disk space map entries.
+ * 2] We figure out the offset at the end of the space map where
+ * we will start reading entries into our buffer.
+ * 3] We read the on-disk entries into the buffer.
+ * 4] We iterate over the entries from end to beginning, calling
+ * the callback function on each one. As we move from entry
+ * to entry we decrease the size of the space map, effectively
+ * deleting each entry.
+ * 5] If there are no more entries in the space map or the
+ * callback returns a value other than 0, we stop iterating
+ * over the space map. If there are entries remaining and
+ * the callback returned zero we go back to step [1].
+ */
+ uint64_t offset = 0, size = 0;
+ while (len > 0 && error == 0) {
+ size = MIN(bufsize, len);
+
+ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+ VERIFY3U(size, >, 0);
+ ASSERT3U(sm->sm_blksz, !=, 0);
+
+ offset = len - size;
+
+ IMPLY(bufsize > len, offset == 0);
+ IMPLY(bufsize == len, offset == 0);
+ IMPLY(bufsize < len, offset > 0);
+
+ EQUIV(size == len, offset == 0);
+ IMPLY(size < len, bufsize < len);
+
+ dprintf("object=%llu offset=%llx size=%llx\n",
+ space_map_object(sm), offset, size);
+
+ error = dmu_read(sm->sm_os, space_map_object(sm),
+ offset, size, entry_map, DMU_READ_PREFETCH);
+ if (error != 0)
+ break;
+
+ uint64_t num_entries = size / sizeof (uint64_t);
+
+ ASSERT3U(num_entries, >, 0);
+
+ while (num_entries > 0) {
+ uint64_t e, entry_offset, entry_size;
+ maptype_t type;
+
+ e = entry_map[num_entries - 1];
+
+ ASSERT3U(num_entries, >, 0);
+ ASSERT0(error);
+
+ if (SM_DEBUG_DECODE(e)) {
+ sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+ space_map_update(sm);
+ len -= sizeof (uint64_t);
+ num_entries--;
+ continue;
+ }
+
+ type = SM_TYPE_DECODE(e);
+ entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
+ sm->sm_start;
+ entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
+ VERIFY3U(entry_offset, >=, sm->sm_start);
+ VERIFY3U(entry_offset + entry_size, <=,
+ sm->sm_start + sm->sm_size);
+
+ error = callback(type, entry_offset, entry_size, arg);
+ if (error != 0)
+ break;
+
+ if (type == SM_ALLOC)
+ sm->sm_phys->smp_alloc -= entry_size;
+ else
+ sm->sm_phys->smp_alloc += entry_size;
+
+ sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+ space_map_update(sm);
+ len -= sizeof (uint64_t);
+ num_entries--;
+ }
+ IMPLY(error == 0, num_entries == 0);
+ EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
+ }
+
+ if (len == 0) {
+ ASSERT0(error);
+ ASSERT0(offset);
+ ASSERT0(sm->sm_length);
+ ASSERT0(sm->sm_phys->smp_objsize);
+ ASSERT0(sm->sm_alloc);
+ }
+
+ zio_buf_free(entry_map, bufsize);
+ return (error);
+}
+
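A minimal sketch of a callback that could be handed to space_map_incremental_destroy() follows; it relies only on the sm_cb_t shape visible in the call above (type, offset, size, arg), and routing the entries into a range tree is a hypothetical choice made for illustration:

/* Hypothetical sm_cb_t: collect each destroyed entry into a range tree. */
static int
example_incremental_destroy_cb(maptype_t type, uint64_t offset, uint64_t size,
    void *arg)
{
	range_tree_t *rt = arg;

	/* Entries arrive from the end of the space map towards the start. */
	if (type == SM_FREE)
		range_tree_add(rt, offset, size);

	return (0);	/* returning non-zero stops the traversal */
}

Such a callback would be invoked as space_map_incremental_destroy(sm, example_incremental_destroy_cb, rt, tx).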
typedef struct space_map_load_arg {
space_map_t *smla_sm;
range_tree_t *smla_rt;
@@ -279,7 +411,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
*/
sm->sm_phys->smp_object = sm->sm_object;
- if (range_tree_space(rt) == 0) {
+ if (range_tree_is_empty(rt)) {
VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
return;
}
@@ -418,7 +550,7 @@ space_map_close(space_map_t *sm)
}
void
-space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
+space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
{
objset_t *os = sm->sm_os;
spa_t *spa = dmu_objset_spa(os);
@@ -440,7 +572,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
*/
if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
- doi.doi_data_block_size != space_map_blksz) {
+ doi.doi_data_block_size != blocksize) {
zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
"object[%llu]: old bonus %u, old blocksz %u",
dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
@@ -449,7 +581,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
space_map_free(sm, tx);
dmu_buf_rele(sm->sm_dbuf, sm);
- sm->sm_object = space_map_alloc(sm->sm_os, tx);
+ sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
VERIFY0(space_map_open_impl(sm));
} else {
VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
@@ -482,7 +614,7 @@ space_map_update(space_map_t *sm)
}
uint64_t
-space_map_alloc(objset_t *os, dmu_tx_t *tx)
+space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
spa_t *spa = dmu_objset_spa(os);
uint64_t object;
@@ -496,8 +628,7 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx)
bonuslen = SPACE_MAP_SIZE_V0;
}
- object = dmu_object_alloc(os,
- DMU_OT_SPACE_MAP, space_map_blksz,
+ object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize,
DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
return (object);
diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c
index c1e85bdce..3b8526076 100644
--- a/module/zfs/uberblock.c
+++ b/module/zfs/uberblock.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -60,6 +60,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
ub->ub_mmp_magic = MMP_MAGIC;
ub->ub_mmp_delay = spa_multihost(rvd->vdev_spa) ? mmp_delay : 0;
ub->ub_mmp_seq = 0;
+ ub->ub_checkpoint_txg = 0;
return (ub->ub_rootbp.blk_birth == txg);
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 37bb5a0c5..cf1bf2837 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -52,11 +52,22 @@
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
+/* maximum number of metaslabs per top-level vdev */
+int vdev_max_ms_count = 200;
+
+/* minimum number of metaslabs per top-level vdev */
+int vdev_min_ms_count = 16;
+
+/* see comment in vdev_metaslab_set_size() */
+int vdev_default_ms_shift = 29;
+
+int vdev_validate_skip = B_FALSE;
+
/*
- * When a vdev is added, it will be divided into approximately (but no
- * more than) this number of metaslabs.
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
*/
-int metaslabs_per_vdev = 200;
+int vdev_dtl_sm_blksz = (1 << 12);
/*
* Rate limit delay events to this many IO delays per second.
@@ -74,7 +85,12 @@ unsigned int zfs_checksums_per_second = 20;
*/
int zfs_scan_ignore_errors = 0;
-int vdev_validate_skip = B_FALSE;
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int vdev_standard_sm_blksz = (1 << 17);
/*PRINTFLIKE2*/
void
@@ -926,6 +942,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
if (tvd->vdev_mg != NULL)
tvd->vdev_mg->mg_vd = tvd;
+ tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
+ svd->vdev_checkpoint_sm = NULL;
+
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
@@ -1169,6 +1188,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
void
vdev_metaslab_fini(vdev_t *vd)
{
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ASSERT(spa_feature_is_active(vd->vdev_spa,
+ SPA_FEATURE_POOL_CHECKPOINT));
+ space_map_close(vd->vdev_checkpoint_sm);
+ /*
+ * Even though we close the space map, we need to set its
+ * pointer to NULL. The reason is that vdev_metaslab_fini()
+ * may be called multiple times for certain operations
+ * (i.e. when destroying a pool) so we need to ensure that
+ * this clause never executes twice. This logic is similar
+ * to the one used for the vdev_ms clause below.
+ */
+ vd->vdev_checkpoint_sm = NULL;
+ }
+
if (vd->vdev_ms != NULL) {
uint64_t count = vd->vdev_ms_count;
@@ -2095,11 +2129,39 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
void
vdev_metaslab_set_size(vdev_t *vd)
{
+ uint64_t asize = vd->vdev_asize;
+ uint64_t ms_shift = 0;
+
/*
- * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
+ * For vdevs that are bigger than 8G the metaslab size is fixed at
+ * 2^vdev_default_ms_shift, so the number of metaslabs grows linearly
+ * with vdev_asize, starting from 16 metaslabs.
+ * So for a vdev_asize of 8G we get 16 metaslabs, for 16G we get 32,
+ * and so on, until we hit the maximum metaslab count limit
+ * [vdev_max_ms_count], from which point on the metaslab count stays
+ * the same.
*/
- vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
- vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+ ms_shift = vdev_default_ms_shift;
+
+ if ((asize >> ms_shift) < vdev_min_ms_count) {
+ /*
+ * For devices that are less than 8G we want to have
+ * exactly 16 metaslabs. We don't want fewer, as integer
+ * division rounds down, so fewer metaslabs mean more
+ * wasted space. We don't want more, as these vdevs are
+ * small and in the likely event that we are running
+ * out of space, the SPA will have a hard time finding
+ * space due to fragmentation.
+ */
+ ms_shift = highbit64(asize / vdev_min_ms_count);
+ ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
+
+ } else if ((asize >> ms_shift) > vdev_max_ms_count) {
+ ms_shift = highbit64(asize / vdev_max_ms_count);
+ }
+
+ vd->vdev_ms_shift = ms_shift;
+ ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}
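To make the sizing rule above concrete, here is a small illustrative calculation (the asize values are assumed, not taken from a real pool). With vdev_default_ms_shift = 29 each metaslab is 512M, so the count grows linearly with asize until it would exceed vdev_max_ms_count:

/*
 * Illustrative arithmetic mirroring vdev_metaslab_set_size():
 *
 *   asize =   8G:   8G >> 29 ==   16  -> ms_shift stays 29 (16 metaslabs)
 *   asize = 100G: 100G >> 29 ==  200  -> ms_shift stays 29 (200 metaslabs)
 *   asize =   1T:   1T >> 29 == 2048  -> ms_shift = highbit64(1T / 200) = 33,
 *                                        so 1T >> 33 == 128 metaslabs
 */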
void
@@ -2204,7 +2266,7 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
return (B_FALSE);
mutex_enter(&vd->vdev_dtl_lock);
- if (range_tree_space(rt) != 0)
+ if (!range_tree_is_empty(rt))
dirty = range_tree_contains(rt, txg, size);
mutex_exit(&vd->vdev_dtl_lock);
@@ -2218,7 +2280,7 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
boolean_t empty;
mutex_enter(&vd->vdev_dtl_lock);
- empty = (range_tree_space(rt) == 0);
+ empty = range_tree_is_empty(rt);
mutex_exit(&vd->vdev_dtl_lock);
return (empty);
@@ -2292,7 +2354,7 @@ vdev_dtl_should_excise(vdev_t *vd)
return (B_FALSE);
if (vd->vdev_resilver_txg == 0 ||
- range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
/*
@@ -2396,8 +2458,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
* the top level so that we persist the change.
*/
if (vd->vdev_resilver_txg != 0 &&
- range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
- range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
vd->vdev_resilver_txg = 0;
vdev_config_dirty(vd->vdev_top);
}
@@ -2557,7 +2619,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_dtl_sm == NULL) {
uint64_t new_object;
- new_object = space_map_alloc(mos, tx);
+ new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
@@ -2571,7 +2633,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
range_tree_walk(rt, range_tree_add, rtsync);
mutex_exit(&vd->vdev_dtl_lock);
- space_map_truncate(vd->vdev_dtl_sm, tx);
+ space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
range_tree_vacate(rtsync, NULL, NULL);
@@ -2642,7 +2704,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
- if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
vdev_writeable(vd)) {
thismin = vdev_dtl_min(vd);
@@ -2670,6 +2732,28 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
return (needed);
}
+/*
+ * Gets the checkpoint space map object from the vdev's ZAP.
+ * Returns the space map object, or 0 if it wasn't in the ZAP
+ * or the ZAP doesn't exist yet.
+ */
+int
+vdev_checkpoint_sm_object(vdev_t *vd)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+ if (vd->vdev_top_zap == 0) {
+ return (0);
+ }
+
+ uint64_t sm_obj = 0;
+ int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj);
+
+ VERIFY(err == 0 || err == ENOENT);
+
+ return (sm_obj);
+}
+
int
vdev_load(vdev_t *vd)
{
@@ -2705,6 +2789,35 @@ vdev_load(vdev_t *vd)
VDEV_AUX_CORRUPT_DATA);
return (error);
}
+
+ uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
+ if (checkpoint_sm_obj != 0) {
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+ if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+ vd->vdev_ashift))) {
+ vdev_dbgmsg(vd, "vdev_load: space_map_open "
+ "failed for checkpoint spacemap (obj %llu) "
+ "[error=%d]",
+ (u_longlong_t)checkpoint_sm_obj, error);
+ return (error);
+ }
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+ space_map_update(vd->vdev_checkpoint_sm);
+
+ /*
+ * Since the checkpoint_sm contains free entries
+ * exclusively, we can use sm_alloc to indicate the
+ * cumulative checkpointed space that has been freed.
+ */
+ vd->vdev_stat.vs_checkpoint_space =
+ -vd->vdev_checkpoint_sm->sm_alloc;
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+ vd->vdev_stat.vs_checkpoint_space;
+ }
}
/*
@@ -2722,7 +2835,7 @@ vdev_load(vdev_t *vd)
if (obsolete_sm_object != 0) {
objset_t *mos = vd->vdev_spa->spa_meta_objset;
ASSERT(vd->vdev_asize != 0);
- ASSERT(vd->vdev_obsolete_sm == NULL);
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
obsolete_sm_object, 0, vd->vdev_asize, 0))) {
@@ -2848,6 +2961,12 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg)
mutex_exit(&msp->ms_lock);
}
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ASSERT(spa_has_checkpoint(spa));
+ space_map_close(vd->vdev_checkpoint_sm);
+ vd->vdev_checkpoint_sm = NULL;
+ }
+
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
@@ -3181,6 +3300,17 @@ top:
error = spa_reset_logs(spa);
+ /*
+ * If the log device was successfully reset but has
+ * checkpointed data, do not offline it.
+ */
+ if (error == 0 &&
+ tvd->vdev_checkpoint_sm != NULL) {
+ ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
+ !=, 0);
+ error = ZFS_ERR_CHECKPOINT_EXISTS;
+ }
+
spa_vdev_state_enter(spa, SCL_ALLOC);
/*
@@ -3419,6 +3549,23 @@ vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
}
+boolean_t
+vdev_is_spacemap_addressable(vdev_t *vd)
+{
+ /*
+ * Assuming 47 bits of the space map entry dedicated to the entry's
+ * offset (see description in space_map.h), we calculate the maximum
+ * address that can be described by a space map entry for the given
+ * device.
+ */
+ uint64_t shift = vd->vdev_ashift + 47;
+
+ if (shift >= 63) /* detect potential overflow */
+ return (B_TRUE);
+
+ return (vd->vdev_asize < (1ULL << shift));
+}
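A hedged sketch of the arithmetic behind this check; the helper below is illustrative and not part of the patch. A single space map entry can describe offsets up to 1 << (ashift + 47), so a top-level vdev stays addressable as long as its asize is below that bound:

/* Illustrative only: the largest device offset one space map entry encodes. */
static uint64_t
example_max_spacemap_offset(uint64_t ashift)
{
	uint64_t shift = ashift + 47;

	/* e.g. ashift 9 -> 2^56 bytes (64 PiB); ashift 12 -> 2^59 (512 PiB) */
	return (shift >= 63 ? UINT64_MAX : (1ULL << shift));
}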
+
/*
* Get statistics for the given vdev.
*/
@@ -4243,11 +4390,15 @@ EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
/* BEGIN CSTYLED */
-module_param(metaslabs_per_vdev, int, 0644);
-MODULE_PARM_DESC(metaslabs_per_vdev,
+module_param(vdev_max_ms_count, int, 0644);
+MODULE_PARM_DESC(vdev_max_ms_count,
"Divide added vdev into approximately (but no more than) this number "
"of metaslabs");
+module_param(vdev_min_ms_count, int, 0644);
+MODULE_PARM_DESC(vdev_min_ms_count,
+ "Minimum number of metaslabs per top-level vdev");
+
module_param(zfs_delays_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
"IO delays per second");
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index a93e41258..b14b153b2 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -298,14 +298,13 @@ static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
};
/*
- * Mark the given offset and size as being obsolete in the given txg.
+ * Mark the given offset and size as being obsolete.
*/
void
-vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
spa_t *spa = vd->vdev_spa;
- ASSERT3U(spa_syncing_txg(spa), ==, txg);
+
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
ASSERT(size > 0);
@@ -316,7 +315,7 @@ vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&vd->vdev_obsolete_lock);
range_tree_add(vd->vdev_obsolete_segments, offset, size);
mutex_exit(&vd->vdev_obsolete_lock);
- vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
}
}
@@ -334,7 +333,7 @@ spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
/* The DMU can only remap indirect vdevs. */
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
- vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
+ vdev_indirect_mark_obsolete(vd, offset, size);
}
static spa_condensing_indirect_t *
@@ -727,7 +726,8 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
return (0);
VERIFY0(dsl_sync_task(spa_name(spa), NULL,
- spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
+ spa_condense_indirect_complete_sync, sci, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
return (0);
}
@@ -804,7 +804,8 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
if (vdev_obsolete_sm_object(vd) == 0) {
uint64_t obsolete_sm_object =
- space_map_alloc(spa->spa_meta_objset, tx);
+ space_map_alloc(spa->spa_meta_objset,
+ vdev_standard_sm_blksz, tx);
ASSERT(vd->vdev_top_zap != 0);
VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 7ea8da1e6..29d7d651b 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/*
@@ -352,6 +352,37 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
kmem_free(vsx, sizeof (*vsx));
}
+static void
+root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != spa->spa_root_vdev)
+ return;
+
+ /* provide either current or previous scan information */
+ pool_scan_stat_t ps;
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+ }
+
+ pool_removal_stat_t prs;
+ if (spa_removal_get_stats(spa, &prs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
+ sizeof (prs) / sizeof (uint64_t));
+ }
+
+ pool_checkpoint_stat_t pcs;
+ if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
+ sizeof (pcs) / sizeof (uint64_t));
+ }
+}
+
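On the consumer side, the packed stats can be pulled back out of the vdev config nvlist with the standard libnvpair lookup. The snippet below is a hedged sketch; the nvroot variable and the surrounding caller are assumptions made for illustration:

/* Illustrative consumer of the checkpoint stats packed above. */
uint64_t *pcs_array;
uint_t c;

if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS,
    &pcs_array, &c) == 0) {
	/* The uint64 array is a packed pool_checkpoint_stat_t. */
	pool_checkpoint_stat_t *pcs = (pool_checkpoint_stat_t *)pcs_array;
	/* ... report the checkpoint's state and space from *pcs ... */
}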
/*
* Generate the nvlist representing this vdev's config.
*/
@@ -474,20 +505,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (getstats) {
vdev_config_generate_stats(vd, nv);
- /* provide either current or previous scan information */
- pool_scan_stat_t ps;
- if (spa_scan_get_stats(spa, &ps) == 0) {
- fnvlist_add_uint64_array(nv,
- ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
- sizeof (pool_scan_stat_t) / sizeof (uint64_t));
- }
-
- pool_removal_stat_t prs;
- if (spa_removal_get_stats(spa, &prs) == 0) {
- fnvlist_add_uint64_array(nv,
- ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
- sizeof (prs) / sizeof (uint64_t));
- }
+ root_vdev_actions_getprogress(vd, nv);
/*
* Note: this can be called from open context
@@ -1525,11 +1543,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
- vdev_t *vd;
- zio_t *zio;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ ASSERT(svdcount != 0);
retry:
/*
* Normally, we don't want to try too hard to write every label and
@@ -1571,9 +1588,10 @@ retry:
* written in this txg will be committed to stable storage
* before any uberblock that references them.
*/
- zio = zio_root(spa, NULL, NULL, flags);
+ zio_t *zio = zio_root(spa, NULL, NULL, flags);
- for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+ for (vdev_t *vd =
+ txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
zio_flush(zio, vd);
@@ -1588,8 +1606,14 @@ retry:
* the new labels to disk to ensure that all even-label updates
* are committed to stable storage before the uberblock update.
*/
- if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
+ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the even labels "
+ "of dirty vdevs", error, spa_name(spa));
+ }
goto retry;
+ }
/*
* Sync the uberblocks to all vdevs in svd[].
@@ -1606,8 +1630,13 @@ retry:
* been successfully committed) will be valid with respect
* to the new uberblocks.
*/
- if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
+ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
+ "%d for pool '%s'", error, spa_name(spa));
+ }
goto retry;
+ }
if (spa_multihost(spa))
mmp_update_uberblock(spa, ub);
@@ -1622,8 +1651,14 @@ retry:
* to disk to ensure that all odd-label updates are committed to
* stable storage before the next transaction group begins.
*/
- if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the odd labels of "
+ "dirty vdevs", error, spa_name(spa));
+ }
goto retry;
+ }
return (0);
}
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index f9084e8cf..f2bdd6389 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -117,6 +117,12 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
*/
int vdev_removal_max_span = 32 * 1024;
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+unsigned long zfs_remove_max_bytes_pause = -1UL;
+
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
@@ -286,11 +292,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
* be copied.
*/
spa->spa_removing_phys.sr_to_copy -=
- range_tree_space(ms->ms_freeingtree);
+ range_tree_space(ms->ms_freeing);
- ASSERT0(range_tree_space(ms->ms_freedtree));
+ ASSERT0(range_tree_space(ms->ms_freed));
for (int t = 0; t < TXG_SIZE; t++)
- ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+ ASSERT0(range_tree_space(ms->ms_allocating[t]));
}
/*
@@ -467,19 +473,18 @@ spa_restart_removal(spa_t *spa)
* and we correctly free already-copied data.
*/
void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
{
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t txg = spa_syncing_txg(spa);
uint64_t max_offset_yet = 0;
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
- ASSERT3U(spa_syncing_txg(spa), ==, txg);
mutex_enter(&svr->svr_lock);
@@ -494,8 +499,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
* held, so that the remove_thread can not load this metaslab and then
* visit this offset between the time that we metaslab_free_concrete()
* and when we check to see if it has been visited.
+ *
+ * Note: The checkpoint flag is set to false as having/taking
+ * a checkpoint and removing a device can't happen at the same
+ * time.
*/
- metaslab_free_concrete(vd, offset, size, txg);
+ ASSERT(!spa_has_checkpoint(spa));
+ metaslab_free_concrete(vd, offset, size, B_FALSE);
uint64_t synced_size = 0;
uint64_t synced_offset = 0;
@@ -627,16 +637,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
* of this free.
*/
if (synced_size > 0) {
- vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
- txg);
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
/*
* Note: this can only be called from syncing context,
* and the vdev_indirect_mapping is only changed from the
* sync thread, so we don't need svr_lock while doing
* metaslab_free_impl_cb.
*/
+ boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
- metaslab_free_impl_cb, &txg);
+ metaslab_free_impl_cb, &checkpoint);
}
}
@@ -684,10 +695,10 @@ static void
free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
vdev_t *vd = arg;
- vdev_indirect_mark_obsolete(vd, offset, size,
- vd->vdev_spa->spa_syncing_txg);
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+ metaslab_free_impl_cb, &checkpoint);
}
/*
@@ -1363,7 +1374,7 @@ spa_vdev_remove_thread(void *arg)
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
}
/*
@@ -1393,7 +1404,7 @@ spa_vdev_remove_thread(void *arg)
SM_ALLOC));
space_map_close(sm);
- range_tree_walk(msp->ms_freeingtree,
+ range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
@@ -1412,7 +1423,7 @@ spa_vdev_remove_thread(void *arg)
msp->ms_id);
while (!svr->svr_thread_exit &&
- range_tree_space(svr->svr_allocd_segs) != 0) {
+ !range_tree_is_empty(svr->svr_allocd_segs)) {
mutex_exit(&svr->svr_lock);
@@ -1427,6 +1438,19 @@ spa_vdev_remove_thread(void *arg)
*/
spa_config_exit(spa, SCL_CONFIG, FTAG);
+ /*
+ * This delay will pause the removal around the point
+ * specified by zfs_remove_max_bytes_pause. We do this
+ * solely for the test suite or during debugging.
+ */
+ uint64_t bytes_copied =
+ spa->spa_removing_phys.sr_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ bytes_copied += svr->svr_bytes_done[i];
+ while (zfs_remove_max_bytes_pause <= bytes_copied &&
+ !svr->svr_thread_exit)
+ delay(hz);
+
mutex_enter(&vca.vca_lock);
while (vca.vca_outstanding_bytes >
zfs_remove_max_copy_bytes) {
@@ -1567,10 +1591,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
for (int i = 0; i < TXG_DEFER_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_defertree[i]));
- ASSERT0(range_tree_space(msp->ms_freedtree));
+ ASSERT0(range_tree_space(msp->ms_defer[i]));
+ ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
/*
@@ -1586,7 +1610,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
- range_tree_walk(msp->ms_freeingtree,
+ range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
@@ -1662,7 +1686,8 @@ spa_vdev_remove_cancel(spa_t *spa)
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
- spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+ spa_vdev_remove_cancel_sync, NULL, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
if (error == 0) {
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@@ -1999,6 +2024,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
if (!locked)
txg = spa_vdev_enter(spa);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
+ }
+
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (spa->spa_spares.sav_vdevs != NULL &&
@@ -2111,6 +2147,13 @@ module_param(vdev_removal_max_span, int, 0644);
MODULE_PARM_DESC(vdev_removal_max_span,
"Largest span of free chunks a remap segment can span");
+/* BEGIN CSTYLED */
+module_param(zfs_remove_max_bytes_pause, ulong, 0644);
+MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+ "Pause device removal after this many bytes are copied "
+ "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
EXPORT_SYMBOL(free_from_removing_vdev);
EXPORT_SYMBOL(spa_removal_get_stats);
EXPORT_SYMBOL(spa_remove_init);
diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c
index dad09da50..475138621 100644
--- a/module/zfs/zcp.c
+++ b/module/zfs/zcp.c
@@ -1142,7 +1142,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
if (sync) {
err = dsl_sync_task(poolname, NULL,
- zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE);
+ zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
if (err != 0)
zcp_pool_error(&evalargs, poolname);
} else {
diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c
index 196a3d4b7..e089666f2 100644
--- a/module/zfs/zcp_synctask.c
+++ b/module/zfs/zcp_synctask.c
@@ -110,7 +110,7 @@ static zcp_synctask_info_t zcp_synctask_destroy_info = {
{.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
{NULL, 0}
},
- .space_check = ZFS_SPACE_CHECK_NONE,
+ .space_check = ZFS_SPACE_CHECK_DESTROY,
.blocks_modified = 0
};
@@ -303,10 +303,9 @@ zcp_synctask_wrapper(lua_State *state)
zcp_parse_args(state, info->name, info->pargs, info->kwargs);
err = 0;
- if (info->space_check != ZFS_SPACE_CHECK_NONE && funcspace > 0) {
- uint64_t quota = dsl_pool_adjustedsize(dp,
- info->space_check == ZFS_SPACE_CHECK_RESERVED) -
- metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ if (info->space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ info->space_check);
uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
ri->zri_space_used;
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index f95b77db7..e70207aa5 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3731,6 +3731,29 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
}
/*
+ * innvl: unused
+ * outnvl: empty
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (spa_checkpoint(poolname));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ return (spa_checkpoint_discard(poolname));
+}
+
+/*
* inputs:
* zc_name name of dataset to destroy
* zc_objset_type type of objset
@@ -6422,6 +6445,15 @@ zfs_ioctl_init(void)
POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
B_TRUE);
+ zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
+ zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+ zfs_ioctl_register("zpool_discard_checkpoint",
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
+ zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e8adc6d99..d0b1c1d14 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -29,6 +29,7 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@@ -430,6 +431,35 @@ done:
return (error);
}
+/* ARGSUSED */
+static int
+zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /*
+ * As we call this function from the context of a rewind to a
+ * checkpoint, each ZIL block whose txg is later than the txg
+ * that we rewind to is invalid. Thus, we return -1 so
+ * zil_parse() doesn't attempt to read it.
+ */
+ if (bp->blk_birth >= first_txg)
+ return (-1);
+
+ if (zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ zio_free(zilog->zl_spa, first_txg, bp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ return (0);
+}
+
static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
@@ -476,7 +506,7 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
- zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
}
@@ -662,7 +692,7 @@ zil_create(zilog_t *zilog)
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
- zio_free_zil(zilog->zl_spa, txg, &blk);
+ zio_free(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
@@ -767,8 +797,8 @@ int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
- uint64_t first_txg = dmu_tx_get_txg(tx);
zilog_t *zilog;
+ uint64_t first_txg;
zil_header_t *zh;
objset_t *os;
int error;
@@ -790,10 +820,43 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
+ ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
+ first_txg = spa_min_claim_txg(zilog->zl_spa);
- if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
- if (!BP_IS_HOLE(&zh->zh_log))
- zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
+ /*
+ * If the spa_log_state is not set to be cleared, check whether
+ * the current uberblock is a checkpoint one and if the current
+ * header has been claimed before moving on.
+ *
+ * If the current uberblock is a checkpointed uberblock then
+ * one of the following scenarios took place:
+ *
+ * 1] We are currently rewinding to the checkpoint of the pool.
+ * 2] We crashed in the middle of a checkpoint rewind but we
+ * did manage to write the checkpointed uberblock to the
+ * vdev labels, so when we tried to import the pool again
+ * the checkpointed uberblock was selected by the import
+ * procedure.
+ *
+ * In both cases we want to zero out all the ZIL blocks, except
+ * the ones that have been claimed at the time of the checkpoint
+ * (their zh_claim_txg != 0). The reason is that these blocks
+ * may be corrupted since we may have reused their locations on
+ * disk after we took the checkpoint.
+ *
+ * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
+ * when we first figure out whether the current uberblock is
+ * checkpointed or not. Unfortunately, that would discard all
+ * the logs, including the ones that are claimed, and we would
+ * leak space.
+ */
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
+ (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)) {
+ if (!BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_clear_log_block,
+ zil_noop_log_record, tx, first_txg, B_FALSE);
+ }
BP_ZERO(&zh->zh_log);
if (os->os_encrypted)
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
@@ -803,6 +866,12 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
}
/*
+ * If we are not rewinding and opening the pool normally, then
+ * the min_claim_txg should be equal to the first txg of the pool.
+ */
+ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
+
+ /*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
* read only part of the log now (e.g. due to a missing device),
@@ -855,16 +924,17 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
zilog = dmu_objset_zil(os);
bp = (blkptr_t *)&zilog->zl_header->zh_log;
- /*
- * Check the first block and determine if it's on a log device
- * which may have been removed or faulted prior to loading this
- * pool. If so, there's no point in checking the rest of the log
- * as its content should have already been synced to the pool.
- */
if (!BP_IS_HOLE(bp)) {
vdev_t *vd;
boolean_t valid = B_TRUE;
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the
+ * log as its content should have already been synced to the
+ * pool.
+ */
spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
if (vd->vdev_islog && vdev_is_dead(vd))
@@ -873,6 +943,18 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
if (!valid)
return (0);
+
+ /*
+ * Check whether the current uberblock is checkpointed (e.g.
+ * we are rewinding) and whether the current header has been
+ * claimed or not. If it hasn't then skip verifying it. We
+ * do this because its ZIL blocks may be part of the pool's
+ * state before the rewind, which is no longer valid.
+ */
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return (0);
}
/*
@@ -883,8 +965,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* which will update spa_max_claim_txg. See spa_load() for details.
*/
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
- zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa),
- B_FALSE);
+ zilog->zl_header->zh_claim_txg ? -1ULL :
+ spa_min_claim_txg(os->os_spa), B_FALSE);
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 8a495988b..9a98d4fc0 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1147,8 +1147,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
* starts allocating blocks -- so that nothing is allocated twice.
* If txg == 0 we just verify that the block is claimable.
*/
- ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
- ASSERT(txg == spa_first_txg(spa) || txg == 0);
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+ spa_min_claim_txg(spa));
+ ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
@@ -3458,18 +3459,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
}
/*
- * Free an intent log block.
- */
-void
-zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
-{
- ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
- ASSERT(!BP_IS_GANG(bp));
-
- zio_free(spa, txg, bp);
-}
-
-/*
* ==========================================================================
* Read and write to physical devices
* ==========================================================================
diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c
index dc0f6d983..1c4a8e02c 100644
--- a/module/zfs/zthr.c
+++ b/module/zfs/zthr.c
@@ -235,8 +235,6 @@ zthr_destroy(zthr_t *t)
void
zthr_wakeup(zthr_t *t)
{
- ASSERT3P(t->zthr_thread, !=, NULL);
-
mutex_enter(&t->zthr_lock);
cv_broadcast(&t->zthr_cv);
mutex_exit(&t->zthr_lock);