Diffstat (limited to 'module')
30 files changed, 2328 insertions, 461 deletions
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index ea1bccf50..b010c8843 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. @@ -210,6 +210,11 @@ zpool_feature_init(void) hole_birth_deps); } + zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, + "com.delphix:zpool_checkpoint", "zpool_checkpoint", + "Pool state can be checkpointed, allowing rewind later.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index bc38eca7d..dc0bb59bc 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -79,6 +79,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "<size>", "FREE"); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "FREEING"); + zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT"); zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "LEAKED"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 1c2187c56..d8d1e3a23 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -68,6 +68,7 @@ $(MODULE)-objs += sha256.o $(MODULE)-objs += skein_zfs.o $(MODULE)-objs += spa.o $(MODULE)-objs += spa_boot.o +$(MODULE)-objs += spa_checkpoint.o $(MODULE)-objs += spa_config.o $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index a5f468ac8..f0b535618 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -81,8 +81,8 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) - return (0); + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); @@ -121,20 +121,17 @@ static void traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; - zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet - * replayed; plus, in read-only mode, blocks that are already stable. + * replayed; plus blocks that are already stable in read-only mode. 
*/ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; - zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); - + zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT)); - zil_free(zilog); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ab687f7cc..fddad607d 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1284,6 +1284,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); + ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT || object == DMU_PROJECTUSED_OBJECT) { if (object == DMU_USERUSED_OBJECT) @@ -1519,7 +1521,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); - return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST)); + return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? + ENOENT : EEXIST)); } if (refcount_add(&dn->dn_holds, tag) == 1) diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 96e7bccc9..044031e4f 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -644,7 +644,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec || - range_tree_space(dn->dn_free_ranges[txgoff]) != 0); + !range_tree_is_empty(dn->dn_free_ranges[txgoff])); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 9db6d1e0b..bb9b4a1c7 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -46,6 +46,7 @@ #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev.h> #include <sys/zfs_znode.h> #include <sys/zfs_onexit.h> @@ -208,7 +209,9 @@ int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + int used = bp_get_dsize_sync(spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); @@ -3821,7 +3824,8 @@ dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, ddsqra.ddsqra_value = refquota; return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, - dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); + dsl_dataset_set_refquota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int @@ -3936,8 +3940,8 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, ddsqra.ddsqra_value = refreservation; return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, - dsl_dataset_set_refreservation_sync, &ddsqra, - 0, ZFS_SPACE_CHECK_NONE)); + dsl_dataset_set_refreservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index b3296ceee..b450073cc 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1036,7 +1036,7 @@ dsl_destroy_head(const char *name) error = dsl_sync_task(name, dsl_destroy_head_check, dsl_destroy_head_begin_sync, &ddha, - 0, ZFS_SPACE_CHECK_NONE); + 0, ZFS_SPACE_CHECK_DESTROY); if 
(error != 0) return (error); @@ -1062,7 +1062,7 @@ dsl_destroy_head(const char *name) } return (dsl_sync_task(name, dsl_destroy_head_check, - dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE)); + dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY)); } /* diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 36abfe024..519c94b64 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -942,14 +942,14 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { - VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, + VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ - VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } - VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; @@ -988,6 +988,12 @@ dsl_dir_get_used(dsl_dir_t *dd) } uint64_t +dsl_dir_get_compressed(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_compressed_bytes); +} + +uint64_t dsl_dir_get_quota(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_quota); @@ -1215,7 +1221,8 @@ dsl_dir_space_available(dsl_dir_t *dd, used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL); quota = MIN(quota, poolsize); } @@ -1326,11 +1333,12 @@ top_of_function: */ uint64_t deferred = 0; if (dd->dd_parent == NULL) { - spa_t *spa = dd->dd_pool->dp_spa; - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); - deferred = metaslab_class_get_deferred(spa_normal_class(spa)); - if (poolsize - deferred < quota) { - quota = poolsize - deferred; + uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, + (netfree) ? 
+ ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); + + if (avail < quota) { + quota = avail; retval = ENOSPC; } } @@ -1684,7 +1692,8 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); + dsl_dir_set_quota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int @@ -1727,7 +1736,8 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { - avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; + avail = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { @@ -1805,7 +1815,8 @@ dsl_dir_set_reservation(const char *ddname, zprop_source_t source, ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); + dsl_dir_set_reservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static dsl_dir_t * diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 1bb49c13a..e8f519b18 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -43,6 +43,8 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> @@ -201,6 +203,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); + txg_list_create(&dp->dp_early_sync_tasks, spa, + offsetof(dsl_sync_task_t, dst_node)); dp->dp_sync_taskq = taskq_create("dp_sync_taskq", zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, @@ -385,6 +389,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); + txg_list_destroy(&dp->dp_early_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); @@ -574,6 +579,29 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +#ifdef ZFS_DEBUG +static boolean_t +dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) +{ + spa_t *spa = dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + txg_list_t *tl = &vd->vdev_ms_list; + metaslab_t *ms; + + for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; + ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { + VERIFY(range_tree_is_empty(ms->ms_freeing)); + VERIFY(range_tree_is_empty(ms->ms_checkpointing)); + } + } + + return (B_TRUE); +} +#endif + void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { @@ -590,6 +618,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); /* + * Run all early sync tasks before writing out any dirty blocks. + * For more info on early sync tasks see block comment in + * dsl_early_sync_task(). 
+ */ + if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { + dsl_sync_task_t *dst; + + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = + txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { + ASSERT(dsl_early_sync_task_verify(dp, txg)); + dsl_sync_task_sync(dst, tx); + } + ASSERT(dsl_early_sync_task_verify(dp, txg)); + } + + /* * Write out all dirty blocks of dirty datasets. */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); @@ -744,22 +789,66 @@ dsl_pool_sync_context(dsl_pool_t *dp) taskq_member(dp->dp_sync_taskq, curthread)); } +/* + * This function returns the amount of allocatable space in the pool + * minus whatever space is currently reserved by ZFS for specific + * purposes. Specifically: + * + * 1] Any reserved SLOP space + * 2] Any space used by the checkpoint + * 3] Any space used for deferred frees + * + * The latter 2 are especially important because they are needed to + * rectify the SPA's and DMU's different understanding of how much space + * is used. Now the DMU is aware of that extra space tracked by the SPA + * without having to maintain a separate special dir (e.g similar to + * $MOS, $FREEING, and $LEAKED). + * + * Note: By deferred frees here, we mean the frees that were deferred + * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the + * segments placed in ms_defer trees during metaslab_sync_done(). + */ uint64_t -dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) +dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) { - uint64_t space, resv; - - /* - * If we're trying to assess whether it's OK to do a free, - * cut the reservation in half to allow forward progress - * (e.g. make it possible to rm(1) files from a full pool). - */ - space = spa_get_dspace(dp->dp_spa); - resv = spa_get_slop_space(dp->dp_spa); - if (netfree) + spa_t *spa = dp->dp_spa; + uint64_t space, resv, adjustedsize; + uint64_t spa_deferred_frees = + spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; + + space = spa_get_dspace(spa) + - spa_get_checkpoint_space(spa) - spa_deferred_frees; + resv = spa_get_slop_space(spa); + + switch (slop_policy) { + case ZFS_SPACE_CHECK_NORMAL: + break; + case ZFS_SPACE_CHECK_RESERVED: resv >>= 1; + break; + case ZFS_SPACE_CHECK_EXTRA_RESERVED: + resv >>= 2; + break; + case ZFS_SPACE_CHECK_NONE: + resv = 0; + break; + default: + panic("invalid slop policy value: %d", slop_policy); + break; + } + adjustedsize = (space >= resv) ? (space - resv) : 0; - return (space - resv); + return (adjustedsize); +} + +uint64_t +dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) +{ + uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); + uint64_t deferred = + metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0; + return (quota); } boolean_t diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 2c3494746..986dccdea 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -733,7 +733,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) } return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* ARGSUSED */ @@ -810,13 +810,23 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. 
+ * + * As the scrub does not currently support traversing + * data that have been freed but are part of a checkpoint, + * we don't mark the scrub as done in the DTLs as faults + * may still exist in those vdevs. */ - vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); - if (complete) { + if (complete && + !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + scn->scn_phys.scn_max_txg, B_TRUE); + spa_event_notify(spa, NULL, NULL, scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } else { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + 0, B_TRUE); } spa_errlog_rotate(spa); @@ -1217,7 +1227,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], @@ -1268,11 +1278,13 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; + ASSERT(spa_writeable(dp->dp_spa)); + /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ - if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + if (claim_txg == 0) return; zilog = zil_alloc(dp->dp_meta_objset, zh); @@ -3004,79 +3016,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } -/* - * This is the primary entry point for scans that is called from syncing - * context. Scans must happen entirely during syncing context so that we - * cna guarantee that blocks we are currently scanning will not change out - * from under us. While a scan is active, this function controls how quickly - * transaction groups proceed, instead of the normal handling provided by - * txg_sync_thread(). - */ -void -dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +static int +dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { - int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - state_sync_type_t sync_type = SYNC_OPTIONAL; - - /* - * Check for scn_restart_txg before checking spa_load_state, so - * that we can restart an old-style scan while the pool is being - * imported (see dsl_scan_init). - */ - if (dsl_scan_restarting(scn, tx)) { - pool_scan_func_t func = POOL_SCAN_SCRUB; - dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; - zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, (longlong_t)tx->tx_txg); - dsl_scan_setup_sync(&func, tx); - } - - /* - * Only process scans in sync pass 1. - */ - if (spa_sync_pass(spa) > 1) - return; - - /* - * If the spa is shutting down, then stop scanning. This will - * ensure that the scan does not dirty any new data during the - * shutdown phase. - */ - if (spa_shutting_down(spa)) - return; - - /* - * If the scan is inactive due to a stalled async destroy, try again. 
- */ - if (!scn->scn_async_stalled && !dsl_scan_active(scn)) - return; + int err = 0; - /* reset scan statistics */ - scn->scn_visited_this_txg = 0; - scn->scn_holes_this_txg = 0; - scn->scn_lt_min_this_txg = 0; - scn->scn_gt_max_this_txg = 0; - scn->scn_ddt_contained_this_txg = 0; - scn->scn_objsets_visited_this_txg = 0; - scn->scn_avg_seg_size_this_txg = 0; - scn->scn_segs_this_txg = 0; - scn->scn_avg_zio_size_this_txg = 0; - scn->scn_zios_this_txg = 0; - scn->scn_suspending = B_FALSE; - scn->scn_sync_start_time = gethrtime(); - spa->spa_scrub_active = B_TRUE; + if (spa_suspend_async_destroy(spa)) + return (0); - /* - * First process the async destroys. If we suspend, don't do - * any scrubbing or resilvering. This ensures that there are no - * async destroys while we are scanning, so the scan code doesn't - * have to worry about traversing it. It is also faster to free the - * blocks than to scrub them. - */ if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; @@ -3152,7 +3101,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ddt_sync(spa, tx->tx_txg); } if (err != 0) - return; + return (err); if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || @@ -3205,6 +3154,85 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) dsl_pool_destroy_obsolete_bpobj(dp, tx); } + return (0); +} + +/* + * This is the primary entry point for scans that is called from syncing + * context. Scans must happen entirely during syncing context so that we + * cna guarantee that blocks we are currently scanning will not change out + * from under us. While a scan is active, this function controls how quickly + * transaction groups proceed, instead of the normal handling provided by + * txg_sync_thread(). + */ +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + int err = 0; + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + state_sync_type_t sync_type = SYNC_OPTIONAL; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). + */ + if (dsl_scan_restarting(scn, tx)) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, (longlong_t)tx->tx_txg); + dsl_scan_setup_sync(&func, tx); + } + + /* + * Only process scans in sync pass 1. + */ + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + /* + * If the scan is inactive due to a stalled async destroy, try again. + */ + if (!scn->scn_async_stalled && !dsl_scan_active(scn)) + return; + + /* reset scan statistics */ + scn->scn_visited_this_txg = 0; + scn->scn_holes_this_txg = 0; + scn->scn_lt_min_this_txg = 0; + scn->scn_gt_max_this_txg = 0; + scn->scn_ddt_contained_this_txg = 0; + scn->scn_objsets_visited_this_txg = 0; + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_zios_this_txg = 0; + scn->scn_suspending = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + /* + * First process the async destroys. 
If we suspend, don't do + * any scrubbing or resilvering. This ensures that there are no + * async destroys while we are scanning, so the scan code doesn't + * have to worry about traversing it. It is also faster to free the + * blocks than to scrub them. + */ + err = dsl_process_async_destroys(dp, tx); + if (err != 0) + return; if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index d8eb10d37..b63ce5cad 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -39,33 +39,10 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx) return (0); } -/* - * Called from open context to perform a callback in syncing context. Waits - * for the operation to complete. - * - * The checkfunc will be called from open context as a preliminary check - * which can quickly fail. If it succeeds, it will be called again from - * syncing context. The checkfunc should generally be designed to work - * properly in either context, but if necessary it can check - * dmu_tx_is_syncing(tx). - * - * The synctask infrastructure enforces proper locking strategy with respect - * to the dp_config_rwlock -- the lock will always be held when the callbacks - * are called. It will be held for read during the open-context (preliminary) - * call to the checkfunc, and then held for write from syncing context during - * the calls to the check and sync funcs. - * - * A dataset or pool name can be passed as the first argument. Typically, - * the check func will hold, check the return value of the hold, and then - * release the dataset. The sync func will VERIFYO(hold()) the dataset. - * This is safe because no changes can be made between the check and sync funcs, - * and the sync func will only be called if the check func successfully opened - * the dataset. - */ -int -dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, +static int +dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) + int blocks_modified, zfs_space_check_t space_check, boolean_t early) { spa_t *spa; dmu_tx_t *tx; @@ -102,7 +79,9 @@ top: return (err); } - VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); + txg_list_t *task_list = (early) ? + &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; + VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg)); dmu_tx_commit(tx); @@ -117,9 +96,64 @@ top: return (dst.dst_error); } -void -dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +/* + * Called from open context to perform a callback in syncing context. Waits + * for the operation to complete. + * + * The checkfunc will be called from open context as a preliminary check + * which can quickly fail. If it succeeds, it will be called again from + * syncing context. The checkfunc should generally be designed to work + * properly in either context, but if necessary it can check + * dmu_tx_is_syncing(tx). + * + * The synctask infrastructure enforces proper locking strategy with respect + * to the dp_config_rwlock -- the lock will always be held when the callbacks + * are called. 
It will be held for read during the open-context (preliminary) + * call to the checkfunc, and then held for write from syncing context during + * the calls to the check and sync funcs. + * + * A dataset or pool name can be passed as the first argument. Typically, + * the check func will hold, check the return value of the hold, and then + * release the dataset. The sync func will VERIFYO(hold()) the dataset. + * This is safe because no changes can be made between the check and sync funcs, + * and the sync func will only be called if the check func successfully opened + * the dataset. + */ +int +dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg, + blocks_modified, space_check, B_FALSE)); +} + +/* + * An early synctask works exactly as a standard synctask with one important + * difference on the way it is handled during syncing context. Standard + * synctasks run after we've written out all the dirty blocks of dirty + * datasets. Early synctasks are executed before writing out any dirty data, + * and thus before standard synctasks. + * + * For that reason, early synctasks can affect the process of writing dirty + * changes to disk for the txg that they run and should be used with caution. + * In addition, early synctasks should not dirty any metaslabs as this would + * invalidate the precodition/invariant for subsequent early synctasks. + * [see dsl_pool_sync() and dsl_early_sync_task_verify()] + */ +int +dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg, + blocks_modified, space_check, B_TRUE)); +} + +static void +dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, + boolean_t early) { dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); @@ -133,7 +167,25 @@ dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, dst->dst_error = 0; dst->dst_nowaiter = B_TRUE; - VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); + txg_list_t *task_list = (early) ? + &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; + VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg)); +} + +void +dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +{ + dsl_sync_task_nowait_common(dp, syncfunc, arg, + blocks_modified, space_check, tx, B_FALSE); +} + +void +dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +{ + dsl_sync_task_nowait_common(dp, syncfunc, arg, + blocks_modified, space_check, tx, B_TRUE); } /* @@ -160,12 +212,12 @@ dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) * (arc_tempreserve, dsl_pool_tempreserve). */ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { - uint64_t quota = dsl_pool_adjustedsize(dp, - dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) - - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t quota = dsl_pool_unreserved_space(dp, + dst->dst_space_check); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + /* MOS space is triple-dittoed, so we multiply by 3. 
*/ - if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { + if (used + dst->dst_space * 3 > quota) { dst->dst_error = SET_ERROR(ENOSPC); if (dst->dst_nowaiter) kmem_free(dst, sizeof (*dst)); diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index b5a684f0b..c80b35d48 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ @@ -604,7 +604,8 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, KM_SLEEP)); error = dsl_sync_task(pool, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE); + dsl_dataset_user_release_sync, &ddura, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); fnvlist_free(ddura.ddura_todelete); fnvlist_free(ddura.ddura_chkholds); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index c11e459e0..76fa99e8b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -34,6 +34,7 @@ #include <sys/spa_impl.h> #include <sys/zfeature.h> #include <sys/vdev_indirect_mapping.h> +#include <sys/zap.h> #define WITH_DF_BLOCK_ALLOCATOR @@ -54,6 +55,14 @@ unsigned long metaslab_aliquot = 512 << 10; unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* + * Since we can touch multiple metaslabs (and their respective space maps) + * with each transaction group, we benefit from having a smaller space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. + */ +int zfs_metaslab_sm_blksz = (1 << 12); + +/* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. 
@@ -211,7 +220,7 @@ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); -static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t); +static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); #ifdef _METASLAB_TRACING @@ -484,11 +493,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocated += - range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]); + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_tree) + allocated + - msp->ms_deferspace + range_tree_space(msp->ms_freedtree); + msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } @@ -1021,7 +1030,7 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t metaslab_block_maxsize(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_size_tree; + avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL || (rs = avl_last(t)) == NULL) @@ -1101,7 +1110,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_tree->rt_root; + avl_tree_t *t = &msp->ms_allocatable->rt_root; return (metaslab_block_picker(t, cursor, size, align)); } @@ -1134,13 +1143,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - range_tree_t *rt = msp->ms_tree; + range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &rt->rt_root; uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); @@ -1151,7 +1161,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ if (max_size < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_size_tree; + t = &msp->ms_allocatable_by_size; *cursor = 0; } @@ -1178,8 +1188,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { - range_tree_t *rt = msp->ms_tree; - avl_tree_t *t = &msp->ms_size_tree; + range_tree_t *rt = msp->ms_allocatable; + avl_tree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; @@ -1192,7 +1202,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); @@ -1232,7 +1242,7 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_tree->rt_root; + avl_tree_t *t = &msp->ms_allocatable->rt_root; avl_index_t where; range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); @@ -1240,7 +1250,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) uint64_t max_size = metaslab_block_maxsize(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - 
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); @@ -1250,7 +1261,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rs = avl_find(t, &rsearch, &where); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { - t = &msp->ms_size_tree; + t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; rsearch.rs_end = MIN(max_size, @@ -1316,13 +1327,15 @@ metaslab_load(metaslab_t *msp) /* * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to the - * ms_tree. + * all the space in the metaslab as free and add it to ms_allocatable. */ - if (msp->ms_sm != NULL) - error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); - else - range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); + if (msp->ms_sm != NULL) { + error = space_map_load(msp->ms_sm, msp->ms_allocatable, + SM_FREE); + } else { + range_tree_add(msp->ms_allocatable, + msp->ms_start, msp->ms_size); + } success = (error == 0); @@ -1333,9 +1346,16 @@ metaslab_load(metaslab_t *msp) ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defertree[t], - range_tree_remove, msp->ms_tree); + /* + * If the metaslab already has a spacemap, then we need to + * remove all segments from the defer tree; otherwise, the + * metaslab is completely empty and we can skip this. + */ + if (msp->ms_sm != NULL) { + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defer[t], + range_tree_remove, msp->ms_allocatable); + } } msp->ms_max_size = metaslab_block_maxsize(msp); } @@ -1347,7 +1367,7 @@ void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - range_tree_vacate(msp->ms_tree, NULL, NULL); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; @@ -1393,8 +1413,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. 
*/ - ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree, - metaslab_rangesize_compare, 0); + ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, + &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); @@ -1446,20 +1466,21 @@ metaslab_fini(metaslab_t *msp) space_map_close(msp->ms_sm); metaslab_unload(msp); - range_tree_destroy(msp->ms_tree); - range_tree_destroy(msp->ms_freeingtree); - range_tree_destroy(msp->ms_freedtree); + range_tree_destroy(msp->ms_allocatable); + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_alloctree[t]); + range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defertree[t]); + range_tree_destroy(msp->ms_defer[t]); } - ASSERT0(msp->ms_deferspace); + range_tree_destroy(msp->ms_checkpointing); + mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); @@ -1679,7 +1700,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp) int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; - segments += msp->ms_tree->rt_histogram[i]; + segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map @@ -1895,7 +1916,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || - range_tree_space(msp->ms_tree) == 0); + range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; @@ -2028,18 +2049,37 @@ metaslab_should_condense(metaslab_t *msp) range_seg_t *rs; uint64_t size, entries, segsz, object_size, optimal_size, record_size; dmu_object_info_t doi; - uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift; + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t vdev_blocksize = 1 << vd->vdev_ashift; + uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); /* - * Use the ms_size_tree range tree, which is ordered by size, to - * obtain the largest segment in the free tree. We always condense - * metaslabs that are empty and metaslabs for which a condense - * request has been made. + * Allocations and frees in early passes are generally more space + * efficient (in terms of blocks described in space map entries) + * than the ones in later passes (e.g. we don't compress after + * sync pass 5) and condensing a metaslab multiple times in a txg + * could degrade performance. + * + * Thus we prefer condensing each metaslab at most once every txg at + * the earliest sync pass possible. If a metaslab is eligible for + * condensing again after being considered for condensing within the + * same txg, it will hopefully be dirty in the next txg where it will + * be condensed at an earlier pass. + */ + if (msp->ms_condense_checked_txg == current_txg) + return (B_FALSE); + msp->ms_condense_checked_txg = current_txg; + + /* + * Use the ms_allocatable_by_size range tree, which is ordered by + * size, to obtain the largest segment in the free tree. We always + * condense metaslabs that are empty and metaslabs for which a + * condense request has been made. 
*/ - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || msp->ms_condense_wanted) return (B_TRUE); @@ -2053,7 +2093,8 @@ metaslab_should_condense(metaslab_t *msp) entries = size / (MIN(size, SM_RUN_MAX)); segsz = entries * sizeof (uint64_t); - optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + optimal_size = + sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); object_size = space_map_length(msp->ms_sm); dmu_object_info_from_db(sm->sm_dbuf, &doi); @@ -2076,7 +2117,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) space_map_t *sm = msp->ms_sm; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1); ASSERT(msp->ms_loaded); @@ -2084,7 +2124,8 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), + space_map_length(msp->ms_sm), + avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; @@ -2099,20 +2140,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - /* - * Remove what's been freed in this txg from the condense_tree. - * Since we're in sync_pass 1, we know that all the frees from - * this txg are in the freeingtree. - */ - range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree); + range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); + range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defertree[t], + range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], + range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } @@ -2122,13 +2159,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * metaslab's ms_condensing flag to ensure that * allocations on this metaslab do not occur while we're * in the middle of committing it to disk. This is only critical - * for the ms_tree as all other range trees use per txg + * for ms_allocatable as all other range trees use per txg * views of their content. 
*/ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, tx); + space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); /* * While we would ideally like to create a space map representation @@ -2144,7 +2181,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_tree, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2159,7 +2196,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; + range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; uint64_t object = space_map_object(msp->ms_sm); @@ -2168,23 +2205,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * This metaslab has just been added so there's no work to do now. */ - if (msp->ms_freeingtree == NULL) { + if (msp->ms_freeing == NULL) { ASSERT3P(alloctree, ==, NULL); return; } ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeingtree, !=, NULL); - ASSERT3P(msp->ms_freedtree, !=, NULL); + ASSERT3P(msp->ms_freeing, !=, NULL); + ASSERT3P(msp->ms_freed, !=, NULL); + ASSERT3P(msp->ms_checkpointing, !=, NULL); /* - * Normally, we don't want to process a metaslab if there - * are no allocations or frees to perform. However, if the metaslab - * is being forced to condense and it's loaded, we need to let it - * through. + * Normally, we don't want to process a metaslab if there are no + * allocations or frees to perform. However, if the metaslab is being + * forced to condense and it's loaded, we need to let it through. */ - if (range_tree_space(alloctree) == 0 && - range_tree_space(msp->ms_freeingtree) == 0 && + if (range_tree_is_empty(alloctree) && + range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted)) return; @@ -2193,10 +2231,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_tree. No other thread can - * be modifying this txg's alloctree, freeingtree, freedtree, or - * space_map_phys_t. We drop ms_lock whenever we could call - * into the DMU, because the DMU can call down to us + * metaslab_sync() is the metaslab's ms_allocatable. No other + * thread can be modifying this txg's alloc, freeing, + * freed, or space_map_phys_t. We drop ms_lock whenever we + * could call into the DMU, because the DMU can call down to us * (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state @@ -2204,13 +2242,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * that the ms_lock is insufficient for this, because it is dropped * by space_map_write(). 
*/ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (msp->ms_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, tx); + new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, @@ -2218,6 +2255,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT(msp->ms_sm != NULL); } + if (!range_tree_is_empty(msp->ms_checkpointing) && + vd->vdev_checkpoint_sm == NULL) { + ASSERT(spa_has_checkpoint(spa)); + + uint64_t new_object = space_map_alloc(mos, + vdev_standard_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, + mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * We save the space map object as an entry in vdev_top_zap + * so it can be retrieved when the pool is reopened after an + * export or through zdb. + */ + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (new_object), 1, &new_object, tx)); + } + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); @@ -2230,16 +2289,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && spa_sync_pass(spa) == 1 && - metaslab_should_condense(msp)) { + if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); mutex_enter(&msp->ms_lock); } + if (!range_tree_is_empty(msp->ms_checkpointing)) { + ASSERT(spa_has_checkpoint(spa)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * Since we are doing writes to disk and the ms_checkpointing + * tree won't be changing during that time, we drop the + * ms_lock while writing to the checkpoint space map. + */ + mutex_exit(&msp->ms_lock); + space_map_write(vd->vdev_checkpoint_sm, + msp->ms_checkpointing, SM_FREE, tx); + mutex_enter(&msp->ms_lock); + space_map_update(vd->vdev_checkpoint_sm); + + spa->spa_checkpoint_info.sci_dspace += + range_tree_space(msp->ms_checkpointing); + vd->vdev_stat.vs_checkpoint_space += + range_tree_space(msp->ms_checkpointing); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, + -vd->vdev_checkpoint_sm->sm_alloc); + + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + } + if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate @@ -2248,7 +2331,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); - space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back @@ -2257,7 +2340,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). 
*/ - space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been @@ -2268,7 +2351,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, - msp->ms_defertree[t], tx); + msp->ms_defer[t], tx); } } @@ -2279,7 +2362,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * then we will lose some accuracy but will correct it the next * time we load the space map. */ - space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); @@ -2287,21 +2370,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeingtree and - * freedtree. We can safely do this since the freed_tree is + * and instead will just swap the pointers for freeing and + * freed. We can safely do this since the freed_tree is * guaranteed to be empty on the initial pass. */ if (spa_sync_pass(spa) == 1) { - range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree); + range_tree_swap(&msp->ms_freeing, &msp->ms_freed); } else { - range_tree_vacate(msp->ms_freeingtree, - range_tree_add, msp->ms_freedtree); + range_tree_vacate(msp->ms_freeing, + range_tree_add, msp->ms_freed); } range_tree_vacate(alloctree, NULL, NULL); - ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeingtree)); + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) + & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); @@ -2336,29 +2421,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * If this metaslab is just becoming available, initialize its * range trees and add its capacity to the vdev. 
*/ - if (msp->ms_freedtree == NULL) { + if (msp->ms_freed == NULL) { for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_alloctree[t] == NULL); + ASSERT(msp->ms_allocating[t] == NULL); - msp->ms_alloctree[t] = range_tree_create(NULL, NULL); + msp->ms_allocating[t] = range_tree_create(NULL, NULL); } - ASSERT3P(msp->ms_freeingtree, ==, NULL); - msp->ms_freeingtree = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_freeing, ==, NULL); + msp->ms_freeing = range_tree_create(NULL, NULL); - ASSERT3P(msp->ms_freedtree, ==, NULL); - msp->ms_freedtree = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_freed, ==, NULL); + msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defertree[t] == NULL); + ASSERT(msp->ms_defer[t] == NULL); - msp->ms_defertree[t] = range_tree_create(NULL, NULL); + msp->ms_defer[t] = range_tree_create(NULL, NULL); } + ASSERT3P(msp->ms_checkpointing, ==, NULL); + msp->ms_checkpointing = range_tree_create(NULL, NULL); + vdev_space_update(vd, 0, 0, msp->ms_size); } + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); - defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; + defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); @@ -2369,7 +2459,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = space_map_alloc_delta(msp->ms_sm); if (defer_allowed) { - defer_delta = range_tree_space(msp->ms_freedtree) - + defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); @@ -2385,19 +2475,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * Move the frees from the defer_tree back to the free - * range tree (if it's loaded). Swap the freed_tree and the - * defer_tree -- this is safe to do because we've just emptied out - * the defer_tree. + * range tree (if it's loaded). Swap the freed_tree and + * the defer_tree -- this is safe to do because we've + * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { - range_tree_swap(&msp->ms_freedtree, defer_tree); + range_tree_swap(&msp->ms_freed, defer_tree); } else { - range_tree_vacate(msp->ms_freedtree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + range_tree_vacate(msp->ms_freed, + msp->ms_loaded ? 
range_tree_add : NULL, + msp->ms_allocatable); } - space_map_update(msp->ms_sm); msp->ms_deferspace += defer_delta; @@ -2426,16 +2516,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( - msp->ms_alloctree[(txg + t) & TXG_MASK])); + msp->ms_allocating[(txg + t) & TXG_MASK])); } if (!metaslab_debug_unload) metaslab_unload(msp); } - ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeingtree)); - ASSERT0(range_tree_space(msp->ms_freedtree)); + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); } @@ -2666,7 +2757,7 @@ static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; - range_tree_t *rt = msp->ms_tree; + range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); @@ -2681,10 +2772,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); /* Track the last successful allocation */ msp->ms_alloc_txg = txg; @@ -3183,12 +3274,11 @@ next: void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, - uint64_t txg) + boolean_t checkpoint) { metaslab_t *msp; - ASSERTV(spa_t *spa = vd->vdev_spa); + spa_t *spa = vd->vdev_spa; - ASSERT3U(txg, ==, spa->spa_syncing_txg); ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); @@ -3202,11 +3292,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); + mutex_enter(&msp->ms_lock); - if (range_tree_space(msp->ms_freeingtree) == 0) { - vdev_dirty(vd, VDD_METASLAB, msp, txg); + if (range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing)) { + vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); + } + + if (checkpoint) { + ASSERT(spa_has_checkpoint(spa)); + range_tree_add(msp->ms_checkpointing, offset, asize); + } else { + range_tree_add(msp->ms_freeing, offset, asize); } - range_tree_add(msp->ms_freeingtree, offset, asize); mutex_exit(&msp->ms_lock); } @@ -3215,23 +3313,25 @@ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { - uint64_t *txgp = arg; + boolean_t *checkpoint = arg; + + ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) - vdev_indirect_mark_obsolete(vd, offset, size, *txgp); + vdev_indirect_mark_obsolete(vd, offset, size); else - metaslab_free_impl(vd, offset, size, *txgp); + metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) + boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - if (txg > spa_freeze_txg(spa)) + if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 
return; if (spa->spa_vdev_removal != NULL && @@ -3243,13 +3343,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ - free_from_removing_vdev(vd, offset, size, txg); + free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { - vdev_indirect_mark_obsolete(vd, offset, size, txg); + vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &txg); + metaslab_free_impl_cb, &checkpoint); } else { - metaslab_free_concrete(vd, offset, size, txg); + metaslab_free_concrete(vd, offset, size, checkpoint); } } @@ -3426,26 +3526,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], + range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, + VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_tree, offset, size); + range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* - * Free the block represented by DVA in the context of the specified - * transaction group. + * Free the block represented by the given DVA. */ void -metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); @@ -3459,7 +3558,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); } - metaslab_free_impl(vd, offset, size, txg); + metaslab_free_impl(vd, offset, size, checkpoint); } /* @@ -3529,7 +3628,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) + if (error == 0 && + !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ @@ -3540,13 +3640,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); - range_tree_remove(msp->ms_tree, offset, size); + VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, + msp->ms_size); + range_tree_remove(msp->ms_allocatable, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], + offset, size); } mutex_exit(&msp->ms_lock); @@ -3691,13 +3793,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= 
spa_syncing_txg(spa)); + /* + * If we have a checkpoint for the pool we need to make sure that + * the blocks that we free that are part of the checkpoint won't be + * reused until the checkpoint is discarded or we revert to it. + * + * The checkpoint flag is passed down the metaslab_free code path + * and is set whenever we want to add a block to the checkpoint's + * accounting. That is, we "checkpoint" blocks that existed at the + * time the checkpoint was created and are therefore referenced by + * the checkpointed uberblock. + * + * Note that, we don't checkpoint any blocks if the current + * syncing txg <= spa_checkpoint_txg. We want these frees to sync + * normally as they will be referenced by the checkpointed uberblock. + */ + boolean_t checkpoint = B_FALSE; + if (bp->blk_birth <= spa->spa_checkpoint_txg && + spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { + /* + * At this point, if the block is part of the checkpoint + * there is no way it was created in the current txg. + */ + ASSERT(!now); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + checkpoint = B_TRUE; + } + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { - metaslab_free_dva(spa, &dva[d], txg); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + metaslab_free_dva(spa, &dva[d], checkpoint); } } @@ -3818,12 +3948,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) mutex_enter(&msp->ms_lock); if (msp->ms_loaded) - range_tree_verify(msp->ms_tree, offset, size); + range_tree_verify(msp->ms_allocatable, offset, size); - range_tree_verify(msp->ms_freeingtree, offset, size); - range_tree_verify(msp->ms_freedtree, offset, size); + range_tree_verify(msp->ms_freeing, offset, size); + range_tree_verify(msp->ms_checkpointing, offset, size); + range_tree_verify(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defertree[j], offset, size); + range_tree_verify(msp->ms_defer[j], offset, size); mutex_exit(&msp->ms_lock); } diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 448d00c1e..2181a92df 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cdc03e66c..8ab7c3428 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -153,8 +153,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); -static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, - boolean_t reloading); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ @@ -216,6 +215,7 @@ unsigned long zfs_max_missing_tvds = 0; * and we get a chance to retrieve the trusted config. 
*/ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; + /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have @@ -224,6 +224,11 @@ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; uint64_t zfs_max_missing_tvds_scan = 0; /* + * Debugging aid that pauses spa_sync() towards the end. + */ +boolean_t zfs_pause_spa_sync = B_FALSE; + +/* * ========================================================================== * SPA properties routines * ========================================================================== @@ -274,6 +279,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); @@ -811,6 +818,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx) vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); @@ -1452,6 +1465,12 @@ spa_unload(spa_t *spa) spa->spa_condense_zthr = NULL; } + if (spa->spa_checkpoint_discard_zthr != NULL) { + ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr)); + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); @@ -1535,6 +1554,18 @@ spa_load_spares(spa_t *spa) int i; vdev_t *vd, *tvd; +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As spare vdevs are shared among open pools, we skip loading + * them when we load the checkpointed state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* @@ -1654,6 +1685,19 @@ spa_load_l2cache(spa_t *spa) vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As L2 caches are part of the ARC which is shared among open + * pools, we skip loading them when we load the checkpointed + * state of the pool. 
+ */ + if (!spa_writeable(spa)) + return; +#endif + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; @@ -2206,6 +2250,11 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); + + ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); + spa->spa_checkpoint_discard_zthr = + zthr_create(spa_checkpoint_discard_thread_check, + spa_checkpoint_discard_thread, spa); } /* @@ -2299,7 +2348,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = state; gethrestime(&spa->spa_loaded_ts); - error = spa_load_impl(spa, type, &ereport, B_FALSE); + error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed @@ -2606,8 +2655,25 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) return (SET_ERROR(EINVAL)); } - if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == - SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { + /* + * If we are doing an import, ensure that the pool is not already + * imported by checking if its pool guid already exists in the + * spa namespace. + * + * The only case that we allow an already imported pool to be + * imported again, is when the pool is checkpointed and we want to + * look at its checkpointed state from userland tools like zdb. + */ +#ifdef _KERNEL + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { +#else + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0) && + !spa_importing_readonly_checkpoint(spa)) { +#endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); @@ -2766,6 +2832,19 @@ spa_ld_validate_vdevs(spa_t *spa) return (0); } +static void +spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) +{ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; +} + static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { @@ -2775,6 +2854,29 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) boolean_t activity_check = B_FALSE; /* + * If we are opening the checkpointed state of the pool by + * rewinding to it, at this point we will have written the + * checkpointed uberblock to the vdev labels, so searching + * the labels will find the right uberblock. However, if + * we are opening the checkpointed state read-only, we have + * not modified the labels. Therefore, we must ignore the + * labels and continue using the spa_uberblock that was set + * by spa_ld_checkpoint_rewind. + * + * Note that it would be fine to ignore the labels when + * rewinding (opening writeable) as well. However, if we + * crash just after writing the labels, we will end up + * searching the labels. Doing so in the common case means + * that this code path gets exercised normally, rather than + * just in the edge case. 
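For reference, the read-only checkpoint test used by the check below is the small helper added to spa_misc.c later in this change:

        boolean_t
        spa_importing_readonly_checkpoint(spa_t *spa)
        {
                return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
                    spa->spa_mode == FREAD);
        }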
+ */ + if (ub->ub_checkpoint_txg != 0 && + spa_importing_readonly_checkpoint(spa)) { + spa_ld_select_uberblock_done(spa, ub); + return (0); + } + + /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); @@ -2905,14 +3007,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) /* * Initialize internal SPA structures. */ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_verify_min_txg = spa->spa_extreme_rewind ? - TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; - spa->spa_first_txg = spa->spa_last_ubsync_txg ? - spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; - spa->spa_claim_max_txg = spa->spa_first_txg; - spa->spa_prev_software_version = ub->ub_software_version; + spa_ld_select_uberblock_done(spa, ub); return (0); } @@ -2935,7 +3030,7 @@ spa_ld_open_rootbp(spa_t *spa) } static int -spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type, +spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; @@ -3609,7 +3704,7 @@ spa_ld_claim_log_blocks(spa_t *spa) static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, - boolean_t reloading) + boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; @@ -3621,7 +3716,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ - if (reloading || config_cache_txg != spa->spa_config_txg || + if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) @@ -3657,18 +3752,38 @@ spa_ld_prepare_for_reload(spa_t *spa) spa->spa_async_suspended = async_suspended; } -/* - * Load an existing storage pool, using the config provided. This config - * describes which vdevs are part of the pool and is later validated against - * partial configs present in each vdev's label and an entire copy of the - * config stored in the MOS. - */ static int -spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, - boolean_t reloading) +spa_ld_read_checkpoint_txg(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT0(spa->spa_checkpoint_txg); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT) + return (0); + + if (error != 0) + return (error); + + ASSERT3U(checkpoint.ub_txg, !=, 0); + ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); + ASSERT3U(checkpoint.ub_timestamp, !=, 0); + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + return (0); +} + +static int +spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; - boolean_t missing_feat_write = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -3684,11 +3799,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; - if (reloading) - spa_load_note(spa, "RELOADING"); - else - spa_load_note(spa, "LOADING"); - /* * Parse the config provided to create a vdev tree. 
*/ @@ -3721,11 +3831,11 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, } /* - * Read vdev labels to find the best uberblock (i.e. latest, unless - * spa_load_max_txg is set) and store it in spa_uberblock. We get the - * list of features required to read blkptrs in the MOS from the vdev - * label with the best uberblock and verify that our version of zfs - * supports them all. + * Read all vdev labels to find the best uberblock (i.e. latest, + * unless spa_load_max_txg is set) and store it in spa_uberblock. We + * get the list of features required to read blkptrs in the MOS from + * the vdev label with the best uberblock and verify that our version + * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) @@ -3740,23 +3850,211 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, if (error != 0) return (error); + return (0); +} + +static int +spa_ld_checkpoint_rewind(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error != 0) { + spa_load_failed(spa, "unable to retrieve checkpointed " + "uberblock from the MOS config [error=%d]", error); + + if (error == ENOENT) + error = ZFS_ERR_NO_CHECKPOINT; + + return (error); + } + + ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); + ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); + + /* + * We need to update the txg and timestamp of the checkpointed + * uberblock to be higher than the latest one. This ensures that + * the checkpointed uberblock is selected if we were to close and + * reopen the pool right after we've written it in the vdev labels. + * (also see block comment in vdev_uberblock_compare) + */ + checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; + checkpoint.ub_timestamp = gethrestime_sec(); + + /* + * Set current uberblock to be the checkpointed uberblock. + */ + spa->spa_uberblock = checkpoint; + + /* + * If we are doing a normal rewind, then the pool is open for + * writing and we sync the "updated" checkpointed uberblock to + * disk. Once this is done, we've basically rewound the whole + * pool and there is no way back. + * + * There are cases when we don't want to attempt and sync the + * checkpointed uberblock to disk because we are opening a + * pool as read-only. Specifically, verifying the checkpointed + * state with zdb, and importing the checkpointed state to get + * a "preview" of its content. 
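To make the ordering concrete (the txg numbers below are hypothetical, not taken from this change): if the labels' best uberblock has ub_txg 1000 and the checkpoint was taken at txg 850, the rewind path rewrites the checkpointed uberblock roughly as

        checkpoint.ub_txg = 1000 + 1;                 /* newer than anything in the labels */
        checkpoint.ub_timestamp = gethrestime_sec();
        /* checkpoint.ub_checkpoint_txg remains 850, the original checkpoint txg */

so that vdev_uberblock_compare() selects it on any later label scan.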
+ */ + if (spa_writeable(spa)) { + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + + for (int c = 0; c < children; c++) { + vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) + continue; + + svd[svdcount++] = vd; + if (svdcount == SPA_SYNC_MIN_VDEVS) + break; + } + error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "failed to write checkpointed " + "uberblock to the vdev labels [error=%d]", error); + return (error); + } + } + + return (0); +} + +static int +spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t *update_config_cache) +{ + int error; + + /* + * Parse the config for pool, open and validate vdevs, + * select an uberblock, and use that uberblock to open + * the MOS. + */ + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ - error = spa_ld_load_trusted_config(spa, type, reloading); + error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { - VERIFY(!reloading); + if (update_config_cache != NULL) + *update_config_cache = B_TRUE; + /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); - return (spa_load_impl(spa, type, ereport, B_TRUE)); + spa_load_note(spa, "RELOADING"); + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + + error = spa_ld_trusted_config(spa, type, B_TRUE); + if (error != 0) + return (error); + } else if (error != 0) { return (error); } + return (0); +} + +/* + * Load an existing storage pool, using the config provided. This config + * describes which vdevs are part of the pool and is later validated against + * partial configs present in each vdev's label and an entire copy of the + * config stored in the MOS. + */ +static int +spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) +{ + int error = 0; + boolean_t missing_feat_write = B_FALSE; + boolean_t checkpoint_rewind = + (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + boolean_t update_config_cache = B_FALSE; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); + + spa_load_note(spa, "LOADING"); + + error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); + if (error != 0) + return (error); + + /* + * If we are rewinding to the checkpoint then we need to repeat + * everything we've done so far in this function but this time + * selecting the checkpointed uberblock and using that to open + * the MOS. + */ + if (checkpoint_rewind) { + /* + * If we are rewinding to the checkpoint update config cache + * anyway. + */ + update_config_cache = B_TRUE; + + /* + * Extract the checkpointed uberblock from the current MOS + * and use this as the pool's uberblock from now on. If the + * pool is imported as writeable we also write the checkpoint + * uberblock to the labels, making the rewind permanent. 
+ */ + error = spa_ld_checkpoint_rewind(spa); + if (error != 0) + return (error); + + /* + * Redo the loading process again with the + * checkpointed uberblock. + */ + spa_ld_prepare_for_reload(spa); + spa_load_note(spa, "LOADING checkpointed uberblock"); + error = spa_ld_mos_with_trusted_config(spa, type, NULL); + if (error != 0) + return (error); + } + + /* + * Retrieve the checkpoint txg if the pool has a checkpoint. + */ + error = spa_ld_read_checkpoint_txg(spa); + if (error != 0) + return (error); + /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note @@ -3860,6 +4158,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* + * In case of a checkpoint rewind, log the original txg + * of the checkpointed uberblock. + */ + if (checkpoint_rewind) { + spa_history_log_internal(spa, "checkpoint rewind", + NULL, "rewound state to txg=%llu", + (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); + } + + /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); @@ -3886,7 +4194,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, - reloading); + update_config_cache); /* * Check all DTLs to see if anything needs resilvering. @@ -3970,6 +4278,15 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); + if (load_error == ZFS_ERR_NO_CHECKPOINT) { + /* + * When attempting checkpoint-rewind on a pool with no + * checkpoint, we should not attempt to load uberblocks + * from previous txgs when spa_load fails. + */ + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + return (load_error); + } + if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); @@ -5564,6 +5881,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); @@ -5776,6 +6100,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vd = spa_lookup_by_guid(spa, guid, B_FALSE); + /* + * Besides being called directly from userland through the + * ioctl interface, spa_vdev_detach() can be potentially called + * at the end of spa_vdev_resilver_done(). + * + * In the regular case, when we have a checkpoint this shouldn't + * happen as we never empty the DTLs of a vdev during the scrub + * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done() + * should never get here when we have a checkpoint. + * + * That said, even in the case when we checkpoint the pool exactly + * as spa_vdev_resilver_done() calls this function, everything + * should be fine as the resilver will return right away. + */ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -6014,6 +6359,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, txg = spa_vdev_enter(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); @@ -6665,6 +7017,10 @@ spa_async_suspend(spa_t *spa) zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL && zthr_isrunning(condense_thread)) VERIFY0(zthr_cancel(condense_thread)); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL && zthr_isrunning(discard_thread)) + VERIFY0(zthr_cancel(discard_thread)); } void @@ -6679,6 +7035,10 @@ spa_async_resume(spa_t *spa) zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL && !zthr_isrunning(condense_thread)) zthr_resume(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL && !zthr_isrunning(discard_thread)) + zthr_resume(discard_thread); } static boolean_t @@ -7454,6 +7814,8 @@ spa_sync(spa_t *spa, uint64_t txg) txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, + txg)); break; } spa_sync_deferred_frees(spa, tx); @@ -7499,16 +7861,22 @@ spa_sync(spa_t *spa, uint64_t txg) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { - vdev_t *svd[SPA_SYNC_MIN_VDEVS]; + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; + svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; @@ -7572,6 +7940,9 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + while (zfs_pause_spa_sync) + delay(1); + spa->spa_sync_pass = 0; /* diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c new file mode 100644 index 000000000..544658821 --- /dev/null +++ b/module/zfs/spa_checkpoint.c @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +/* + * Storage Pool Checkpoint + * + * A storage pool checkpoint can be thought of as a pool-wide snapshot or + * a stable version of extreme rewind that guarantees no blocks from the + * checkpointed state will have been overwritten. It remembers the entire + * state of the storage pool (e.g. snapshots, dataset names, etc.) from the + * point at which it was taken, and the user can rewind back to that point even if + * they applied destructive operations on their datasets or even enabled new + * zpool on-disk features. If a pool has a checkpoint that is no longer + * needed, the user can discard it. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. The feature + * flag is set to active when we create the checkpoint and remains active + * until the checkpoint is fully discarded. The entry in the MOS config + * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that + * references the state of the pool when we take the checkpoint. The entry + * remains populated until we start discarding the checkpoint or we rewind + * back to it. + * + * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, + * which persists until the checkpoint is fully discarded. The space map + * contains entries that have been freed in the current state of the pool + * but we want to keep around in case we decide to rewind to the checkpoint. + * [see vdev_checkpoint_sm] + * + * - Each metaslab's ms_sm space map behaves the same as without the + * checkpoint, with the only exception being the scenario when we free + * blocks that belong to the checkpoint. In this case, these blocks remain + * ALLOCATED in the metaslab's space map and they are added as FREE in the + * vdev's checkpoint space map. + * + * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg at which + * the uberblock was checkpointed. For normal uberblocks this field is 0. + * + * == Overview of operations == + * + * - To create a checkpoint, we first wait for the current TXG to be synced, + * so we can use the most recently synced uberblock (spa_ubsync) as the + * checkpointed uberblock. Then we use an early synctask to place that + * uberblock in the MOS config, increment the feature flag for the checkpoint + * (marking it active), and set spa_checkpoint_txg (see its use below) + * to the TXG of the checkpointed uberblock. We use an early synctask for + * the aforementioned operations to ensure that no blocks were dirtied + * between the current TXG and the TXG of the checkpointed uberblock + * (i.e. the previous txg). + * + * - When a checkpoint exists, we need to ensure that the blocks that + * belong to the checkpoint are freed but never reused. This means that + * these blocks should never end up in the ms_allocatable or the ms_freeing + * trees of a metaslab. Therefore, whenever there is a checkpoint the new + * ms_checkpointing tree is used in addition to the aforementioned ones. + * + * Whenever a block is freed and we find out that it is referenced by the + * checkpoint (we find out by comparing its birth txg to spa_checkpoint_txg), + * we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ * This way, we divide the blocks that are being freed into checkpointed + * and not-checkpointed blocks. + * + * In order to persist these frees, we write the extents from the + * ms_freeing tree to the ms_sm as usual, and the extents from the + * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these + * checkpointed extents will remain allocated in the metaslab's ms_sm space + * map, and therefore won't be reused [see metaslab_sync()]. In addition, + * when we discard the checkpoint, we can find the entries that have + * actually been freed in vdev_checkpoint_sm. + * [see spa_checkpoint_discard_thread_sync()] + * + * - To discard the checkpoint we use an early synctask to delete the + * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, + * and wake up the discarding zthr (an open-context async thread). + * We use an early synctask to ensure that the operation happens before any + * new data end up in the checkpoint's data structures. + * + * Once the synctask is done and the discarding zthr is awake, we discard + * the checkpointed data over multiple TXGs by having the zthr prefetch + * entries from vdev_checkpoint_sm and then start a synctask that places + * them as free blocks into their respective ms_allocatable and ms_sm + * structures. + * [see spa_checkpoint_discard_thread()] + * + * When there are no entries left in the vdev_checkpoint_sm of all + * top-level vdevs, a final synctask runs that decrements the feature flag. + * + * - To rewind to the checkpoint, we first use the current uberblock and + * open the MOS so we can access the checkpointed uberblock from the MOS + * config. After we retrieve the checkpointed uberblock, we use it as the + * current uberblock for the pool by writing it to disk with an updated + * TXG, opening its version of the MOS, and moving on as usual from there. + * [see spa_ld_checkpoint_rewind()] + * + * An important note on rewinding to the checkpoint has to do with how we + * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL + * blocks that have not been claimed by the time we took the checkpoint + * as they should no longer be valid. + * [see comment in zil_claim()] + * + * == Miscellaneous information == + * + * - In the hypothetical event that we take a checkpoint, remove a vdev, + * and attempt to rewind, the rewind would fail as the checkpointed + * uberblock would reference data in the removed device. For this reason + * and others of similar nature, we disallow the following operations that + * can change the config: + * vdev removal and attach/detach, mirror splitting, and pool reguid. + * + * - As most of the checkpoint logic is implemented in the SPA and doesn't + * distinguish datasets when it comes to space accounting, having a + * checkpoint can potentially break the boundaries set by dataset + * reservations. + */ + +#include <sys/dmu_tx.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/metaslab_impl.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/spa_checkpoint.h> +#include <sys/vdev_impl.h> +#include <sys/zap.h> +#include <sys/zfeature.h> + +/* + * The following parameter limits the amount of memory used for + * prefetching the checkpoint space map on each vdev while + * discarding the checkpoint.
+ * + * The reason it exists is because top-level vdevs with long checkpoint + * space maps can potentially take up a lot of memory depending on the + * amount of checkpointed data that has been freed within them while + * the pool had a checkpoint. + */ +unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; + +int +spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + bzero(pcs, sizeof (pool_checkpoint_stat_t)); + + int error = zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); + ASSERT(error == 0 || error == ENOENT); + + if (error == ENOENT) + pcs->pcs_state = CS_CHECKPOINT_DISCARDING; + else + pcs->pcs_state = CS_CHECKPOINT_EXISTS; + + pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; + pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; + + return (0); +} + +static void +spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + + spa->spa_checkpoint_info.sci_timestamp = 0; + + spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "finished discarding checkpointed state from the pool"); +} + +typedef struct spa_checkpoint_discard_sync_callback_arg { + vdev_t *sdc_vd; + uint64_t sdc_txg; + uint64_t sdc_entry_limit; +} spa_checkpoint_discard_sync_callback_arg_t; + +static int +spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, + uint64_t size, void *arg) +{ + spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; + vdev_t *vd = sdc->sdc_vd; + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + uint64_t end = offset + size; + + if (sdc->sdc_entry_limit == 0) + return (EINTR); + + /* + * Since the space map is not condensed, we know that + * none of its entries is crossing the boundaries of + * its respective metaslab. + * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(type, ==, SM_FREE); + VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * At this point we should not be processing any + * other frees concurrently, so the lock is technically + * unnecessary. We use the lock anyway though to + * potentially save ourselves from future headaches. 
+ */ + mutex_enter(&ms->ms_lock); + if (range_tree_is_empty(ms->ms_freeing)) + vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); + range_tree_add(ms->ms_freeing, offset, size); + mutex_exit(&ms->ms_lock); + + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size); + + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size; + vd->vdev_stat.vs_checkpoint_space -= size; + sdc->sdc_entry_limit--; + + return (0); +} + +#ifdef ZFS_DEBUG +static void +spa_checkpoint_accounting_verify(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t ckpoint_sm_space_sum = 0; + uint64_t vs_ckpoint_space_sum = 0; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (vd->vdev_checkpoint_sm != NULL) { + ckpoint_sm_space_sum += + -vd->vdev_checkpoint_sm->sm_alloc; + vs_ckpoint_space_sum += + vd->vdev_stat.vs_checkpoint_space; + ASSERT3U(ckpoint_sm_space_sum, ==, + vs_ckpoint_space_sum); + } else { + ASSERT0(vd->vdev_stat.vs_checkpoint_space); + } + } + ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); +} +#endif + +static void +spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd = arg; + int error; + + /* + * The space map callback is applied only to non-debug entries. + * Because the number of debug entries is less or equal to the + * number of non-debug entries, we want to ensure that we only + * read what we prefetched from open-context. + * + * Thus, we set the maximum entries that the space map callback + * will be applied to be half the entries that could fit in the + * imposed memory limit. + */ + uint64_t max_entry_limit = + (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1; + + uint64_t entries_in_sm = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + + /* + * Iterate from the end of the space map towards the beginning, + * placing its entries on ms_freeing and removing them from the + * space map. The iteration stops if one of the following + * conditions is true: + * + * 1] We reached the beginning of the space map. At this point + * the space map should be completely empty and + * space_map_incremental_destroy should have returned 0. + * The next step would be to free and close the space map + * and remove its entry from its vdev's top zap. This allows + * spa_checkpoint_discard_thread() to move on to the next vdev. + * + * 2] We reached the memory limit (amount of memory used to hold + * space map entries in memory) and space_map_incremental_destroy + * returned EINTR. This means that there are entries remaining + * in the space map that will be cleared in a future invocation + * of this function by spa_checkpoint_discard_thread(). 
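With the default zfs_spa_discard_memory_limit of 16 MiB, the sizing described above works out as follows (a rough illustration, not code from this change):

        uint64_t limit = 16 * 1024 * 1024;                 /* bytes prefetched per pass */
        uint64_t prefetched = limit / sizeof (uint64_t);   /* 2097152 raw entries */
        uint64_t per_pass = prefetched >> 1;               /* 1048576 callback entries */

so a checkpoint space map holding more entries than that is destroyed over several invocations of this synctask.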
+ */ + spa_checkpoint_discard_sync_callback_arg_t sdc; + sdc.sdc_vd = vd; + sdc.sdc_txg = tx->tx_txg; + sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit); + + uint64_t entries_before = entries_in_sm; + + error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, + spa_checkpoint_discard_sync_callback, &sdc, tx); + + uint64_t entries_after = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + +#ifdef ZFS_DEBUG + spa_checkpoint_accounting_verify(vd->vdev_spa); +#endif + + zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " + "deleted %llu entries - %llu entries are left", + tx->tx_txg, vd->vdev_id, (entries_before - entries_after), + entries_after); + + if (error != EINTR) { + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while incrementally destroying the checkpoint " + "space map of vdev %llu\n", + error, vd->vdev_id); + } + ASSERT0(entries_after); + ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); + ASSERT0(vd->vdev_checkpoint_sm->sm_length); + + space_map_free(vd->vdev_checkpoint_sm, tx); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + + VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); + } +} + +static boolean_t +spa_checkpoint_discard_is_done(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(!spa_has_checkpoint(spa)); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) + return (B_FALSE); + ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); + } + + return (B_TRUE); +} + +/* ARGSUSED */ +boolean_t +spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (B_FALSE); + + if (spa_has_checkpoint(spa)) + return (B_FALSE); + + return (B_TRUE); +} + +int +spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + while (vd->vdev_checkpoint_sm != NULL) { + space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; + int numbufs; + dmu_buf_t **dbp; + + if (zthr_iscancelled(zthr)) + return (0); + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + + uint64_t size = MIN(space_map_length(checkpoint_sm), + zfs_spa_discard_memory_limit); + uint64_t offset = + space_map_length(checkpoint_sm) - size; + + /* + * Ensure that the part of the space map that will + * be destroyed by the synctask, is prefetched in + * memory before the synctask runs. 
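For context, the discard zthr that runs this function is wired up elsewhere in this change; the relevant calls, gathered here for reference, are:

        /* created once per pool in spa_spawn_aux_threads() */
        spa->spa_checkpoint_discard_zthr =
            zthr_create(spa_checkpoint_discard_thread_check,
            spa_checkpoint_discard_thread, spa);

        /* kicked by the discard early synctask */
        zthr_wakeup(spa->spa_checkpoint_discard_zthr);

        /* cancelled (when running) and resumed around spa_async_suspend()/resume() */
        VERIFY0(zthr_cancel(spa->spa_checkpoint_discard_zthr));
        zthr_resume(spa->spa_checkpoint_discard_zthr);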
+ */ + int error = dmu_buf_hold_array_by_bonus( + checkpoint_sm->sm_dbuf, offset, size, + B_TRUE, FTAG, &numbufs, &dbp); + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while prefetching checkpoint space map " + "entries of vdev %llu\n", + error, vd->vdev_id); + } + + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_thread_sync, vd, + 0, ZFS_SPACE_CHECK_NONE)); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + } + + VERIFY(spa_checkpoint_discard_is_done(spa)); + VERIFY0(spa->spa_checkpoint_info.sci_dspace); + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + + return (0); +} + + +/* ARGSUSED */ +static int +spa_checkpoint_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ENOTSUP)); + + if (!spa_top_vdevs_spacemap_addressable(spa)) + return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); + + if (spa->spa_vdev_removal != NULL) + return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + + if (spa->spa_checkpoint_txg != 0) + return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); + + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_sync(void *arg, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + uberblock_t checkpoint = spa->spa_ubsync; + + /* + * At this point, there should not be a checkpoint in the MOS. + */ + ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); + + ASSERT0(spa->spa_checkpoint_info.sci_timestamp); + ASSERT0(spa->spa_checkpoint_info.sci_dspace); + + /* + * Since the checkpointed uberblock is the one that just got synced + * (we use spa_ubsync), its txg must be equal to the txg number of + * the txg we are syncing, minus 1. + */ + ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); + + /* + * Once the checkpoint is in place, we need to ensure that none of + * its blocks will be marked for reuse after it has been freed. + * When there is a checkpoint and a block is freed, we compare its + * birth txg to the txg of the checkpointed uberblock to see if the + * block is part of the checkpoint or not. Therefore, we have to set + * spa_checkpoint_txg before any frees happen in this txg (which is + * why this is done as an early_synctask as explained in the comment + * in spa_checkpoint()). + */ + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, + sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), + &checkpoint, tx)); + + /* + * Increment the feature refcount and thus activate the feature. + * Note that the feature will be deactivated when we've + * completely discarded all checkpointed state (both vdev + * space maps and uberblock). + */ + spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa checkpoint", tx, + "checkpointed uberblock txg=%llu", checkpoint.ub_txg); +} + +/* + * Create a checkpoint for the pool. 
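As a worked example of the txg bookkeeping above (hypothetical numbers): if the checkpoint request arrives while txg 120 is still syncing, txg_wait_synced() in spa_checkpoint() below returns once txg 120 is on disk, so spa_ubsync.ub_txg == 120; the early synctask then runs as part of txg 121, and the ASSERT in spa_checkpoint_sync() holds because

        checkpoint.ub_txg == spa->spa_syncing_txg - 1 == 121 - 1 == 120

with spa_checkpoint_txg set to 120 before any frees of txg 121 are processed.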
+ */ +int +spa_checkpoint(const char *pool) +{ + int error; + spa_t *spa; + + error = spa_open(pool, &spa, FTAG); + if (error != 0) + return (error); + + mutex_enter(&spa->spa_vdev_top_lock); + + /* + * Wait for the current syncing txg to finish so the latest synced + * uberblock (spa_ubsync) has all the changes that we expect + * to see if we were to revert later to the checkpoint. In other + * words, we want the checkpointed uberblock to include/reference + * all the changes that were pending at the time that we issued + * the checkpoint command. + */ + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * As the checkpointed uberblock references blocks from the previous + * txg (spa_ubsync), we want to ensure that we are not freeing any of + * these blocks in the same txg that the following synctask will + * run. Thus, we run it as an early synctask, so the dirty changes + * that are synced to disk afterwards during zios and other synctasks + * do not reuse checkpointed blocks. + */ + error = dsl_early_sync_task(pool, spa_checkpoint_check, + spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); + + mutex_exit(&spa->spa_vdev_top_lock); + + spa_close(spa, FTAG); + return (error); +} + +/* ARGSUSED */ +static int +spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + if (spa->spa_checkpoint_txg == 0) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + VERIFY0(zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, tx)); + + spa->spa_checkpoint_txg = 0; + + zthr_wakeup(spa->spa_checkpoint_discard_zthr); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "started discarding checkpointed state from the pool"); +} + +/* + * Discard the checkpoint from a pool. + */ +int +spa_checkpoint_discard(const char *pool) +{ + /* + * Similarly to spa_checkpoint(), we want our synctask to run + * before any pending dirty data are written to disk so they + * won't end up in the checkpoint's data structures (e.g. + * ms_checkpointing and vdev_checkpoint_sm) and re-create any + * space maps that the discarding open-context thread has + * deleted. + * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread] + */ + return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, + spa_checkpoint_discard_sync, NULL, 0, + ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(spa_checkpoint_get_stats); +EXPORT_SYMBOL(spa_checkpoint_discard_thread); +EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); + +/* BEGIN CSTYLED */ +module_param(zfs_spa_discard_memory_limit, ulong, 0644); +MODULE_PARM_DESC(zfs_spa_discard_memory_limit, + "Maximum memory for prefetching checkpoint space " + "map per top-level vdev while discarding checkpoint"); +/* END CSTYLED */ +#endif diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6c7e2f55c..9410fab07 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -357,12 +357,15 @@ int spa_asize_inflation = 24; * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE.
* + * Operations that are almost guaranteed to free up space in the absence of + * a pool checkpoint can use up to three quarters of the slop space + * (e.g zfs destroy). + * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call - * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these - * operations result in a net increase in the amount of space used, - * it is possible to run the pool completely out of space, causing it to - * be permanently read-only. + * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net + * increase in the amount of space used, it is possible to run the pool + * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), @@ -1718,6 +1721,12 @@ spa_get_dspace(spa_t *spa) return (spa->spa_dspace); } +uint64_t +spa_get_checkpoint_space(spa_t *spa) +{ + return (spa->spa_checkpoint_info.sci_dspace); +} + void spa_update_dspace(spa_t *spa) { @@ -2065,7 +2074,8 @@ spa_writeable(spa_t *spa) boolean_t spa_has_pending_synctask(spa_t *spa) { - return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || + !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } int @@ -2293,6 +2303,63 @@ spa_state_to_name(spa_t *spa) return ("UNKNOWN"); } +boolean_t +spa_top_vdevs_spacemap_addressable(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); +} + +boolean_t +spa_has_checkpoint(spa_t *spa) +{ + return (spa->spa_checkpoint_txg != 0); +} + +boolean_t +spa_importing_readonly_checkpoint(spa_t *spa) +{ + return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && + spa->spa_mode == FREAD); +} + +uint64_t +spa_min_claim_txg(spa_t *spa) +{ + uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; + + if (checkpoint_txg != 0) + return (checkpoint_txg + 1); + + return (spa->spa_first_txg); +} + +/* + * If there is a checkpoint, async destroys may consume more space from + * the pool instead of freeing it. In an attempt to save the pool from + * getting suspended when it is about to run out of space, we stop + * processing async destroys. + */ +boolean_t +spa_suspend_async_destroy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + + uint64_t unreserved = dsl_pool_unreserved_space(dp, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + uint64_t avail = (unreserved > used) ? 
(unreserved - used) : 0; + + if (spa_has_checkpoint(spa) && avail == 0) + return (B_TRUE); + + return (B_FALSE); +} + #if defined(_KERNEL) #include <linux/mod_compat.h> @@ -2446,6 +2513,11 @@ EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); +EXPORT_SYMBOL(spa_importing_readonly_checkpoint); +EXPORT_SYMBOL(spa_min_claim_txg); +EXPORT_SYMBOL(spa_suspend_async_destroy); +EXPORT_SYMBOL(spa_has_checkpoint); +EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); /* BEGIN CSTYLED */ module_param(zfs_flags, uint, 0644); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index d84dd7583..0e5a4b976 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -38,12 +38,13 @@ #include <sys/zfeature.h> /* + * Note on space map block size: + * * The data for a given space map can be kept on blocks of any size. * Larger blocks entail fewer i/o operations, but they also cause the * DMU to keep more data in-core, and also to waste more i/o bandwidth * when only a few blocks have changed since the last transaction group. */ -int space_map_blksz = (1 << 12); /* * Iterate through the space map, invoking the callback on each (non-debug) @@ -105,6 +106,137 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) return (error); } +/* + * Note: This function performs destructive actions - specifically + * it deletes entries from the end of the space map. Thus, callers + * should ensure that they are holding the appropriate locks for + * the space map that they provide. + */ +int +space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx) +{ + uint64_t bufsize, len; + uint64_t *entry_map; + int error = 0; + + len = space_map_length(sm); + bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + entry_map = zio_buf_alloc(bufsize); + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * Since we can't move the starting offset of the space map + * (e.g. there are on-disk references pointing to it), we destroy + * its entries incrementally starting from the end. + * + * The logic that follows is basically the same as the one used + * in space_map_iterate() but it traverses the space map + * backwards: + * + * 1] We figure out the size of the buffer that we want to use + * to read the on-disk space map entries. + * 2] We figure out the offset at the end of the space map where + * we will start reading entries into our buffer. + * 3] We read the on-disk entries into the buffer. + * 4] We iterate over the entries from end to beginning, calling + * the callback function on each one. As we move from entry + * to entry we decrease the size of the space map, effectively + * deleting each entry. + * 5] If there are no more entries in the space map or the + * callback returns a value other than 0, we stop iterating + * over the space map. If there are entries remaining and + * the callback returned zero we go back to step [1].
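A callback passed to space_map_incremental_destroy() has the same shape as the checkpoint-discard callback earlier in this change; a minimal, hypothetical example that merely tallies freed space might look like:

        static int
        count_freed_cb(maptype_t type, uint64_t offset, uint64_t size, void *arg)
        {
                uint64_t *freed = arg;

                if (type == SM_FREE)
                        *freed += size;
                return (0);        /* a nonzero return stops the iteration */
        }

The offset and size handed to the callback are already translated into the map's address space (sm_start added, sm_shift applied), as the VERIFYs in the loop below show.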
+ */ + uint64_t offset = 0, size = 0; + while (len > 0 && error == 0) { + size = MIN(bufsize, len); + + VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); + VERIFY3U(size, >, 0); + ASSERT3U(sm->sm_blksz, !=, 0); + + offset = len - size; + + IMPLY(bufsize > len, offset == 0); + IMPLY(bufsize == len, offset == 0); + IMPLY(bufsize < len, offset > 0); + + + EQUIV(size == len, offset == 0); + IMPLY(size < len, bufsize < len); + + dprintf("object=%llu offset=%llx size=%llx\n", + space_map_object(sm), offset, size); + + error = dmu_read(sm->sm_os, space_map_object(sm), + offset, size, entry_map, DMU_READ_PREFETCH); + if (error != 0) + break; + + uint64_t num_entries = size / sizeof (uint64_t); + + ASSERT3U(num_entries, >, 0); + + while (num_entries > 0) { + uint64_t e, entry_offset, entry_size; + maptype_t type; + + e = entry_map[num_entries - 1]; + + ASSERT3U(num_entries, >, 0); + ASSERT0(error); + + if (SM_DEBUG_DECODE(e)) { + sm->sm_phys->smp_objsize -= sizeof (uint64_t); + space_map_update(sm); + len -= sizeof (uint64_t); + num_entries--; + continue; + } + + type = SM_TYPE_DECODE(e); + entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + + sm->sm_start; + entry_size = SM_RUN_DECODE(e) << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift)); + VERIFY3U(entry_offset, >=, sm->sm_start); + VERIFY3U(entry_offset + entry_size, <=, + sm->sm_start + sm->sm_size); + + error = callback(type, entry_offset, entry_size, arg); + if (error != 0) + break; + + if (type == SM_ALLOC) + sm->sm_phys->smp_alloc -= entry_size; + else + sm->sm_phys->smp_alloc += entry_size; + + sm->sm_phys->smp_objsize -= sizeof (uint64_t); + space_map_update(sm); + len -= sizeof (uint64_t); + num_entries--; + } + IMPLY(error == 0, num_entries == 0); + EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0); + } + + if (len == 0) { + ASSERT0(error); + ASSERT0(offset); + ASSERT0(sm->sm_length); + ASSERT0(sm->sm_phys->smp_objsize); + ASSERT0(sm->sm_alloc); + } + + zio_buf_free(entry_map, bufsize); + return (error); +} + typedef struct space_map_load_arg { space_map_t *smla_sm; range_tree_t *smla_rt; @@ -279,7 +411,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, */ sm->sm_phys->smp_object = sm->sm_object; - if (range_tree_space(rt) == 0) { + if (range_tree_is_empty(rt)) { VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); return; } @@ -418,7 +550,7 @@ space_map_close(space_map_t *sm) } void -space_map_truncate(space_map_t *sm, dmu_tx_t *tx) +space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) { objset_t *os = sm->sm_os; spa_t *spa = dmu_objset_spa(os); @@ -440,7 +572,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx) */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != space_map_blksz) { + doi.doi_data_block_size != blocksize) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, @@ -449,7 +581,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx) space_map_free(sm, tx); dmu_buf_rele(sm->sm_dbuf, sm); - sm->sm_object = space_map_alloc(sm->sm_os, tx); + sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); VERIFY0(space_map_open_impl(sm)); } else { VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); @@ -482,7 +614,7 @@ space_map_update(space_map_t *sm) } uint64_t -space_map_alloc(objset_t *os, 
dmu_tx_t *tx) +space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); uint64_t object; @@ -496,8 +628,7 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx) bonuslen = SPACE_MAP_SIZE_V0; } - object = dmu_object_alloc(os, - DMU_OT_SPACE_MAP, space_map_blksz, + object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c index c1e85bdce..3b8526076 100644 --- a/module/zfs/uberblock.c +++ b/module/zfs/uberblock.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -60,6 +60,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) ub->ub_mmp_magic = MMP_MAGIC; ub->ub_mmp_delay = spa_multihost(rvd->vdev_spa) ? mmp_delay : 0; ub->ub_mmp_seq = 0; + ub->ub_checkpoint_txg = 0; return (ub->ub_rootbp.blk_birth == txg); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 37bb5a0c5..cf1bf2837 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -52,11 +52,22 @@ #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> +/* maximum number of metaslabs per top-level vdev */ +int vdev_max_ms_count = 200; + +/* minimum amount of metaslabs per top-level vdev */ +int vdev_min_ms_count = 16; + +/* see comment in vdev_metaslab_set_size() */ +int vdev_default_ms_shift = 29; + +int vdev_validate_skip = B_FALSE; + /* - * When a vdev is added, it will be divided into approximately (but no - * more than) this number of metaslabs. + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. */ -int metaslabs_per_vdev = 200; +int vdev_dtl_sm_blksz = (1 << 12); /* * Rate limit delay events to this many IO delays per second. @@ -74,7 +85,12 @@ unsigned int zfs_checksums_per_second = 20; */ int zfs_scan_ignore_errors = 0; -int vdev_validate_skip = B_FALSE; +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. + */ +int vdev_standard_sm_blksz = (1 << 17); /*PRINTFLIKE2*/ void @@ -926,6 +942,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; + tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; + svd->vdev_checkpoint_sm = NULL; + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; @@ -1169,6 +1188,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) void vdev_metaslab_fini(vdev_t *vd) { + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_feature_is_active(vd->vdev_spa, + SPA_FEATURE_POOL_CHECKPOINT)); + space_map_close(vd->vdev_checkpoint_sm); + /* + * Even though we close the space map, we need to set its + * pointer to NULL. The reason is that vdev_metaslab_fini() + * may be called multiple times for certain operations + * (i.e. when destroying a pool) so we need to ensure that + * this clause never executes twice. This logic is similar + * to the one used for the vdev_ms clause below. 
+ */ + vd->vdev_checkpoint_sm = NULL; + } + if (vd->vdev_ms != NULL) { uint64_t count = vd->vdev_ms_count; @@ -2095,11 +2129,39 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) void vdev_metaslab_set_size(vdev_t *vd) { + uint64_t asize = vd->vdev_asize; + uint64_t ms_shift = 0; + /* - * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. + * For vdevs that are bigger than 8G the metaslab size varies in + * a way that the number of metaslabs increases in powers of two, + * linearly in terms of vdev_asize, starting from 16 metaslabs. + * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32, + * and so on, until we hit the maximum metaslab count limit + * [vdev_max_ms_count] from which point the metaslab count stays + * the same. */ - vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); - vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); + ms_shift = vdev_default_ms_shift; + + if ((asize >> ms_shift) < vdev_min_ms_count) { + /* + * For devices that are less than 8G we want to have + * exactly 16 metaslabs. We don't want less as integer + * division rounds down, so less metaslabs mean more + * wasted space. We don't want more as these vdevs are + * small and in the likely event that we are running + * out of space, the SPA will have a hard time finding + * space due to fragmentation. + */ + ms_shift = highbit64(asize / vdev_min_ms_count); + ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT); + + } else if ((asize >> ms_shift) > vdev_max_ms_count) { + ms_shift = highbit64(asize / vdev_max_ms_count); + } + + vd->vdev_ms_shift = ms_shift; + ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void @@ -2204,7 +2266,7 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) return (B_FALSE); mutex_enter(&vd->vdev_dtl_lock); - if (range_tree_space(rt) != 0) + if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); @@ -2218,7 +2280,7 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); - empty = (range_tree_space(rt) == 0); + empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); @@ -2292,7 +2354,7 @@ vdev_dtl_should_excise(vdev_t *vd) return (B_FALSE); if (vd->vdev_resilver_txg == 0 || - range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) + range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); /* @@ -2396,8 +2458,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * the top level so that we persist the change. 
*/ if (vd->vdev_resilver_txg != 0 && - range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && - range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { + range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } @@ -2557,7 +2619,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, tx); + new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, @@ -2571,7 +2633,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); - space_map_truncate(vd->vdev_dtl_sm, tx); + space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); range_tree_vacate(rtsync, NULL, NULL); @@ -2642,7 +2704,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); @@ -2670,6 +2732,28 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) return (needed); } +/* + * Gets the checkpoint space map object from the vdev's ZAP. + * Returns the spacemap object, or 0 if it wasn't in the ZAP + * or the ZAP doesn't exist yet. + */ +int +vdev_checkpoint_sm_object(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (0); + } + + uint64_t sm_obj = 0; + int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); + + VERIFY(err == 0 || err == ENOENT); + + return (sm_obj); +} + int vdev_load(vdev_t *vd) { @@ -2705,6 +2789,35 @@ vdev_load(vdev_t *vd) VDEV_AUX_CORRUPT_DATA); return (error); } + + uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); + if (checkpoint_sm_obj != 0) { + objset_t *mos = spa_meta_objset(vd->vdev_spa); + ASSERT(vd->vdev_asize != 0); + ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); + + if ((error = space_map_open(&vd->vdev_checkpoint_sm, + mos, checkpoint_sm_obj, 0, vd->vdev_asize, + vd->vdev_ashift))) { + vdev_dbgmsg(vd, "vdev_load: space_map_open " + "failed for checkpoint spacemap (obj %llu) " + "[error=%d]", + (u_longlong_t)checkpoint_sm_obj, error); + return (error); + } + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + space_map_update(vd->vdev_checkpoint_sm); + + /* + * Since the checkpoint_sm contains free entries + * exclusively we can use sm_alloc to indicate the + * culmulative checkpointed space that has been freed. 
+ */ + vd->vdev_stat.vs_checkpoint_space = + -vd->vdev_checkpoint_sm->sm_alloc; + vd->vdev_spa->spa_checkpoint_info.sci_dspace += + vd->vdev_stat.vs_checkpoint_space; + } } /* @@ -2722,7 +2835,7 @@ vdev_load(vdev_t *vd) if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); - ASSERT(vd->vdev_obsolete_sm == NULL); + ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { @@ -2848,6 +2961,12 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg) mutex_exit(&msp->ms_lock); } + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_has_checkpoint(spa)); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + } + metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) @@ -3181,6 +3300,17 @@ top: error = spa_reset_logs(spa); + /* + * If the log device was successfully reset but has + * checkpointed data, do not offline it. + */ + if (error == 0 && + tvd->vdev_checkpoint_sm != NULL) { + ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, + !=, 0); + error = ZFS_ERR_CHECKPOINT_EXISTS; + } + spa_vdev_state_enter(spa, SCL_ALLOC); /* @@ -3419,6 +3549,23 @@ vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) } +boolean_t +vdev_is_spacemap_addressable(vdev_t *vd) +{ + /* + * Assuming 47 bits of the space map entry dedicated for the entry's + * offset (see description in space_map.h), we calculate the maximum + * address that can be described by a space map entry for the given + * device. + */ + uint64_t shift = vd->vdev_ashift + 47; + + if (shift >= 63) /* detect potential overflow */ + return (B_TRUE); + + return (vd->vdev_asize < (1ULL << shift)); +} + /* * Get statistics for the given vdev. */ @@ -4243,11 +4390,15 @@ EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ -module_param(metaslabs_per_vdev, int, 0644); -MODULE_PARM_DESC(metaslabs_per_vdev, +module_param(vdev_max_ms_count, int, 0644); +MODULE_PARM_DESC(vdev_max_ms_count, "Divide added vdev into approximately (but no more than) this number " "of metaslabs"); +module_param(vdev_min_ms_count, int, 0644); +MODULE_PARM_DESC(vdev_min_ms_count, + "Minimum number of metaslabs per top-level vdev"); + module_param(zfs_delays_per_second, uint, 0644); MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many " "IO delays per second"); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index a93e41258..b14b153b2 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -298,14 +298,13 @@ static const zio_vsd_ops_t vdev_indirect_vsd_ops = { }; /* - * Mark the given offset and size as being obsolete in the given txg. + * Mark the given offset and size as being obsolete. 
*/ void -vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) +vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; - ASSERT3U(spa_syncing_txg(spa), ==, txg); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); @@ -316,7 +315,7 @@ vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); - vdev_dirty(vd, 0, NULL, txg); + vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } @@ -334,7 +333,7 @@ spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx)); + vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * @@ -727,7 +726,8 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) return (0); VERIFY0(dsl_sync_task(spa_name(spa), NULL, - spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE)); + spa_condense_indirect_complete_sync, sci, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); return (0); } @@ -804,7 +804,8 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) if (vdev_obsolete_sm_object(vd) == 0) { uint64_t obsolete_sm_object = - space_map_alloc(spa->spa_meta_objset, tx); + space_map_alloc(spa->spa_meta_objset, + vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7ea8da1e6..29d7d651b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* @@ -352,6 +352,37 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) kmem_free(vsx, sizeof (*vsx)); } +static void +root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != spa->spa_root_vdev) + return; + + /* provide either current or previous scan information */ + pool_scan_stat_t ps; + if (spa_scan_get_stats(spa, &ps) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + pool_checkpoint_stat_t pcs; + if (spa_checkpoint_get_stats(spa, &pcs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, + sizeof (pcs) / sizeof (uint64_t)); + } +} + /* * Generate the nvlist representing this vdev's config. 
*/ @@ -474,20 +505,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (getstats) { vdev_config_generate_stats(vd, nv); - /* provide either current or previous scan information */ - pool_scan_stat_t ps; - if (spa_scan_get_stats(spa, &ps) == 0) { - fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, - sizeof (pool_scan_stat_t) / sizeof (uint64_t)); - } - - pool_removal_stat_t prs; - if (spa_removal_get_stats(spa, &prs) == 0) { - fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, - sizeof (prs) / sizeof (uint64_t)); - } + root_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context @@ -1525,11 +1543,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; - vdev_t *vd; - zio_t *zio; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + ASSERT(svdcount != 0); retry: /* * Normally, we don't want to try too hard to write every label and @@ -1571,9 +1588,10 @@ retry: * written in this txg will be committed to stable storage * before any uberblock that references them. */ - zio = zio_root(spa, NULL, NULL, flags); + zio_t *zio = zio_root(spa, NULL, NULL, flags); - for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + for (vdev_t *vd = + txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); @@ -1588,8 +1606,14 @@ retry: * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ - if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the even labels " + "of dirty vdevs", error, spa_name(spa)); + } goto retry; + } /* * Sync the uberblocks to all vdevs in svd[]. @@ -1606,8 +1630,13 @@ retry: * been successfully committed) will be valid with respect * to the new uberblocks. */ - if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_uberblock_sync_list() returned error " + "%d for pool '%s'", error, spa_name(spa)); + } goto retry; + } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); @@ -1622,8 +1651,14 @@ retry: * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the odd labels of " + "dirty vdevs", error, spa_name(spa)); + } goto retry; + } return (0); } diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f9084e8cf..f2bdd6389 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -117,6 +117,12 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; */ int vdev_removal_max_span = 32 * 1024; +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a removal. 
+ */ +unsigned long zfs_remove_max_bytes_pause = -1UL; + #define VDEV_REMOVAL_ZAP_OBJS "lzap" static void spa_vdev_remove_thread(void *arg); @@ -286,11 +292,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) * be copied. */ spa->spa_removing_phys.sr_to_copy -= - range_tree_space(ms->ms_freeingtree); + range_tree_space(ms->ms_freeing); - ASSERT0(range_tree_space(ms->ms_freedtree)); + ASSERT0(range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) - ASSERT0(range_tree_space(ms->ms_alloctree[t])); + ASSERT0(range_tree_space(ms->ms_allocating[t])); } /* @@ -467,19 +473,18 @@ spa_restart_removal(spa_t *spa) * and we correctly free already-copied data. */ void -free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) +free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t txg = spa_syncing_txg(spa); uint64_t max_offset_yet = 0; ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); - ASSERT3U(spa_syncing_txg(spa), ==, txg); mutex_enter(&svr->svr_lock); @@ -494,8 +499,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, * held, so that the remove_thread can not load this metaslab and then * visit this offset between the time that we metaslab_free_concrete() * and when we check to see if it has been visited. + * + * Note: The checkpoint flag is set to false as having/taking + * a checkpoint and removing a device can't happen at the same + * time. */ - metaslab_free_concrete(vd, offset, size, txg); + ASSERT(!spa_has_checkpoint(spa)); + metaslab_free_concrete(vd, offset, size, B_FALSE); uint64_t synced_size = 0; uint64_t synced_offset = 0; @@ -627,16 +637,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, * of this free. */ if (synced_size > 0) { - vdev_indirect_mark_obsolete(vd, synced_offset, synced_size, - txg); + vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); + /* * Note: this can only be called from syncing context, * and the vdev_indirect_mapping is only changed from the * sync thread, so we don't need svr_lock while doing * metaslab_free_impl_cb. */ + boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, - metaslab_free_impl_cb, &txg); + metaslab_free_impl_cb, &checkpoint); } } @@ -684,10 +695,10 @@ static void free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; - vdev_indirect_mark_obsolete(vd, offset, size, - vd->vdev_spa->spa_syncing_txg); + vdev_indirect_mark_obsolete(vd, offset, size); + boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg); + metaslab_free_impl_cb, &checkpoint); } /* @@ -1363,7 +1374,7 @@ spa_vdev_remove_thread(void *arg) * Assert nothing in flight -- ms_*tree is empty. 
*/ for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(range_tree_space(msp->ms_alloctree[i])); + ASSERT0(range_tree_space(msp->ms_allocating[i])); } /* @@ -1393,7 +1404,7 @@ spa_vdev_remove_thread(void *arg) SM_ALLOC)); space_map_close(sm); - range_tree_walk(msp->ms_freeingtree, + range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* @@ -1412,7 +1423,7 @@ spa_vdev_remove_thread(void *arg) msp->ms_id); while (!svr->svr_thread_exit && - range_tree_space(svr->svr_allocd_segs) != 0) { + !range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); @@ -1427,6 +1438,19 @@ spa_vdev_remove_thread(void *arg) */ spa_config_exit(spa, SCL_CONFIG, FTAG); + /* + * This delay will pause the removal around the point + * specified by zfs_remove_max_bytes_pause. We do this + * solely from the test suite or during debugging. + */ + uint64_t bytes_copied = + spa->spa_removing_phys.sr_copied; + for (int i = 0; i < TXG_SIZE; i++) + bytes_copied += svr->svr_bytes_done[i]; + while (zfs_remove_max_bytes_pause <= bytes_copied && + !svr->svr_thread_exit) + delay(hz); + mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { @@ -1567,10 +1591,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_alloctree[i])); + ASSERT0(range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_defertree[i])); - ASSERT0(range_tree_space(msp->ms_freedtree)); + ASSERT0(range_tree_space(msp->ms_defer[i])); + ASSERT0(range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { /* @@ -1586,7 +1610,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); - range_tree_walk(msp->ms_freeingtree, + range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* @@ -1662,7 +1686,8 @@ spa_vdev_remove_cancel(spa_t *spa) uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, - spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE); + spa_vdev_remove_cancel_sync, NULL, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); if (error == 0) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); @@ -1999,6 +2024,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) if (!locked) txg = spa_vdev_enter(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? 
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); + } + vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && @@ -2111,6 +2147,13 @@ module_param(vdev_removal_max_span, int, 0644); MODULE_PARM_DESC(vdev_removal_max_span, "Largest span of free chunks a remap segment can span"); +/* BEGIN CSTYLED */ +module_param(zfs_remove_max_bytes_pause, ulong, 0644); +MODULE_PARM_DESC(zfs_remove_max_bytes_pause, + "Pause device removal after this many bytes are copied " + "(debug use only - causes removal to hang)"); +/* END CSTYLED */ + EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); EXPORT_SYMBOL(spa_remove_init); diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c index dad09da50..475138621 100644 --- a/module/zfs/zcp.c +++ b/module/zfs/zcp.c @@ -1142,7 +1142,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, if (sync) { err = dsl_sync_task(poolname, NULL, - zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE); + zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL); if (err != 0) zcp_pool_error(&evalargs, poolname); } else { diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c index 196a3d4b7..e089666f2 100644 --- a/module/zfs/zcp_synctask.c +++ b/module/zfs/zcp_synctask.c @@ -110,7 +110,7 @@ static zcp_synctask_info_t zcp_synctask_destroy_info = { {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, {NULL, 0} }, - .space_check = ZFS_SPACE_CHECK_NONE, + .space_check = ZFS_SPACE_CHECK_DESTROY, .blocks_modified = 0 }; @@ -303,10 +303,9 @@ zcp_synctask_wrapper(lua_State *state) zcp_parse_args(state, info->name, info->pargs, info->kwargs); err = 0; - if (info->space_check != ZFS_SPACE_CHECK_NONE && funcspace > 0) { - uint64_t quota = dsl_pool_adjustedsize(dp, - info->space_check == ZFS_SPACE_CHECK_RESERVED) - - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + if (info->space_check != ZFS_SPACE_CHECK_NONE) { + uint64_t quota = dsl_pool_unreserved_space(dp, + info->space_check); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes + ri->zri_space_used; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f95b77db7..e70207aa5 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3731,6 +3731,29 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, } /* + * innvl: unused + * outnvl: empty + */ +/* ARGSUSED */ +static int +zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + return (spa_checkpoint(poolname)); +} + +/* + * innvl: unused + * outnvl: empty + */ +/* ARGSUSED */ +static int +zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + return (spa_checkpoint_discard(poolname)); +} + +/* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset @@ -6422,6 +6445,15 @@ zfs_ioctl_init(void) POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, + zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("zpool_discard_checkpoint", + ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, + zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, 
zfs_ioc_pool_freeze, diff --git a/module/zfs/zil.c b/module/zfs/zil.c index e8adc6d99..d0b1c1d14 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -29,6 +29,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/dmu.h> #include <sys/zap.h> #include <sys/arc.h> @@ -430,6 +431,35 @@ done: return (error); } +/* ARGSUSED */ +static int +zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + ASSERT(!BP_IS_HOLE(bp)); + + /* + * As we call this function from the context of a rewind to a + * checkpoint, each ZIL block whose txg is later than the txg + * that we rewind to is invalid. Thus, we return -1 so + * zil_parse() doesn't attempt to read it. + */ + if (bp->blk_birth >= first_txg) + return (-1); + + if (zil_bp_tree_add(zilog, bp) != 0) + return (0); + + zio_free(zilog->zl_spa, first_txg, bp); + return (0); +} + +/* ARGSUSED */ +static int +zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + return (0); +} + static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { @@ -476,7 +506,7 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { - zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } @@ -662,7 +692,7 @@ zil_create(zilog_t *zilog) txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { - zio_free_zil(zilog->zl_spa, txg, &blk); + zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } @@ -767,8 +797,8 @@ int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; - uint64_t first_txg = dmu_tx_get_txg(tx); zilog_t *zilog; + uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; @@ -790,10 +820,43 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); + first_txg = spa_min_claim_txg(zilog->zl_spa); - if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { - if (!BP_IS_HOLE(&zh->zh_log)) - zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); + /* + * If the spa_log_state is not set to be cleared, check whether + * the current uberblock is a checkpoint one and if the current + * header has been claimed before moving on. + * + * If the current uberblock is a checkpointed uberblock then + * one of the following scenarios took place: + * + * 1] We are currently rewinding to the checkpoint of the pool. + * 2] We crashed in the middle of a checkpoint rewind but we + * did manage to write the checkpointed uberblock to the + * vdev labels, so when we tried to import the pool again + * the checkpointed uberblock was selected from the import + * procedure. + * + * In both cases we want to zero out all the ZIL blocks, except + * the ones that have been claimed at the time of the checkpoint + * (their zh_claim_txg != 0). The reason is that these blocks + * may be corrupted since we may have reused their locations on + * disk after we took the checkpoint. + * + * We could try to set spa_log_state to SPA_LOG_CLEAR earlier + * when we first figure out whether the current uberblock is + * checkpointed or not. Unfortunately, that would discard all + * the logs, including the ones that are claimed, and we would + * leak space. 
+ */ + if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || + (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0)) { + if (!BP_IS_HOLE(&zh->zh_log)) { + (void) zil_parse(zilog, zil_clear_log_block, + zil_noop_log_record, tx, first_txg, B_FALSE); + } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; @@ -803,6 +866,12 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) } /* + * If we are not rewinding and opening the pool normally, then + * the min_claim_txg should be equal to the first txg of the pool. + */ + ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); + + /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), @@ -855,16 +924,17 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; - /* - * Check the first block and determine if it's on a log device - * which may have been removed or faulted prior to loading this - * pool. If so, there's no point in checking the rest of the log - * as its content should have already been synced to the pool. - */ if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; + /* + * Check the first block and determine if it's on a log device + * which may have been removed or faulted prior to loading this + * pool. If so, there's no point in checking the rest of the + * log as its content should have already been synced to the + * pool. + */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) @@ -873,6 +943,18 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) if (!valid) return (0); + + /* + * Check whether the current uberblock is checkpointed (e.g. + * we are rewinding) and whether the current header has been + * claimed or not. If it hasn't then skip verifying it. We + * do this because its ZIL blocks may be part of the pool's + * state before the rewind, which is no longer valid. + */ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return (0); } /* @@ -883,8 +965,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, - zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa), - B_FALSE); + zilog->zl_header->zh_claim_txg ? -1ULL : + spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 8a495988b..9a98d4fc0 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1147,8 +1147,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
*/ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); - ASSERT(txg == spa_first_txg(spa) || txg == 0); + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + spa_min_claim_txg(spa)); + ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), @@ -3458,18 +3459,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, } /* - * Free an intent log block. - */ -void -zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) -{ - ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); - ASSERT(!BP_IS_GANG(bp)); - - zio_free(spa, txg, bp); -} - -/* * ========================================================================== * Read and write to physical devices * ========================================================================== diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index dc0f6d983..1c4a8e02c 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -235,8 +235,6 @@ zthr_destroy(zthr_t *t) void zthr_wakeup(zthr_t *t) { - ASSERT3P(t->zthr_thread, !=, NULL); - mutex_enter(&t->zthr_lock); cv_broadcast(&t->zthr_cv); mutex_exit(&t->zthr_lock); |
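The spa_suspend_async_destroy() helper above stops async destroy processing once a checkpoint exists and the space available to destroy-class operations (unreserved space minus the root dir's used bytes, clamped at zero) reaches zero. A toy model of that decision follows, with made-up sizes and a plain function standing in for the SPA and DSL pool structures; suspend_async_destroy() here is illustrative, not the real signature:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of spa_suspend_async_destroy(): async destroys stop once the
 * space that destroy-class operations may consume is exhausted while a
 * checkpoint is holding freed blocks alive. All sizes are made up.
 */
static int
suspend_async_destroy(uint64_t unreserved, uint64_t used, int has_checkpoint)
{
        uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;

        return (has_checkpoint && avail == 0);
}

int
main(void)
{
        /* plenty of headroom: keep destroying even with a checkpoint */
        printf("%d\n", suspend_async_destroy(90ULL << 30, 40ULL << 30, 1));
        /* out of unreserved space while checkpointed: suspend */
        printf("%d\n", suspend_async_destroy(90ULL << 30, 95ULL << 30, 1));
        /* same shortfall without a checkpoint: keep going */
        printf("%d\n", suspend_async_destroy(90ULL << 30, 95ULL << 30, 0));
        return (0);
}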
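space_map_incremental_destroy() above walks the space map from its tail so the object's starting offset never has to move: it reads a tail-sized chunk, applies the callback to each entry from last to first while shrinking the recorded length, and loops until the map is empty or the callback fails. The sketch below shows only the shape of that loop; the on-disk SM_*_DECODE entry format is replaced by a plain array of opaque values purely for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified shape of the tail-first destroy loop: "entries" stands in for
 * the on-disk space map and "len" for smp_objsize in entries; real entries
 * would be decoded with SM_*_DECODE before invoking the callback.
 */
typedef int (*destroy_cb_t)(uint64_t entry, void *arg);

static int
incremental_destroy(uint64_t *entries, uint64_t nentries, uint64_t bufents,
    destroy_cb_t cb, void *arg)
{
        uint64_t len = nentries;
        int error = 0;

        while (len > 0 && error == 0) {
                uint64_t num = (bufents < len) ? bufents : len;
                uint64_t off = len - num;       /* read the tail chunk */

                while (num > 0) {
                        error = cb(entries[off + num - 1], arg);
                        if (error != 0)
                                break;
                        len--;          /* entry is now gone from the map */
                        num--;
                }
        }
        return (error);
}

static int
print_cb(uint64_t e, void *arg)
{
        (void) arg;
        printf("destroying entry %llu\n", (unsigned long long)e);
        return (0);
}

int
main(void)
{
        uint64_t map[] = { 1, 2, 3, 4, 5, 6, 7 };

        /* processed as 7,6,5,4,3,2,1 using a 3-entry buffer */
        return (incremental_destroy(map, 7, 3, print_cb, NULL));
}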
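The rewritten vdev_metaslab_set_size() above replaces the old fixed metaslabs-per-vdev heuristic with three tunables: a default 2^29-byte metaslab (vdev_default_ms_shift), a floor of vdev_min_ms_count metaslabs for small vdevs, and a cap of roughly vdev_max_ms_count for large ones. A standalone sketch of the arithmetic, with highbit64() and MAX() reimplemented locally and the vdev sizes in main() chosen arbitrarily:

#include <stdint.h>
#include <stdio.h>

#define SPA_MAXBLOCKSHIFT       24      /* 16MB, as in spa.h */

static int vdev_max_ms_count = 200;
static int vdev_min_ms_count = 16;
static int vdev_default_ms_shift = 29;  /* 512MB metaslabs by default */

/* position of the highest set bit, counting from 1, like ZFS's highbit64() */
static int
highbit64(uint64_t x)
{
        int h = 0;

        while (x != 0) {
                h++;
                x >>= 1;
        }
        return (h);
}

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

static uint64_t
ms_shift_for(uint64_t asize)
{
        uint64_t ms_shift = vdev_default_ms_shift;

        if ((asize >> ms_shift) < vdev_min_ms_count) {
                /* small vdev: pin the metaslab count near vdev_min_ms_count */
                ms_shift = highbit64(asize / vdev_min_ms_count);
                ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
        } else if ((asize >> ms_shift) > vdev_max_ms_count) {
                /* large vdev: grow metaslabs to cap the count */
                ms_shift = highbit64(asize / vdev_max_ms_count);
        }
        return (ms_shift);
}

int
main(void)
{
        uint64_t sizes[] = { 2ULL << 30, 8ULL << 30, 64ULL << 30, 1ULL << 40 };

        for (int i = 0; i < 4; i++) {
                uint64_t shift = ms_shift_for(sizes[i]);

                printf("asize %llu GB -> ms_shift %llu (%llu metaslabs)\n",
                    (unsigned long long)(sizes[i] >> 30),
                    (unsigned long long)shift,
                    (unsigned long long)(sizes[i] >> shift));
        }
        return (0);
}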
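spa_top_vdevs_spacemap_addressable() above simply applies vdev_is_spacemap_addressable() to every top-level vdev; the latter checks whether the vdev's size can be described by a space map entry whose offset field, per the comment, gets 47 bits in units of 1 << ashift. A small worked sketch of that bound; SM_OFFSET_BITS is a local name used here for illustration, not a macro taken from space_map.h:

#include <stdint.h>
#include <stdio.h>

/* bits of a space map entry that hold the offset, per the comment above */
#define SM_OFFSET_BITS  47

static int
spacemap_addressable(uint64_t asize, uint64_t ashift)
{
        uint64_t shift = ashift + SM_OFFSET_BITS;

        if (shift >= 63)        /* guard against shifting too far */
                return (1);

        return (asize < (1ULL << shift));
}

int
main(void)
{
        /* maximum addressable vdev size for 512B and 4K sectors */
        printf("ashift=9:  limit = %llu TiB\n",
            (unsigned long long)((1ULL << (9 + SM_OFFSET_BITS)) >> 40));
        printf("ashift=12: limit = %llu TiB\n",
            (unsigned long long)((1ULL << (12 + SM_OFFSET_BITS)) >> 40));

        printf("100 TiB vdev, ashift=9: %s\n",
            spacemap_addressable(100ULL << 40, 9) ? "ok" : "too big");
        return (0);
}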
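In vdev_load() above the checkpoint space map records only frees, so its sm_alloc runs negative and its negation is the amount of space on that vdev still held by the checkpoint; vdev_load() accumulates that into spa_checkpoint_info.sci_dspace, which spa_get_checkpoint_space() then returns for the whole pool. A minimal sketch of the sign convention and accumulation, with the structures trimmed down to invented stand-ins:

#include <stdint.h>
#include <stdio.h>

/* illustrative stand-ins for the fields used in vdev_load() above */
typedef struct {
        int64_t sm_alloc;       /* goes negative: only frees are recorded */
} ckpt_sm_t;

typedef struct {
        ckpt_sm_t checkpoint_sm;
        int64_t vs_checkpoint_space;
} model_vdev_t;

int
main(void)
{
        model_vdev_t vdevs[2] = {
                { { -(4LL << 30) }, 0 },        /* 4 GiB freed since checkpoint */
                { { -(1LL << 30) }, 0 },        /* 1 GiB freed since checkpoint */
        };
        int64_t sci_dspace = 0;

        for (int i = 0; i < 2; i++) {
                vdevs[i].vs_checkpoint_space =
                    -vdevs[i].checkpoint_sm.sm_alloc;
                sci_dspace += vdevs[i].vs_checkpoint_space;
        }
        /* pool-wide checkpointed space, as reported by the SPA */
        printf("checkpoint space: %lld GiB\n", (long long)(sci_dspace >> 30));
        return (0);
}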
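The zil.c and zio.c changes above hinge on two simple rules: spa_min_claim_txg() moves the claim floor to just past the checkpoint txg when a checkpointed uberblock is in use, and zil_claim() clears rather than claims any ZIL whose header was not already claimed at checkpoint time, since its blocks may have been reused after the checkpoint was taken. A compact model of both decisions; model_spa_t and the helper names are invented for the sketch:

#include <stdint.h>
#include <stdio.h>

/*
 * Minimal model of the claim-txg rule added above: with a checkpointed
 * uberblock, claiming starts right after the checkpoint txg, and any ZIL
 * header that was not already claimed at checkpoint time gets cleared.
 */
typedef struct {
        uint64_t ub_checkpoint_txg;     /* 0 when no checkpoint is in use */
        uint64_t spa_first_txg;
} model_spa_t;

static uint64_t
min_claim_txg(const model_spa_t *spa)
{
        if (spa->ub_checkpoint_txg != 0)
                return (spa->ub_checkpoint_txg + 1);
        return (spa->spa_first_txg);
}

static const char *
zil_action(const model_spa_t *spa, uint64_t zh_claim_txg)
{
        if (spa->ub_checkpoint_txg != 0 && zh_claim_txg == 0)
                return ("clear unclaimed ZIL blocks");
        return ("claim normally");
}

int
main(void)
{
        model_spa_t normal = { 0, 1000 };
        model_spa_t rewind = { 500, 1000 };

        printf("normal import: claim from txg %llu, %s\n",
            (unsigned long long)min_claim_txg(&normal),
            zil_action(&normal, 0));
        printf("checkpoint rewind: claim from txg %llu, %s\n",
            (unsigned long long)min_claim_txg(&rewind),
            zil_action(&rewind, 0));
        return (0);
}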