32 files changed, 2409 insertions, 514 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 98576d1d2..f1e32a19d 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o
@@ -59,6 +60,8 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature_common.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 6ec9f04b7..5e21b1b71 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -189,6 +189,7 @@ unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_disable_dup_eviction = 0;
 int zfs_arc_meta_prune = 0;
 
 /*
@@ -307,6 +308,9 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_hdr_size;
 	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_duplicate_buffers;
+	kstat_named_t arcstat_duplicate_buffers_size;
+	kstat_named_t arcstat_duplicate_reads;
 	kstat_named_t arcstat_memory_direct_count;
 	kstat_named_t arcstat_memory_indirect_count;
 	kstat_named_t arcstat_no_grow;
@@ -387,6 +391,9 @@ static arc_stats_t arc_stats = {
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
+	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
+	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
@@ -1369,6 +1376,17 @@ arc_buf_clone(arc_buf_t *from)
 	hdr->b_buf = buf;
 	arc_get_data_buf(buf);
 	bcopy(from->b_data, buf->b_data, size);
+
+	/*
+	 * This buffer already exists in the arc so create a duplicate
+	 * copy for the caller.  If the buffer is associated with user data
+	 * then track the size and number of duplicates.  These stats will be
+	 * updated as duplicate buffers are created and destroyed.
+	 */
+	if (hdr->b_type == ARC_BUFC_DATA) {
+		ARCSTAT_BUMP(arcstat_duplicate_buffers);
+		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+	}
 	hdr->b_datacnt += 1;
 	return (buf);
 }
@@ -1467,6 +1485,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 		ASSERT3U(state->arcs_size, >=, size);
 		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
+
+		/*
+		 * If we're destroying a duplicate buffer make sure
+		 * that the appropriate statistics are updated.
+		 */
+		if (buf->b_hdr->b_datacnt > 1 &&
+		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
+			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+		}
 		ASSERT(buf->b_hdr->b_datacnt > 0);
 		buf->b_hdr->b_datacnt -= 1;
 	}
@@ -1652,6 +1680,48 @@ arc_buf_size(arc_buf_t *buf)
 }
 
 /*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr;
+	boolean_t evict_needed = B_FALSE;
+
+	if (zfs_disable_dup_eviction)
+		return (B_FALSE);
+
+	mutex_enter(&buf->b_evict_lock);
+	hdr = buf->b_hdr;
+	if (hdr == NULL) {
+		/*
+		 * We are in arc_do_user_evicts(); let that function
+		 * perform the eviction.
+		 */
+		ASSERT(buf->b_data == NULL);
+		mutex_exit(&buf->b_evict_lock);
+		return (B_FALSE);
+	} else if (buf->b_data == NULL) {
+		/*
+		 * We have already been added to the arc eviction list;
+		 * recommend eviction.
+		 */
+		ASSERT3P(hdr, ==, &arc_eviction_hdr);
+		mutex_exit(&buf->b_evict_lock);
+		return (B_TRUE);
+	}
+
+	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+		evict_needed = B_TRUE;
+
+	mutex_exit(&buf->b_evict_lock);
+	return (evict_needed);
+}
+
+/*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -2729,9 +2799,11 @@ arc_read_done(zio_t *zio)
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
+		dmu_object_byteswap_t bswap =
+		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
 		    byteswap_uint64_array :
-		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+		    dmu_ot_byteswap[bswap].ob_func;
 		func(buf->b_data, hdr->b_size);
 	}
 
@@ -2751,8 +2823,10 @@ arc_read_done(zio_t *zio)
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
 		if (acb->acb_done) {
-			if (abuf == NULL)
+			if (abuf == NULL) {
+				ARCSTAT_BUMP(arcstat_duplicate_reads);
 				abuf = arc_buf_clone(buf);
+			}
 			acb->acb_buf = abuf;
 			abuf = NULL;
 		}
@@ -3322,6 +3396,16 @@ arc_release(arc_buf_t *buf, void *tag)
 			ASSERT3U(*size, >=, hdr->b_size);
 			atomic_add_64(size, -hdr->b_size);
 		}
+
+		/*
+		 * We're releasing a duplicate user data buffer, update
+		 * our statistics accordingly.
+		 */
+		if (hdr->b_type == ARC_BUFC_DATA) {
+			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+			    -hdr->b_size);
+		}
 		hdr->b_datacnt -= 1;
 		arc_cksum_verify(buf);
 
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 022921c66..d5f8d4072 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -20,13 +20,61 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bpobj.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	zfeature_info_t *empty_bpobj_feat =
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+	spa_t *spa = dmu_objset_spa(os);
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	if (spa_feature_is_enabled(spa, empty_bpobj_feat)) {
+		if (!spa_feature_is_active(spa, empty_bpobj_feat)) {
+			ASSERT3U(dp->dp_empty_bpobj, ==, 0);
+			dp->dp_empty_bpobj =
+			    bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+			VERIFY(zap_add(os,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+			    &dp->dp_empty_bpobj, tx) == 0);
+		}
+		spa_feature_incr(spa, empty_bpobj_feat, tx);
+		ASSERT(dp->dp_empty_bpobj != 0);
+		return (dp->dp_empty_bpobj);
+	} else {
+		return (bpobj_alloc(os, blocksize, tx));
+	}
+}
+
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+	zfeature_info_t *empty_bpobj_feat =
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx);
+	if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) {
+		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_EMPTY_BPOBJ, tx));
+		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+		dp->dp_empty_bpobj = 0;
+	}
+}
 
 uint64_t
 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
@@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 	int epb;
 	dmu_buf_t *dbuf = NULL;
 
+	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
 	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
 
 	mutex_enter(&bpo.bpo_lock);
@@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 
 	ASSERT(bpo->bpo_havesubobj);
 	ASSERT(bpo->bpo_havecomp);
+	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+		bpobj_decr_empty(bpo->bpo_os, tx);
+		return;
+	}
 
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
@@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
 	blkptr_t *bparray;
 
 	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
 	/* We never need the fill count. */
 	stored_bp.blk_fill = 0;
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
new file mode 100644
index 000000000..8c5a7d40e
--- /dev/null
+++ b/module/zfs/bptree.c
@@ -0,0 +1,224 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/refcount.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
+struct bptree_args {
+	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
+	boolean_t ba_free;	/* true if freeing during traversal */
+
+	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
+	void *ba_arg;		/* caller supplied argument to ba_func */
+	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	uint64_t obj;
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+
+	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+	    SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+	    sizeof (bptree_phys_t), tx);
+
+	/*
+	 * Bonus buffer contents are already initialized to 0, but for
+	 * readability we make it explicit.
+	 */
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	bt = db->db_data;
+	bt->bt_begin = 0;
+	bt->bt_end = 0;
+	bt->bt_bytes = 0;
+	bt->bt_comp = 0;
+	bt->bt_uncomp = 0;
+	dmu_buf_rele(db, FTAG);
+
+	return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+	ASSERT3U(bt->bt_bytes, ==, 0);
+	ASSERT3U(bt->bt_comp, ==, 0);
+	ASSERT3U(bt->bt_uncomp, ==, 0);
+	dmu_buf_rele(db, FTAG);
+
+	return (dmu_object_free(os, obj, tx));
+}
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	bptree_entry_phys_t bte;
+
+	/*
+	 * bptree objects are in the pool mos, therefore they can only be
+	 * modified in syncing context. Furthermore, this is only modified
+	 * by the sync thread, so no locking is necessary.
+	 */
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+
+	bte.be_birth_txg = birth_txg;
+	bte.be_bp = *bp;
+	bzero(&bte.be_zb, sizeof (bte.be_zb));
+	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+
+	dmu_buf_will_dirty(db, tx);
+	bt->bt_end++;
+	bt->bt_bytes += bytes;
+	bt->bt_comp += comp;
+	bt->bt_uncomp += uncomp;
+	dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	int err;
+	struct bptree_args *ba = arg;
+
+	if (bp == NULL)
+		return (0);
+
+	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+	if (err == 0 && ba->ba_free) {
+		ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+		ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+		ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+	}
+	return (err);
+}
+
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+    void *arg, dmu_tx_t *tx)
+{
+	int err;
+	uint64_t i;
+	dmu_buf_t *db;
+	struct bptree_args ba;
+
+	ASSERT(!free || dmu_tx_is_syncing(tx));
+
+	err = dmu_bonus_hold(os, obj, FTAG, &db);
+	if (err != 0)
+		return (err);
+
+	if (free)
+		dmu_buf_will_dirty(db, tx);
+
+	ba.ba_phys = db->db_data;
+	ba.ba_free = free;
+	ba.ba_func = func;
+	ba.ba_arg = arg;
+	ba.ba_tx = tx;
+
+	err = 0;
+	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+		bptree_entry_phys_t bte;
+
+		ASSERT(!free || i == ba.ba_phys->bt_begin);
+
+		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+		    &bte, DMU_READ_NO_PREFETCH);
+		if (err != 0)
+			break;
+
+		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+		    bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST,
+		    bptree_visit_cb, &ba);
+		if (free) {
+			ASSERT(err == 0 || err == ERESTART);
+			if (err != 0) {
+				/* save bookmark for future resume */
+				ASSERT3U(bte.be_zb.zb_objset, ==,
+				    ZB_DESTROYED_OBJSET);
+				ASSERT3U(bte.be_zb.zb_level, ==, 0);
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
+				break;
+			} else {
+				ba.ba_phys->bt_begin++;
+				(void) dmu_free_range(os, obj,
+				    i * sizeof (bte), sizeof (bte), tx);
+			}
+		}
+	}
+
+	ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+	/* if all blocks are free there should be no used space */
+	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		ASSERT3U(ba.ba_phys->bt_bytes, ==, 0);
+		ASSERT3U(ba.ba_phys->bt_comp, ==, 0);
+		ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0);
+	}
+
+	dmu_buf_rele(db, FTAG);
+
+	return (err);
+}
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 1f6fa9340..205abaada 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -260,7 +261,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
 		boolean_t is_metadata;
 
 		DB_DNODE_ENTER(db);
-		is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 		DB_DNODE_EXIT(db);
 
 		return (is_metadata);
@@ -2188,7 +2189,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 			dbuf_evict(db);
 		} else {
 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
-			if (!DBUF_IS_CACHEABLE(db))
+
+			/*
+			 * A dbuf will be eligible for eviction if either the
+			 * 'primarycache' property is set or a duplicate
+			 * copy of this buffer is already cached in the arc.
+			 *
+			 * In the case of the 'primarycache' a buffer
+			 * is considered for eviction if it matches the
+			 * criteria set in the property.
+			 *
+			 * To decide if our buffer is considered a
+			 * duplicate, we must call into the arc to determine
+			 * if multiple buffers are referencing the same
+			 * block on-disk. If so, then we simply evict
+			 * ourselves.
+			 */
+			if (!DBUF_IS_CACHEABLE(db) ||
+			    arc_buf_eviction_needed(db->db_buf))
 				dbuf_clear(db);
 			else
 				mutex_exit(&db->db_mtx);
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 6221157f8..ef868619a 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1120,11 +1121,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
 
 	if (spa->spa_ddt_stat_object == 0) {
-		spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
-		    DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
-		VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
-		    &spa->spa_ddt_stat_object, tx) == 0);
+		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DDT_STATS, tx);
 	}
 
 	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 00a7a07f4..e85635654 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -46,60 +47,73 @@
 #endif
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
-	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
-	{	zap_byteswap,		TRUE,	"object directory"	},
-	{	byteswap_uint64_array,	TRUE,	"object array"		},
-	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
-	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
-	{	byteswap_uint64_array,	TRUE,	"bpobj"			},
-	{	byteswap_uint64_array,	TRUE,	"bpobj header"		},
-	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
-	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
-	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
-	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
-	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
-	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
-	{	zap_byteswap,		TRUE,	"DSL directory child map"},
-	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
-	{	zap_byteswap,		TRUE,	"DSL props"		},
-	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
-	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
-	{	zfs_oldacl_byteswap,	TRUE,	"ZFS V0 ACL"		},
-	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
-	{	zap_byteswap,		TRUE,	"ZFS directory"		},
-	{	zap_byteswap,		TRUE,	"ZFS master node"	},
-	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
-	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
-	{	zap_byteswap,		TRUE,	"zvol prop"		},
-	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
-	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
-	{	zap_byteswap,		TRUE,	"other ZAP"		},
-	{	zap_byteswap,		TRUE,	"persistent error log"	},
-	{	byteswap_uint8_array,	TRUE,	"SPA history"		},
-	{	byteswap_uint64_array,	TRUE,	"SPA history offsets"	},
-	{	zap_byteswap,		TRUE,	"Pool properties"	},
-	{	zap_byteswap,		TRUE,	"DSL permissions"	},
-	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
-	{	byteswap_uint8_array,	TRUE,	"ZFS SYSACL"		},
-	{	byteswap_uint8_array,	TRUE,	"FUID table"		},
-	{	byteswap_uint64_array,	TRUE,	"FUID table size"	},
-	{	zap_byteswap,		TRUE,	"DSL dataset next clones"},
-	{	zap_byteswap,		TRUE,	"scan work queue"	},
-	{	zap_byteswap,		TRUE,	"ZFS user/group used"	},
-	{	zap_byteswap,		TRUE,	"ZFS user/group quota"	},
-	{	zap_byteswap,		TRUE,	"snapshot refcount tags"},
-	{	zap_byteswap,		TRUE,	"DDT ZAP algorithm"	},
-	{	zap_byteswap,		TRUE,	"DDT statistics"	},
-	{	byteswap_uint8_array,	TRUE,	"System attributes"	},
-	{	zap_byteswap,		TRUE,	"SA master node"	},
-	{	zap_byteswap,		TRUE,	"SA attr registration"	},
-	{	zap_byteswap,		TRUE,	"SA attr layouts"	},
-	{	zap_byteswap,		TRUE,	"scan translations"	},
-	{	byteswap_uint8_array,	FALSE,	"deduplicated block"	},
-	{	zap_byteswap,		TRUE,	"DSL deadlist map"	},
-	{	byteswap_uint64_array,	TRUE,	"DSL deadlist map hdr"	},
-	{	zap_byteswap,		TRUE,	"DSL dir clones"	},
-	{	byteswap_uint64_array,	TRUE,	"bpobj subobj"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
+	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
+	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
+	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
+	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
+	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
+	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
+	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
+	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
+	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
+	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
+};
+
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+	{	byteswap_uint8_array,	"uint8"		},
+	{	byteswap_uint16_array,	"uint16"	},
+	{	byteswap_uint32_array,	"uint32"	},
+	{	byteswap_uint64_array,	"uint64"	},
+	{	zap_byteswap,		"zap"		},
+	{	dnode_buf_byteswap,	"dnode"		},
+	{	dmu_objset_byteswap,	"objset"	},
+	{	zfs_znode_byteswap,	"znode"		},
+	{	zfs_oldacl_byteswap,	"oldacl"	},
+	{	zfs_acl_byteswap,	"acl"		}
 };
 
 int
@@ -176,7 +190,7 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
-	if (type > DMU_OT_NUMTYPES) {
+	if (!DMU_OT_IS_VALID(type)) {
 		error = EINVAL;
 	} else if (dn->dn_bonus != db) {
 		error = EINVAL;
@@ -1695,7 +1709,7 @@ void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
-	boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
@@ -1939,15 +1953,15 @@ dmu_init(void)
 	dbuf_init();
 	zfetch_init();
 	dmu_tx_init();
-	arc_init();
 	l2arc_init();
+	arc_init();
 }
 
 void
 dmu_fini(void)
 {
-	l2arc_fini();
 	arc_fini();
+	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
 	dbuf_fini();
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 949f4d773..0cf3c4a9a 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1077,8 +1077,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 	void *data = NULL;
 
 	if (drro->drr_type == DMU_OT_NONE ||
-	    drro->drr_type >= DMU_OT_NUMTYPES ||
-	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+	    !DMU_OT_IS_VALID(drro->drr_type) ||
+	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
@@ -1143,7 +1143,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
 		if (ra->byteswap) {
-			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+			dmu_object_byteswap_t byteswap =
+			    DMU_OT_BYTESWAP(drro->drr_bonustype);
+			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    drro->drr_bonuslen);
 		}
 		dmu_buf_rele(db, FTAG);
@@ -1186,7 +1188,7 @@ restore_write(struct restorearg *ra, objset_t *os,
 	int err;
 
 	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
-	    drrw->drr_type >= DMU_OT_NUMTYPES)
+	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (EINVAL);
 
 	data = restore_read(ra, drrw->drr_length);
@@ -1205,8 +1207,11 @@ restore_write(struct restorearg *ra, objset_t *os,
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	if (ra->byteswap)
-		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+	if (ra->byteswap) {
+		dmu_object_byteswap_t byteswap =
+		    DMU_OT_BYTESWAP(drrw->drr_type);
+		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
+	}
 	dmu_write(os, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length, data, tx);
 	dmu_tx_commit(tx);
@@ -1604,13 +1609,6 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 	dsl_dataset_t *ds = drc->drc_logical_ds;
 	int err, myerr;
 
-	/*
-	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
-	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
-	 * can close it.
-	 */
-	txg_wait_synced(ds->ds_dir->dd_pool, 0);
-
 	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
 		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
 		    drc->drc_force);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 376f60f82..980c1aa2f 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -53,6 +54,7 @@ typedef struct traverse_data {
 	uint64_t td_objset;
 	blkptr_t *td_rootbp;
 	uint64_t td_min_txg;
+	zbookmark_t *td_resume;
 	int td_flags;
 	prefetch_data_t *td_pfd;
 	blkptr_cb_t *td_func;
@@ -128,6 +130,54 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh)
 	zil_free(zilog);
 }
 
+typedef enum resume_skip {
+	RESUME_SKIP_ALL,
+	RESUME_SKIP_NONE,
+	RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+    const zbookmark_t *zb)
+{
+	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+		/*
+		 * If we already visited this bp & everything below,
+		 * don't bother doing it again.
+		 */
+		if (zbookmark_is_before(dnp, zb, td->td_resume))
+			return (RESUME_SKIP_ALL);
+
+		/*
+		 * If we found the block we're trying to resume from, zero
+		 * the bookmark out to indicate that we have resumed.
+		 */
+		ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
+		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+			bzero(td->td_resume, sizeof (*zb));
+			if (td->td_flags & TRAVERSE_POST)
+				return (RESUME_SKIP_CHILDREN);
+		}
+	}
+	return (RESUME_SKIP_NONE);
+}
+
+static void
+traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
+{
+	ASSERT(td->td_resume != NULL);
+	ASSERT3U(zb->zb_level, ==, 0);
+	bcopy(zb, td->td_resume, sizeof (*td->td_resume));
+}
+
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
@@ -137,8 +187,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
+	boolean_t pause = B_FALSE;
+
+	switch (resume_skip_check(td, dnp, zb)) {
+	case RESUME_SKIP_ALL:
+		return (0);
+	case RESUME_SKIP_CHILDREN:
+		goto post;
+	case RESUME_SKIP_NONE:
+		break;
+	default:
+		ASSERT(0);
+	}
 
-	if (bp->blk_birth == 0) {
+	if (BP_IS_HOLE(bp)) {
 		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
 		    td->td_arg);
 		return (err);
@@ -164,8 +226,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
-		if (err)
-			return (err);
+		if (err == ERESTART)
+			pause = B_TRUE; /* handle pausing at a common point */
+		if (err != 0)
+			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
@@ -253,9 +317,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
 
+post:
 	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
 		    td->td_arg);
+		if (err == ERESTART)
+			pause = B_TRUE;
+	}
+
+	if (pause && td->td_resume != NULL) {
+		ASSERT3U(err, ==, ERESTART);
+		ASSERT(!hard);
+		traverse_pause(td, zb);
 	}
 
 	return (err != 0 ? err : lasterr);
@@ -353,22 +426,27 @@ traverse_prefetch_thread(void *arg)
  * in syncing context).
  */
 static int
-traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
-    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+    uint64_t txg_start, zbookmark_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
 {
 	traverse_data_t *td;
 	prefetch_data_t *pd;
 	zbookmark_t *czb;
 	int err;
 
+	ASSERT(ds == NULL || objset == ds->ds_object);
+	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
 	td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE);
 	pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE);
 	czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE);
 
 	td->td_spa = spa;
-	td->td_objset = ds ? ds->ds_object : 0;
+	td->td_objset = objset;
 	td->td_rootbp = rootbp;
 	td->td_min_txg = txg_start;
+	td->td_resume = resume;
 	td->td_func = func;
 	td->td_arg = arg;
 	td->td_pfd = pd;
@@ -424,8 +502,17 @@ int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
-	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
+	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+	    &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+    uint64_t txg_start, zbookmark_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
+{
+	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
+	    blkptr, txg_start, resume, flags, func, arg));
 }
 
 /*
@@ -442,8 +529,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
 	boolean_t hard = (flags & TRAVERSE_HARD);
 
 	/* visit the MOS */
-	err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
-	    txg_start, flags, func, arg);
+	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+	    txg_start, NULL, flags, func, arg);
 	if (err)
 		return (err);
 
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 81c6dfea2..47ec4c109 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -20,9 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-/*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -693,7 +692,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 		return;
 	}
 
-	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+	ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
 	if (dn->dn_maxblkid == 0 && !add) {
 		blkptr_t *bp;
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 99ac62565..3a8a5e32e 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -194,7 +195,7 @@ dnode_verify(dnode_t *dn)
 	ASSERT(dn->dn_objset);
 	ASSERT(dn->dn_handle->dnh_dnode == dn);
 
-	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 		return;
@@ -212,7 +213,7 @@ dnode_verify(dnode_t *dn)
 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 		}
 		ASSERT3U(dn->dn_nlevels, <=, 30);
-		ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 		ASSERT3U(dn->dn_nblkptr, >=, 1);
 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
@@ -278,8 +279,10 @@ dnode_byteswap(dnode_phys_t *dnp)
 		 */
 		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 		size_t len = DN_MAX_BONUSLEN - off;
-		ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
-		dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+		dmu_object_byteswap_t byteswap;
+		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
+		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 	}
 
 	/* Swap SPILL block if we have one */
@@ -407,7 +410,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 
 	dmu_zfetch_init(&dn->dn_zfetch, dn);
 
-	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	mutex_enter(&os->os_lock);
 	list_insert_head(&os->os_dnodes, dn);
@@ -496,11 +499,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 	ASSERT(ot != DMU_OT_NONE);
-	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(ot));
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
-	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 	ASSERT(dn->dn_type == DMU_OT_NONE);
 	ASSERT3U(dn->dn_maxblkid, ==, 0);
@@ -568,7 +571,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0));
-	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
 	/* clean up any unreferenced dbufs */
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index f2dda867d..58fa473cb 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -18,8 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -598,7 +600,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 	}
 
 	if (dn->dn_next_bonustype[txgoff]) {
-		ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
 		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
 		dn->dn_next_bonustype[txgoff] = 0;
 	}
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 21fdd081c..c5b84a26c 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
@@ -35,6 +35,7 @@
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
+#include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
@@ -102,16 +103,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
-	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 	if (ds == NULL) {
-		/*
-		 * Account for the meta-objset space in its placeholder
-		 * dsl_dir.
-		 */
-		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
-		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-		    used, compressed, uncompressed, tx);
-		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    used, compressed, uncompressed);
 		return;
 	}
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -119,7 +114,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
-	ds->ds_phys->ds_used_bytes += used;
+	ds->ds_phys->ds_referenced_bytes += used;
 	ds->ds_phys->ds_compressed_bytes += compressed;
 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 	ds->ds_phys->ds_unique_bytes += used;
@@ -149,15 +144,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 
 	ASSERT(used > 0);
 	if (ds == NULL) {
-		/*
-		 * Account for the meta-objset space in its placeholder
-		 * dataset.
-		 */
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
-
-		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-		    -used, -compressed, -uncompressed, tx);
-		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    -used, -compressed, -uncompressed);
 		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
@@ -215,8 +204,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		}
 	}
 	mutex_enter(&ds->ds_lock);
-	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
-	ds->ds_phys->ds_used_bytes -= used;
+	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
+	ds->ds_phys->ds_referenced_bytes -= used;
 	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 	ds->ds_phys->ds_compressed_bytes -= compressed;
 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
@@ -823,8 +812,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    origin->ds_phys->ds_creation_txg;
-		dsphys->ds_used_bytes =
-		    origin->ds_phys->ds_used_bytes;
+		dsphys->ds_referenced_bytes =
+		    origin->ds_phys->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
 		    origin->ds_phys->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
@@ -938,7 +927,6 @@ dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(snaps, pair)) {
 		dsl_dataset_t *ds;
-		int err;
 
 		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 		if (err == 0) {
@@ -1074,55 +1062,55 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
 	dummy_ds->ds_dir = dd;
 	dummy_ds->ds_object = ds->ds_object;
 
-	/*
-	 * Check for errors and mark this ds as inconsistent, in
-	 * case we crash while freeing the objects.
-	 */
-	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
-	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
-	if (err)
-		goto out_free;
+	if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+		/*
+		 * Check for errors and mark this ds as inconsistent, in
+		 * case we crash while freeing the objects.
+		 */
+		err = dsl_sync_task_do(dd->dd_pool,
+		    dsl_dataset_destroy_begin_check,
+		    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
+		if (err)
+			goto out_free;
 
-	err = dmu_objset_from_ds(ds, &os);
-	if (err)
-		goto out_free;
+		err = dmu_objset_from_ds(ds, &os);
+		if (err)
+			goto out_free;
 
-	/*
-	 * remove the objects in open context, so that we won't
-	 * have too much to do in syncing context.
-	 */
-	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
-	    ds->ds_phys->ds_prev_snap_txg)) {
 		/*
-		 * Ignore errors, if there is not enough disk space
-		 * we will deal with it in dsl_dataset_destroy_sync().
+		 * Remove all objects while in the open context so that
+		 * there is less work to do in the syncing context.
 		 */
-		(void) dmu_free_object(os, obj);
-	}
-	if (err != ESRCH)
-		goto out_free;
-
-	/*
-	 * Only the ZIL knows how to free log blocks.
-	 */
-	zil_destroy(dmu_objset_zil(os), B_FALSE);
+		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
+		    ds->ds_phys->ds_prev_snap_txg)) {
+			/*
+			 * Ignore errors, if there is not enough disk space
+			 * we will deal with it in dsl_dataset_destroy_sync().
+			 */
+			(void) dmu_free_object(os, obj);
+		}
+		if (err != ESRCH)
+			goto out_free;
 
-	/*
-	 * Sync out all in-flight IO.
-	 */
-	txg_wait_synced(dd->dd_pool, 0);
+		/*
+		 * Sync out all in-flight IO.
+		 */
+		txg_wait_synced(dd->dd_pool, 0);
 
-	/*
-	 * If we managed to free all the objects in open
-	 * context, the user space accounting should be zero.
-	 */
-	if (ds->ds_phys->ds_bp.blk_fill == 0 &&
-	    dmu_objset_userused_enabled(os)) {
-		ASSERTV(uint64_t count);
-		ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
-		    count == 0);
-		ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
-		    count == 0);
+		/*
+		 * If we managed to free all the objects in open
+		 * context, the user space accounting should be zero.
+		 */
+		if (ds->ds_phys->ds_bp.blk_fill == 0 &&
+		    dmu_objset_userused_enabled(os)) {
+			ASSERTV(uint64_t count);
+
+			ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
+			    &count) != 0 || count == 0);
+			ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
+			    &count) != 0 || count == 0);
+		}
 	}
 
 	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
@@ -1261,7 +1249,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 	ASSERT(!dsl_dataset_is_snapshot(ds));
 
 	if (ds->ds_phys->ds_prev_snap_obj != 0)
-		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
+		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
 	else
 		mrs_used = 0;
 
@@ -1269,7 +1257,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 
 	ASSERT3U(dlused, <=, mrs_used);
 	ds->ds_phys->ds_unique_bytes =
-	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
+	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE)
@@ -1627,12 +1615,36 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
 	    ds_next->ds_phys->ds_deadlist_obj);
 }
 
+static int
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	int err;
+	struct killarg ka;
+
+	/*
+	 * Free everything that we point to (that's born after
+	 * the previous snapshot, if we are a clone)
+	 *
+	 * NB: this should be very quick, because we already
+	 * freed all the objects in open context.
+	 */
+	ka.ds = ds;
+	ka.tx = tx;
+	err = traverse_dataset(ds,
+	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+	    kill_blkptr, &ka);
+	ASSERT3U(err, ==, 0);
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+
+	return (err);
+}
+
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	struct dsl_ds_destroyarg *dsda = arg1;
 	dsl_dataset_t *ds = dsda->ds;
-	int err;
+	int err = 0;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
@@ -1773,7 +1785,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 			    tx);
 			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 			    DD_USED_HEAD, used, comp, uncomp, tx);
-			dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
 
 			/* Merge our deadlist into next's and free it. */
 			dsl_deadlist_merge(&ds_next->ds_deadlist,
@@ -1849,32 +1860,57 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 		}
 		dsl_dataset_rele(ds_next, FTAG);
 	} else {
+		zfeature_info_t *async_destroy =
+		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
+		objset_t *os;
+
 		/*
 		 * There's no next snapshot, so this is a head dataset.
 		 * Destroy the deadlist.  Unless it's a clone, the
 		 * deadlist should be empty.  (If it's a clone, it's
 		 * safe to ignore the deadlist contents.)
 		 */
-		struct killarg ka;
-
 		dsl_deadlist_close(&ds->ds_deadlist);
 		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
 		ds->ds_phys->ds_deadlist_obj = 0;
 
-		/*
-		 * Free everything that we point to (that's born after
-		 * the previous snapshot, if we are a clone)
-		 *
-		 * NB: this should be very quick, because we already
-		 * freed all the objects in open context.
-		 */
-		ka.ds = ds;
-		ka.tx = tx;
-		err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-		    TRAVERSE_POST, kill_blkptr, &ka);
-		ASSERT3U(err, ==, 0);
-		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-		    ds->ds_phys->ds_unique_bytes == 0);
+		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
+
+		if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
+			err = old_synchronous_dataset_destroy(ds, tx);
+		} else {
+			/*
+			 * Move the bptree into the pool's list of trees to
+			 * clean up and update space accounting information.
+			 */
+			uint64_t used, comp, uncomp;
+
+			zil_destroy_sync(dmu_objset_zil(os), tx);
+
+			if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+				spa_feature_incr(dp->dp_spa, async_destroy, tx);
+				dp->dp_bptree_obj = bptree_alloc(mos, tx);
+				VERIFY(zap_add(mos,
+				    DMU_POOL_DIRECTORY_OBJECT,
+				    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+				    &dp->dp_bptree_obj, tx) == 0);
+			}
+
+			used = ds->ds_dir->dd_phys->dd_used_bytes;
+			comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
+			uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+
+			ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+			    ds->ds_phys->ds_unique_bytes == used);
+
+			bptree_add(mos, dp->dp_bptree_obj,
+			    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+			    used, comp, uncomp, tx);
+			dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+			    -used, -comp, -uncomp, tx);
+			dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+			    used, comp, uncomp, tx);
+		}
 
 		if (ds->ds_prev != NULL) {
 			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
@@ -2065,7 +2101,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+	dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
 	dsphys->ds_flags = ds->ds_phys->ds_flags;
@@ -2154,7 +2190,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
 
-	dsl_dir_dirty(ds->ds_dir, tx);
 	dmu_objset_sync(ds->ds_objset, zio, tx);
 }
 
@@ -2189,10 +2224,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
 		char buf[ZFS_MAXNAMELEN];
+		/*
+		 * Even though we hold the dp_config_rwlock, the dataset
+		 * may fail to open, returning ENOENT.  If there is a
+		 * thread concurrently attempting to destroy this
+		 * dataset, it will have the ds_rwlock held for
+		 * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
+		 * dsl_dataset_hold_ref() will fail its
+		 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
+		 * dp_config_rwlock, and wait for the destroy progress
+		 * and signal ds_exclusive_cv.  If the destroy was
+		 * successful, we will see that
+		 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
+		 */
 		if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-		    za.za_first_integer, FTAG, &clone) != 0) {
-			goto fail;
-		}
+		    za.za_first_integer, FTAG, &clone) != 0)
+			continue;
 		dsl_dir_name(clone->ds_dir, buf);
 		VERIFY(nvlist_add_boolean(val, buf) == 0);
 		dsl_dataset_rele(clone, FTAG);
@@ -2316,7 +2363,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-	*refdbytesp = ds->ds_phys->ds_used_bytes;
+	*refdbytesp = ds->ds_phys->ds_referenced_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
 	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
 		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
@@ -2652,7 +2699,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
-	pa->used = origin_ds->ds_phys->ds_used_bytes;
+	pa->used = origin_ds->ds_phys->ds_referenced_bytes;
 	pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
 	pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
 	for (snap = list_head(&pa->shared_snaps); snap;
@@ -2686,7 +2733,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	 * so we need to subtract out the clone origin's used space.
 	 */
 	if (pa->origin_origin) {
-		pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
+		pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
 		pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
 		pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
 	}
@@ -3203,8 +3250,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 		dsl_deadlist_space(&csa->ohds->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
-		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
-		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
+		dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
+		    (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
 		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
 		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
 		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
@@ -3233,8 +3280,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 	}
 
 	/* swap ds_*_bytes */
-	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
-	    csa->cds->ds_phys->ds_used_bytes);
+	SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
+	    csa->cds->ds_phys->ds_referenced_bytes);
 	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
 	    csa->cds->ds_phys->ds_compressed_bytes);
 	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
@@ -3363,8 +3410,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
-	if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
-		if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
+	if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+		if (inflight > 0 ||
+		    ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
 			error = ERESTART;
 		else
 			error = EDQUOT;
@@ -3393,7 +3441,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	if (psa->psa_effective_value == 0)
 		return (0);
 
-	if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
+	if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
 	    psa->psa_effective_value < ds->ds_reserved)
 		return (ENOSPC);
 
@@ -4141,8 +4189,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
 	*usedp = 0;
-	*usedp += new->ds_phys->ds_used_bytes;
-	*usedp -= oldsnap->ds_phys->ds_used_bytes;
+	*usedp += new->ds_phys->ds_referenced_bytes;
+	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
 
 	*compp = 0;
 	*compp += new->ds_phys->ds_compressed_bytes;
@@ -4158,9 +4206,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
 		dsl_dataset_t *snap;
 		uint64_t used, comp, uncomp;
 
-		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
-		if (err != 0)
-			break;
+		if (snapobj == new->ds_object) {
+			snap = new;
+		} else {
+			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+			if (err != 0)
+				break;
+		}
 
 		if (snap->ds_phys->ds_prev_snap_txg ==
 		    oldsnap->ds_phys->ds_creation_txg) {
@@ -4189,7 +4241,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
 		 * was not a snapshot of/before new.
 		 */
 		snapobj = snap->ds_phys->ds_prev_snap_obj;
-		dsl_dataset_rele(snap, FTAG);
+		if (snap != new)
+			dsl_dataset_rele(snap, FTAG);
 		if (snapobj == 0) {
 			err = EINVAL;
 			break;
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index 1e89a68d7..909b5f8fc 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_dataset.h>
@@ -165,12 +165,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 
 	for (zap_cursor_init(&zc, os, dlobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
-	    zap_cursor_advance(&zc))
-		bpobj_free(os, za.za_first_integer, tx);
+	    zap_cursor_advance(&zc)) {
+		uint64_t obj = za.za_first_integer;
+		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+			bpobj_decr_empty(os, tx);
+		else
+			bpobj_free(os, obj, tx);
+	}
 	zap_cursor_fini(&zc);
 	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 }
 
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    const blkptr_t *bp, dmu_tx_t *tx)
+{
+	if (dle->dle_bpobj.bpo_object ==
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		bpobj_close(&dle->dle_bpobj);
+		bpobj_decr_empty(dl->dl_os, tx);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, obj, tx));
+	}
+	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    uint64_t obj, dmu_tx_t *tx)
+{
+	if (dle->dle_bpobj.bpo_object !=
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+	} else {
+		bpobj_close(&dle->dle_bpobj);
+		bpobj_decr_empty(dl->dl_os, tx);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, obj, tx));
+	}
+}
+
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -199,7 +236,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	else
 		dle = AVL_PREV(&dl->dl_tree, dle);
-	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+	dle_enqueue(dl, dle, bp, tx);
 }
 
 /*
@@ -219,7 +256,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 
 	dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE);
 	dle->dle_mintxg = mintxg;
-	obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+	obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
 	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 	avl_add(&dl->dl_tree, dle);
 
@@ -245,8 +282,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
 
-	bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
-	    dle->dle_bpobj.bpo_object, tx);
+	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
 	avl_remove(&dl->dl_tree, dle);
 	bpobj_close(&dle->dle_bpobj);
@@ -304,7 +340,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
 		if (dle->dle_mintxg >= maxtxg)
 			break;
 
-		obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
 		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
 		    dle->dle_mintxg, obj, tx));
 	}
@@ -402,7 +438,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
-	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+	dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 static int
diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c
index a4d4e42da..294932c45 100644
--- a/module/zfs/dsl_deleg.c
+++ b/module/zfs/dsl_deleg.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -171,10 +171,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 		VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
 
 		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
-			jumpobj = zap_create(mos, DMU_OT_DSL_PERMS,
-			    DMU_OT_NONE, 0, tx);
-			VERIFY(zap_update(mos, zapobj,
-			    whokey, 8, 1, &jumpobj, tx) == 0);
+			jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+			    zapobj, whokey, tx);
 		}
 
 		while ((permpair = nvlist_next_nvpair(perms, permpair))) {
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index 377df40a8..741223984 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -189,7 +189,6 @@ errout:
 	kmem_free(dd, sizeof (dsl_dir_t));
 	dmu_buf_rele(dbuf, tag);
 	return (err);
-
 }
 
 void
@@ -223,7 +222,7 @@ dsl_dir_name(dsl_dir_t *dd, char *buf)
 	}
 }
 
-/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */
+/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 int
 dsl_dir_namelen(dsl_dir_t *dd)
 {
@@ -592,8 +591,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
 	mutex_enter(&dd->dd_lock);
 	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
@@ -950,8 +947,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(type < DD_USED_NUM);
 
-	dsl_dir_dirty(dd, tx);
-
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
 	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
@@ -960,6 +955,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 	    dd->dd_phys->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
 	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_used_bytes += used;
 	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
 	dd->dd_phys->dd_compressed_bytes += compressed;
@@ -1003,13 +999,13 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
 		return;
 
-	dsl_dir_dirty(dd, tx);
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
 	ASSERT(delta > 0 ?
 	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
 	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
 	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
 	dd->dd_phys->dd_used_breakdown[newtype] += delta;
 	if (needlock)
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 7e0fba589..704f034e9 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
@@ -40,6 +40,9 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
@@ -222,12 +225,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 
 	txg_list_create(&dp->dp_dirty_datasets,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_zilogs,
+	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&dp->dp_dirty_dirs,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks,
 	    offsetof(dsl_sync_task_group_t, dstg_node));
-	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
-	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -240,20 +243,30 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 }
 
 int
-dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+	    &dp->dp_meta_objset);
+	if (err != 0)
+		dsl_pool_close(dp);
+	else
+		*dpp = dp;
+
+	return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+	int err;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
-	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
-	    &dp->dp_meta_objset);
-	if (err)
-		goto out;
-
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
@@ -269,7 +282,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 	if (err)
 		goto out;
 
-	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
@@ -286,7 +299,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 			goto out;
 	}
 
-	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 		    &dp->dp_free_dir);
 		if (err)
@@ -300,6 +313,24 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 		    dp->dp_meta_objset, obj));
 	}
 
+	if (spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+		    &dp->dp_bptree_obj);
+		if (err != 0)
+			goto out;
+	}
+
+	if (spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+		    &dp->dp_empty_bpobj);
+		if (err != 0)
+			goto out;
+	}
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
@@ -308,15 +339,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 	if (err)
 		goto out;
 
-	err = dsl_scan_init(dp, txg);
+	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
 	rw_exit(&dp->dp_config_rwlock);
-	if (err)
-		dsl_pool_close(dp);
-	else
-		*dpp = dp;
-
 	return (err);
 }
 
@@ -346,9 +372,9 @@ dsl_pool_close(dsl_pool_t *dp)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
-	list_destroy(&dp->dp_synced_datasets);
 
 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
@@ -429,6 +455,21 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	return (dp);
 }
 
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp)
+{
+	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
+	mutex_enter(&dp->dp_lock);
+	dp->dp_mos_used_delta += used;
+	dp->dp_mos_compressed_delta += comp;
+	dp->dp_mos_uncompressed_delta += uncomp;
+	mutex_exit(&dp->dp_lock);
+}
+
 static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -447,11 +488,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
-	dsl_sync_task_group_t *dstg;
 	objset_t *mos = dp->dp_meta_objset;
 	hrtime_t start, write_time;
 	uint64_t data_written;
 	int err;
+	list_t synced_datasets;
+
+	list_create(&synced_datasets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	/*
 	 * We need to copy dp_space_towrite() before doing
@@ -474,7 +518,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
-		list_insert_tail(&dp->dp_synced_datasets, ds);
+		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	DTRACE_PROBE(pool_sync__1setup);
@@ -484,15 +528,20 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	ASSERT(err == 0);
 	DTRACE_PROBE(pool_sync__2rootzio);
 
-	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds))
+	/*
+	 * After the data blocks have been written (ensured by the zio_wait()
+	 * above), update the user/group space accounting.
+	 */
+	for (ds = list_head(&synced_datasets); ds;
+	    ds = list_next(&synced_datasets, ds))
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 
 	/*
 	 * Sync the datasets again to push out the changes due to
 	 * userspace updates.  This must be done before we process the
-	 * sync tasks, because that could cause a snapshot of a dataset
-	 * whose ds_bp will be rewritten when we do this 2nd sync.
+	 * sync tasks, so that any snapshots will have the correct
+	 * user accounting information (and we won't get confused
+	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
@@ -503,30 +552,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	err = zio_wait(zio);
 
 	/*
-	 * Move dead blocks from the pending deadlist to the on-disk
-	 * deadlist.
+	 * Now that the datasets have been completely synced, we can
+	 * clean up our in-memory structures accumulated while syncing:
+	 *
+	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
+	 *  - clean up zil records
+	 *  - release hold from dsl_dataset_dirty()
 	 */
-	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds)) {
+	while ((ds = list_remove_head(&synced_datasets))) {
+		ASSERTV(objset_t *os = ds->ds_objset);
 		bplist_iterate(&ds->ds_pending_deadlist,
 		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+		ASSERT(!dmu_objset_is_dirty(os, txg));
+		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
 
-	while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) {
-		/*
-		 * No more sync tasks should have been added while we
-		 * were syncing.
-		 */
-		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-		dsl_sync_task_group_sync(dstg, tx);
-	}
-	DTRACE_PROBE(pool_sync__3task);
-
 	start = gethrtime();
 	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)))
 		dsl_dir_sync(dd, tx);
 	write_time += gethrtime() - start;
 
+	/*
+	 * The MOS's space is accounted for in the pool/$MOS
+	 * (dp_mos_dir).  We can't modify the mos while we're syncing
+	 * it, so we remember the deltas and apply them here.
+	 */
+	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+	    dp->dp_mos_uncompressed_delta != 0) {
+		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+		    dp->dp_mos_used_delta,
+		    dp->dp_mos_compressed_delta,
+		    dp->dp_mos_uncompressed_delta, tx);
+		dp->dp_mos_used_delta = 0;
+		dp->dp_mos_compressed_delta = 0;
+		dp->dp_mos_uncompressed_delta = 0;
+	}
+
 	start = gethrtime();
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
@@ -542,6 +603,27 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	    hrtime_t, dp->dp_read_overhead);
 	write_time -= dp->dp_read_overhead;
 
+	/*
+	 * If we modify a dataset in the same txg that we want to destroy it,
+	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
+	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+	 * and clearing the hold on it) before we process the sync_tasks.
+	 * The MOS data dirtied by the sync_tasks will be synced on the next
+	 * pass.
+	 */
+	DTRACE_PROBE(pool_sync__3task);
+	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+		dsl_sync_task_group_t *dstg;
+		/*
+		 * No more sync tasks should have been added while we
+		 * were syncing.
+		 */
+		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
+		while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg)))
+			dsl_sync_task_group_sync(dstg, tx);
+	}
+
 	dmu_tx_commit(tx);
 
 	dp->dp_space_towrite[txg & TXG_MASK] = 0;
@@ -590,15 +672,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
+	zilog_t *zilog;
 	dsl_dataset_t *ds;
-	objset_t *os;
 
-	while ((ds = list_head(&dp->dp_synced_datasets))) {
-		list_remove(&dp->dp_synced_datasets, ds);
-		os = ds->ds_objset;
-		zil_clean(os->os_zil, txg);
-		ASSERT(!dmu_objset_is_dirty(os, txg));
-		dmu_buf_rele(ds->ds_dbuf, ds);
+	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
+		ds = dmu_objset_ds(zilog->zl_os);
+		zil_clean(zilog, txg);
+		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
@@ -611,7 +692,7 @@ int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	return (curthread == dp->dp_tx.tx_sync_thread ||
-	    spa_get_dsl(dp->dp_spa) == NULL);
+	    spa_is_initializing(dp->dp_spa));
 }
 
 uint64_t
@@ -932,11 +1013,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
-	dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
-	    DMU_OT_NONE, 0, tx);
-
-	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
-	    sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 }
 
 static int
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index c2386dd67..297caa067 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
@@ -44,6 +45,7 @@
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
@@ -379,55 +381,6 @@ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
 	    priority, zio_flags, arc_flags, zb));
 }
 
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-	    zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
-
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb2->zb_level == 0);
-
-	/*
-	 * A bookmark in the deadlist is considered to be after
-	 * everything else.
-	 */
-	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-		return (B_TRUE);
-
-	/* The objset_phys_t isn't before anything. */
-	if (dnp == NULL)
-		return (B_FALSE);
-
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
-	}
-
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
-		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
-}
-
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
@@ -459,7 +412,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
 	if (scn->scn_pausing)
 		return (B_TRUE); /* we're already pausing */
 
-	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 blocks. */
@@ -614,13 +567,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
 	/*
 	 * We never skip over user/group accounting objects (obj<0)
 	 */
-	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
 	    (int64_t)zb->zb_object >= 0) {
 		/*
 		 * If we already visited this bp & everything below (in
 		 * a prior txg sync), don't bother doing it again.
 		 */
-		if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
 			return (B_TRUE);
 
 		/*
@@ -823,22 +776,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		goto out;
 
-	if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
-		/*
-		 * For non-user-accounting blocks, we need to read the
-		 * new bp (from a deleted snapshot, found in
-		 * check_existing_xlation).  If we used the old bp,
-		 * pointers inside this block from before we resumed
-		 * would be untranslated.
-		 *
-		 * For user-accounting blocks, we need to read the old
-		 * bp, because we will apply the entire space delta to
-		 * it (original untranslated -> translations from
-		 * deleted snap -> now).
-		 */
-		*bp_toread = *bp;
-	}
-
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
 	    &buf) != 0)
 		goto out;
@@ -1414,19 +1351,28 @@ out:
 	kmem_free(zc, sizeof(zap_cursor_t));
 }
 
-static int
-dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
-	dsl_scan_t *scn = arg;
 	uint64_t elapsed_nanosecs;
 
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-
-	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
-	    spa_shutting_down(scn->scn_dp->dp_spa))
-		return (ERESTART);
+	    spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg;
+
+	if (!scn->scn_is_bptree ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+		if (dsl_scan_free_should_pause(scn))
+			return (ERESTART);
+	}
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
 	    dmu_tx_get_txg(tx), bp, 0));
@@ -1451,6 +1397,10 @@ dsl_scan_active(dsl_scan_t *scn)
 	if (scn->scn_phys.scn_state == DSS_SCANNING)
 		return (B_TRUE);
 
+	if (spa_feature_is_active(spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+		return (B_TRUE);
+	}
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
@@ -1497,14 +1447,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	 * traversing it.
 	 */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		scn->scn_is_bptree = B_FALSE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
-		    dsl_scan_free_cb, scn, tx);
+		    dsl_scan_free_block_cb, scn, tx);
 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+		if (err == 0 && spa_feature_is_active(spa,
+		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+			scn->scn_is_bptree = B_TRUE;
+			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+			    NULL, ZIO_FLAG_MUSTSUCCEED);
+			err = bptree_iterate(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+			    scn, tx);
+			VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+			if (err != 0)
+				return;
+
+			/* disable async destroy feature */
+			spa_feature_decr(spa,
+			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+			ASSERT(!spa_feature_is_active(spa,
+			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+			VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, tx));
+			VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, tx));
+			dp->dp_bptree_obj = 0;
+		}
 		if (scn->scn_visited_this_txg) {
 			zfs_dbgmsg("freed %llu blocks in %llums from "
-			    "free_bpobj txg %llu",
+			    "free_bpobj/bptree txg %llu",
 			    (longlong_t)scn->scn_visited_this_txg,
 			    (longlong_t)
 			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
@@ -1619,9 +1595,13 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
-		zfs_blkstat_t *zb = &zab->zab_type[l][t];
 		int equal;
+		zfs_blkstat_t *zb;
+
+		if (t & DMU_OT_NEWTYPE)
+			t = DMU_OT_OTHER;
 
+		zb = &zab->zab_type[l][t];
 		zb->zb_count++;
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index d4b28cc90..240a683d6 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -18,8 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -446,10 +448,9 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
 		char attr_name[8];
 
 		if (sa->sa_layout_attr_obj == 0) {
-			sa->sa_layout_attr_obj = zap_create(os,
-			    DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
-			VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
-			    &sa->sa_layout_attr_obj, tx) == 0);
+			sa->sa_layout_attr_obj = zap_create_link(os,
+			    DMU_OT_SA_ATTR_LAYOUTS,
+			    sa->sa_master_obj, SA_LAYOUTS, tx);
 		}
 
 		(void) snprintf(attr_name, sizeof (attr_name),
@@ -1583,10 +1584,9 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
 	}
 
 	if (sa->sa_reg_attr_obj == 0) {
-		sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
-		    DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
-		VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
-		    SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
+		sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+		    DMU_OT_SA_ATTR_REGISTRATION,
+		    sa->sa_master_obj, SA_REGISTRY, tx);
 	}
 	for (i = 0; i != sa->sa_num_attrs; i++) {
 		if (sa->sa_attr_table[i].sa_registered)
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index b610a0dae..5b6465f2e 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -63,6 +63,7 @@
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
+#include <sys/zfeature.h>
 
 #ifdef	_KERNEL
 #include <sys/bootprops.h>
@@ -114,7 +115,10 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
+static dsl_syncfunc_t spa_sync_version;
 static dsl_syncfunc_t spa_sync_props;
+static dsl_checkfunc_t spa_change_guid_check;
+static dsl_syncfunc_t spa_change_guid_sync;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -169,6 +173,7 @@ static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
+	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size;
 	uint64_t alloc;
 	uint64_t space;
@@ -216,6 +221,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 	}
 
+	if (pool != NULL) {
+		dsl_dir_t *freedir = pool->dp_free_dir;
+
+		/*
+		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
+		 * when opening pools before this version freedir will be NULL.
+		 */
+		if (freedir != NULL) {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+			    freedir->dd_phys->dd_used_bytes, src);
+		} else {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+			    NULL, 0, src);
+		}
+	}
+
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
@@ -357,25 +378,55 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
+	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
-		zpool_prop_t prop;
-		char *propname, *strval;
 		uint64_t intval;
-		objset_t *os;
-		char *slash, *check;
+		char *strval, *slash, *check, *fname;
+		const char *propname = nvpair_name(elem);
+		zpool_prop_t prop = zpool_name_to_prop(propname);
+
+		switch ((int)prop) {
+		case ZPROP_INVAL:
+			if (!zpool_prop_feature(propname)) {
+				error = EINVAL;
+				break;
+			}
+
+			/*
+			 * Sanitize the input.
+			 */
+			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+				error = EINVAL;
+				break;
+			}
+
+			if (nvpair_value_uint64(elem, &intval) != 0) {
+				error = EINVAL;
+				break;
+			}
 
-		propname = nvpair_name(elem);
+			if (intval != 0) {
+				error = EINVAL;
+				break;
+			}
+
+			fname = strchr(propname, '@') + 1;
+			if (zfeature_lookup_name(fname, NULL) != 0) {
+				error = EINVAL;
+				break;
+			}
 
-		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
-			return (EINVAL);
+			has_feature = B_TRUE;
+			break;
 
-		switch (prop) {
 		case ZPOOL_PROP_VERSION:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
-			    (intval < spa_version(spa) || intval > SPA_VERSION))
+			    (intval < spa_version(spa) ||
+			    intval > SPA_VERSION_BEFORE_FEATURES ||
+			    has_feature))
 				error = EINVAL;
 			break;
 
@@ -412,6 +463,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
+				objset_t *os;
 				uint64_t compress;
 
 				if (strval == NULL || strval[0] == '\0') {
@@ -558,33 +610,58 @@ int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	int error;
-	nvpair_t *elem;
+	nvpair_t *elem = NULL;
 	boolean_t need_sync = B_FALSE;
-	zpool_prop_t prop;
 
 	if ((error = spa_prop_validate(spa, nvp)) != 0)
 		return (error);
 
-	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
-		if ((prop = zpool_name_to_prop(
-		    nvpair_name(elem))) == ZPROP_INVAL)
-			return (EINVAL);
+		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
+		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
+			uint64_t ver;
+
+			if (prop == ZPOOL_PROP_VERSION) {
+				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
+			} else {
+				ASSERT(zpool_prop_feature(nvpair_name(elem)));
+				ver = SPA_VERSION_FEATURES;
+				need_sync = B_TRUE;
+			}
+
+			/* Save time if the version is already set. */
+			if (ver == spa_version(spa))
+				continue;
+
+			/*
+			 * In addition to the pool directory object, we might
+			 * create the pool properties object, the features for
+			 * read object, the features for write object, or the
+			 * feature descriptions object.
+			 */
+			error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
+			    spa_sync_version, spa, &ver, 6);
+			if (error)
+				return (error);
+			continue;
+		}
+
 		need_sync = B_TRUE;
 		break;
 	}
 
-	if (need_sync)
+	if (need_sync) {
 		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
-		    spa, nvp, 3));
-	else
-		return (0);
+		    spa, nvp, 6));
+	}
+
+	return (0);
 }
 
 /*
@@ -601,6 +678,47 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 	}
 }
 
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	spa_t *spa = arg1;
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t vdev_state;
+	ASSERTV(uint64_t *newguid = arg2);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	vdev_state = rvd->vdev_state;
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	if (vdev_state != VDEV_STATE_HEALTHY)
+		return (ENXIO);
+
+	ASSERT3U(spa_guid(spa), !=, *newguid);
+
+	return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	spa_t *spa = arg1;
+	uint64_t *newguid = arg2;
+	uint64_t oldguid;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	oldguid = spa_guid(spa);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	rvd->vdev_guid = *newguid;
+	rvd->vdev_guid_sum += (*newguid - oldguid);
+	vdev_config_dirty(rvd);
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx,
+	    "old=%lld new=%lld", oldguid, *newguid);
+}
+
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
@@ -613,29 +731,23 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 int
 spa_change_guid(spa_t *spa)
 {
-	uint64_t	oldguid, newguid;
-	uint64_t	txg;
-
-	if (!(spa_mode_global & FWRITE))
-		return (EROFS);
-
-	txg = spa_vdev_enter(spa);
-
-	if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
-		return (spa_vdev_exit(spa, NULL, txg, ENXIO));
+	int error;
+	uint64_t guid;
 
-	oldguid = spa_guid(spa);
-	newguid = spa_generate_guid(NULL);
-	ASSERT3U(oldguid, !=, newguid);
+	mutex_enter(&spa_namespace_lock);
+	guid = spa_generate_guid(NULL);
 
-	spa->spa_root_vdev->vdev_guid = newguid;
-	spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
+	error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
+	    spa_change_guid_sync, spa, &guid, 5);
 
-	vdev_config_dirty(spa->spa_root_vdev);
+	if (error == 0) {
+		spa_config_sync(spa, B_FALSE, B_TRUE);
+		spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
+	}
 
-	spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
+	mutex_exit(&spa_namespace_lock);
 
-	return (spa_vdev_exit(spa, NULL, txg, 0));
+	return (error);
 }
 
 /*
@@ -1628,7 +1740,7 @@ spa_load_verify_done(zio_t *zio)
 	int error = zio->io_error;
 
 	if (error) {
-		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG)
 			atomic_add_64(&sle->sle_meta_count, 1);
 		else
@@ -1858,6 +1970,9 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
 			    KM_PUSHPAGE) == 0);
 		}
 
+		nvlist_free(spa->spa_load_info);
+		spa->spa_load_info = fnvlist_alloc();
+
 		gethrestime(&spa->spa_loaded_ts);
 		error = spa_load_impl(spa, pool_guid, config, state, type,
 		    mosconfig, &ereport);
@@ -1891,12 +2006,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 {
 	int error = 0;
 	nvlist_t *nvroot = NULL;
+	nvlist_t *label;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
 	uint64_t children, config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
 	uint64_t obj;
+	boolean_t missing_feat_write = B_FALSE;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
@@ -1976,19 +2093,79 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	/*
 	 * Find the best uberblock.
 	 */
-	vdev_uberblock_load(NULL, rvd, ub);
+	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
-	if (ub->ub_txg == 0)
+	if (ub->ub_txg == 0) {
+		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+	}
 
 	/*
-	 * If the pool is newer than the code, we can't open it.
+	 * If the pool has an unsupported version we can't open it.
 	 */
-	if (ub->ub_version > SPA_VERSION)
+	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+	}
+
+	if (ub->ub_version >= SPA_VERSION_FEATURES) {
+		nvlist_t *features;
+
+		/*
+		 * If we weren't able to find what's necessary for reading the
+		 * MOS in the label, return failure.
+		 */
+		if (label == NULL || nvlist_lookup_nvlist(label,
+		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
+			nvlist_free(label);
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+			    ENXIO));
+		}
+
+		/*
+		 * Update our in-core representation with the definitive values
+		 * from the label.
+		 */
+		nvlist_free(spa->spa_label_features);
+		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+	}
+
+	nvlist_free(label);
+
+	/*
+	 * Look through entries in the label nvlist's features_for_read. If
+	 * there is a feature listed there which we don't understand then we
+	 * cannot open a pool.
+	 */
+	if (ub->ub_version >= SPA_VERSION_FEATURES) {
+		nvlist_t *unsup_feat;
+		nvpair_t *nvp;
+
+		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+		    0);
+
+		for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL);
+		    nvp != NULL;
+		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+			if (!zfeature_is_supported(nvpair_name(nvp))) {
+				VERIFY(nvlist_add_string(unsup_feat,
+				    nvpair_name(nvp), "") == 0);
+			}
+		}
+
+		if (!nvlist_empty(unsup_feat)) {
+			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+			nvlist_free(unsup_feat);
+			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+			    ENOTSUP));
+		}
+
+		nvlist_free(unsup_feat);
+	}
 
 	/*
 	 * If the vdev guid sum doesn't match the uberblock, we have an
@@ -2022,7 +2199,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 
-	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
@@ -2030,6 +2207,89 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
+	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+		boolean_t missing_feat_read = B_FALSE;
+		nvlist_t *unsup_feat, *enabled_feat;
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+		    &spa->spa_feat_for_read_obj) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+		    &spa->spa_feat_for_write_obj) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+		    &spa->spa_feat_desc_obj) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		enabled_feat = fnvlist_alloc();
+		unsup_feat = fnvlist_alloc();
+
+		if (!feature_is_supported(spa->spa_meta_objset,
+		    spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
+		    unsup_feat, enabled_feat))
+			missing_feat_read = B_TRUE;
+
+		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
+			if (!feature_is_supported(spa->spa_meta_objset,
+			    spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
+			    unsup_feat, enabled_feat)) {
+				missing_feat_write = B_TRUE;
+			}
+		}
+
+		fnvlist_add_nvlist(spa->spa_load_info,
+		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+		if (!nvlist_empty(unsup_feat)) {
+			fnvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+		}
+
+		fnvlist_free(enabled_feat);
+		fnvlist_free(unsup_feat);
+
+		if (!missing_feat_read) {
+			fnvlist_add_boolean(spa->spa_load_info,
+			    ZPOOL_CONFIG_CAN_RDONLY);
+		}
+
+		/*
+		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
+		 * twofold: to determine whether the pool is available for
+		 * import in read-write mode and (if it is not) whether the
+		 * pool is available for import in read-only mode. If the pool
+		 * is available for import in read-write mode, it is displayed
+		 * as available in userland; if it is not available for import
+		 * in read-only mode, it is displayed as unavailable in
+		 * userland. If the pool is available for import in read-only
+		 * mode but not read-write mode, it is displayed as unavailable
+		 * in userland with a special note that the pool is actually
+		 * available for open in read-only mode.
+		 *
+		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+		 * missing a feature for write, we must first determine whether
+		 * the pool can be opened read-only before returning to
+		 * userland in order to know whether to display the
+		 * abovementioned note.
+		 */
+		if (missing_feat_read || (missing_feat_write &&
+		    spa_writeable(spa))) {
+			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+			    ENOTSUP));
+		}
+	}
+
+	spa->spa_is_initializing = B_TRUE;
+	error = dsl_pool_open(spa->spa_dsl_pool);
+	spa->spa_is_initializing = B_FALSE;
+	if (error != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
 	if (!mosconfig) {
 		uint64_t hostid;
 		nvlist_t *policy = NULL, *nvconfig;
@@ -2247,7 +2507,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		nvlist_free(nvconfig);
 
 		/*
-		 * Now that we've validate the config, check the state of the
+		 * Now that we've validated the config, check the state of the
 		 * root vdev.  If it can't be opened, it indicates one or
 		 * more toplevel vdevs are faulted.
 		 */
@@ -2260,6 +2520,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		}
 	}
 
+	if (missing_feat_write) {
+		ASSERT(state == SPA_LOAD_TRYIMPORT);
+
+		/*
+		 * At this point, we know that we can open the pool in
+		 * read-only mode but not read-write mode. We now have enough
+		 * information and can return to userland.
+		 */
+		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
+	}
+
 	/*
 	 * We've successfully opened the pool, verify that we're ready
 	 * to start pushing transactions.
@@ -2370,10 +2641,18 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
 }
 
+/*
+ * If spa_load() fails this function will try loading prior txg's. If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
+ */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
+	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
@@ -2402,9 +2681,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
 		return (load_error);
 	}
 
-	/* Price of rolling back is discarding txgs, including log */
-	if (state == SPA_LOAD_RECOVER)
+	if (state == SPA_LOAD_RECOVER) {
+		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
+	} else {
+		/*
+		 * If we aren't rolling back save the load info from our first
+		 * import attempt so that we can restore it after attempting
+		 * to rewind.
+		 */
+		loadinfo = spa->spa_load_info;
+		spa->spa_load_info = fnvlist_alloc();
+	}
 
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
@@ -2428,7 +2716,20 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 
-	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
+	if (state == SPA_LOAD_RECOVER) {
+		ASSERT3P(loadinfo, ==, NULL);
+		return (rewind_error);
+	} else {
+		/* Store the rewind info as part of the initial load info */
+		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+		    spa->spa_load_info);
+
+		/* Restore the initial load info */
+		fnvlist_free(spa->spa_load_info);
+		spa->spa_load_info = loadinfo;
+
+		return (load_error);
+	}
 }
 
 /*
@@ -2698,8 +2999,50 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
 	}
 }
 
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+	nvlist_t *features;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	if (spa->spa_feat_for_read_obj != 0) {
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_feat_for_read_obj);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+			    za.za_num_integers == 1);
+			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+			    za.za_first_integer));
+		}
+		zap_cursor_fini(&zc);
+	}
+
+	if (spa->spa_feat_for_write_obj != 0) {
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_feat_for_write_obj);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+			    za.za_num_integers == 1);
+			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+			    za.za_first_integer));
+		}
+		zap_cursor_fini(&zc);
+	}
+
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+	    features) == 0);
+	nvlist_free(features);
+}
+
 int
-spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
+spa_get_stats(const char *name, nvlist_t **config,
+    char *altroot, size_t buflen)
 {
 	int error;
 	spa_t *spa;
@@ -2734,6 +3077,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
+			spa_add_feature_stats(spa, *config);
 		}
 	}
 
@@ -2954,6 +3298,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj;
+	boolean_t has_features;
+	nvpair_t *elem;
 	int c;
 
 	/*
@@ -2980,10 +3326,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 		return (error);
 	}
 
-	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
-	    &version) != 0)
+	has_features = B_FALSE;
+	for (elem = nvlist_next_nvpair(props, NULL);
+	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
+		if (zpool_prop_feature(nvpair_name(elem)))
+			has_features = B_TRUE;
+	}
+
+	if (has_features || nvlist_lookup_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
-	ASSERT(version <= SPA_VERSION);
+	}
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
@@ -3059,8 +3413,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
+	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
 	spa->spa_meta_objset = dp->dp_meta_objset;
+	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
@@ -3084,6 +3440,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
+	if (spa_version(spa) >= SPA_VERSION_FEATURES)
+		spa_feature_create_zap_objects(spa, tx);
+
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
@@ -3276,7 +3635,7 @@ spa_import_rootpool(char *devpath, char *devid)
 	}
 #endif
 	if (config == NULL) {
-		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
 		    devpath);
 		return (EIO);
 	}
@@ -3590,6 +3949,8 @@ spa_tryimport(nvlist_t *tryconfig)
 		    state) == 0);
 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    spa->spa_uberblock.ub_timestamp) == 0);
+		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+		    spa->spa_load_info) == 0);
 
 		/*
 		 * If the bootfs property exists on this pool then we
@@ -5305,7 +5666,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 	 * information.  This avoids the dbuf_will_dirty() path and
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
-	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
+	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = vmem_alloc(bufsize, KM_PUSHPAGE);
 
 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
@@ -5381,6 +5742,14 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
+	/*
+	 * If we're upgrading the spa version then make sure that
+	 * the config object gets updated with the correct version.
+	 */
+	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
+		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+		    spa->spa_uberblock.ub_version);
+
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (spa->spa_config_syncing)
@@ -5390,6 +5759,24 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
 
+static void
+spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	spa_t *spa = arg1;
+	uint64_t version = *(uint64_t *)arg2;
+
+	/*
+	 * Setting the version is special cased when first creating the pool.
+	 */
+	ASSERT(tx->tx_txg != TXG_INITIAL);
+
+	ASSERT(version <= SPA_VERSION);
+	ASSERT(version >= spa_version(spa));
+
+	spa->spa_uberblock.ub_version = version;
+	vdev_config_dirty(spa->spa_root_vdev);
+}
+
 /*
  * Set zpool properties.
  */
@@ -5399,32 +5786,39 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
 	spa_t *spa = arg1;
 	objset_t *mos = spa->spa_meta_objset;
 	nvlist_t *nvp = arg2;
-	nvpair_t *elem;
-	uint64_t intval;
-	char *strval;
-	zpool_prop_t prop;
-	const char *propname;
-	zprop_type_t proptype;
+	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
-	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
-		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+		uint64_t intval;
+		char *strval, *fname;
+		zpool_prop_t prop;
+		const char *propname;
+		zprop_type_t proptype;
+		zfeature_info_t *feature;
+
+		prop = zpool_name_to_prop(nvpair_name(elem));
+		switch ((int)prop) {
+		case ZPROP_INVAL:
+			/*
+			 * We checked this earlier in spa_prop_validate().
+			 */
+			ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+			fname = strchr(nvpair_name(elem), '@') + 1;
+			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
+
+			spa_feature_enable(spa, feature, tx);
+			break;
+
 		case ZPOOL_PROP_VERSION:
+			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 			/*
-			 * Only set version for non-zpool-creation cases
-			 * (set/import). spa_create() needs special care
-			 * for version setting.
+			 * The version is synced seperatly before other
+			 * properties and should be correct by now.
 			 */
-			if (tx->tx_txg != TXG_INITIAL) {
-				VERIFY(nvpair_value_uint64(elem,
-				    &intval) == 0);
-				ASSERT(intval <= SPA_VERSION);
-				ASSERT(intval >= spa_version(spa));
-				spa->spa_uberblock.ub_version = intval;
-				vdev_config_dirty(spa->spa_root_vdev);
-			}
+			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
@@ -5461,14 +5855,10 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
-				VERIFY((spa->spa_pool_props_object =
-				    zap_create(mos, DMU_OT_POOL_PROPS,
-				    DMU_OT_NONE, 0, tx)) > 0);
-
-				VERIFY(zap_update(mos,
+				spa->spa_pool_props_object =
+				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
-				    8, 1, &spa->spa_pool_props_object, tx)
-				    == 0);
+				    tx);
 			}
 
 			/* normalize the property name */
@@ -5567,6 +5957,11 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 		/* Keeping the freedir open increases spa_minref */
 		spa->spa_minref += 3;
 	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+		spa_feature_create_zap_objects(spa, tx);
+	}
 }
 
 /*
@@ -5738,6 +6133,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 				    rvd->vdev_children, txg, B_TRUE);
 		}
 
+		if (error == 0)
+			spa->spa_last_synced_guid = rvd->vdev_guid;
+
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index c86884148..09149e622 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -35,6 +35,7 @@
 #include <sys/utsname.h>
 #include <sys/systeminfo.h>
 #include <sys/sunddi.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/kobj.h>
 #include <sys/zone.h>
@@ -408,6 +409,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 	nvlist_free(nvroot);
 
+	/*
+	 * Store what's necessary for reading the MOS in the label.
+	 */
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+	    spa->spa_label_features) == 0);
+
 	if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
 		ddt_histogram_t *ddh;
 		ddt_stat_t *dds;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 4a8e6adfd..5ec8e688e 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -49,6 +49,7 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include "zfs_prop.h"
+#include "zfeature_common.h"
 
 /*
  * SPA locking
@@ -217,7 +218,7 @@
  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  *
- * spa_rename() is also implemented within this file since is requires
+ * spa_rename() is also implemented within this file since it requires
  * manipulation of the namespace.
  */
 
@@ -479,8 +480,22 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
 	    KM_PUSHPAGE) == 0);
 
-	if (config != NULL)
+	if (config != NULL) {
+		nvlist_t *features;
+
+		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+		    &features) == 0) {
+			VERIFY(nvlist_dup(features, &spa->spa_label_features,
+			    0) == 0);
+		}
+
 		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+	}
+
+	if (spa->spa_label_features == NULL) {
+		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
+		    KM_SLEEP) == 0);
+	}
 
 	return (spa);
 }
@@ -518,6 +533,7 @@ spa_remove(spa_t *spa)
 
 	list_destroy(&spa->spa_config_list);
 
+	nvlist_free(spa->spa_label_features);
 	nvlist_free(spa->spa_load_info);
 	spa_config_set(spa, NULL);
 
@@ -1025,6 +1041,20 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
  * ==========================================================================
  */
 
+void
+spa_activate_mos_feature(spa_t *spa, const char *feature)
+{
+	(void) nvlist_add_boolean(spa->spa_label_features, feature);
+	vdev_config_dirty(spa->spa_root_vdev);
+}
+
+void
+spa_deactivate_mos_feature(spa_t *spa, const char *feature)
+{
+	(void) nvlist_remove_all(spa->spa_label_features, feature);
+	vdev_config_dirty(spa->spa_root_vdev);
+}
+
 /*
  * Rename a spa_t.
  */
@@ -1175,12 +1205,22 @@ spa_generate_guid(spa_t *spa)
 void
 sprintf_blkptr(char *buf, const blkptr_t *bp)
 {
-	char *type = NULL;
+	char type[256];
 	char *checksum = NULL;
 	char *compress = NULL;
 
 	if (bp != NULL) {
-		type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
+			dmu_object_byteswap_t bswap =
+			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+			(void) snprintf(type, sizeof (type), "bswap %s %s",
+			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
+			    "metadata" : "data",
+			    dmu_ot_byteswap[bswap].ob_name);
+		} else {
+			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
+			    sizeof (type));
+		}
 		checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
 		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
 	}
@@ -1252,6 +1292,12 @@ spa_get_dsl(spa_t *spa)
 	return (spa->spa_dsl_pool);
 }
 
+boolean_t
+spa_is_initializing(spa_t *spa)
+{
+	return (spa->spa_is_initializing);
+}
+
 blkptr_t *
 spa_get_rootblkptr(spa_t *spa)
 {
@@ -1288,16 +1334,29 @@ spa_name(spa_t *spa)
 uint64_t
 spa_guid(spa_t *spa)
 {
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	uint64_t guid;
+
 	/*
 	 * If we fail to parse the config during spa_load(), we can go through
 	 * the error path (which posts an ereport) and end up here with no root
 	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
 	 * this case.
 	 */
-	if (spa->spa_root_vdev != NULL)
+	if (spa->spa_root_vdev == NULL)
+		return (spa->spa_config_guid);
+
+	guid = spa->spa_last_synced_guid != 0 ?
+	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+	/*
+	 * Return the most recently synced out guid unless we're
+	 * in syncing context.
+	 */
+	if (dp && dsl_pool_sync_context(dp))
 		return (spa->spa_root_vdev->vdev_guid);
 	else
-		return (spa->spa_config_guid);
+		return (guid);
 }
 
 uint64_t
@@ -1532,6 +1591,7 @@ spa_init(int mode)
 	vdev_cache_stat_init();
 	zfs_prop_init();
 	zpool_prop_init();
+	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
 }
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 17494bcda..838a6f642 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -20,6 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -669,7 +671,7 @@ txg_list_destroy(txg_list_t *tl)
 	mutex_destroy(&tl->tl_lock);
 }
 
-int
+boolean_t
 txg_list_empty(txg_list_t *tl, uint64_t txg)
 {
 	return (tl->tl_head[txg & TXG_MASK] == NULL);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index e374f6d78..b96975103 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1348,8 +1348,9 @@ vdev_validate(vdev_t *vd, boolean_t strict)
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		uint64_t aux_guid = 0;
 		nvlist_t *nvl;
+		uint64_t txg = strict ? spa->spa_config_txg : -1ULL;
 
-		if ((label = vdev_label_read_config(vd)) == NULL) {
+		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (0);
@@ -1532,7 +1533,7 @@ vdev_reopen(vdev_t *vd)
 		    !l2arc_vdev_present(vd))
 			l2arc_add_vdev(spa, vd);
 	} else {
-		(void) vdev_validate(vd, B_TRUE);
+		(void) vdev_validate(vd, spa_last_synced_txg(spa));
 	}
 
 	/*
@@ -1993,14 +1994,14 @@ vdev_validate_aux(vdev_t *vd)
 	if (!vdev_readable(vd))
 		return (0);
 
-	if ((label = vdev_label_read_config(vd)) == NULL) {
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
-	    version > SPA_VERSION ||
+	    !SPA_VERSION_IS_SUPPORTED(version) ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
 	    guid != vd->vdev_guid ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 7ac23500f..1fe36feb2 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -18,8 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -121,6 +123,8 @@
  * 	txg		Transaction group in which this label was written
  * 	pool_guid	Unique identifier for this pool
  * 	vdev_tree	An nvlist describing vdev tree.
+ *	features_for_read
+ *			An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
@@ -428,13 +432,23 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 	kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 }
 
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
+ */
 nvlist_t *
-vdev_label_read_config(vdev_t *vd)
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *config = NULL;
 	vdev_phys_t *vp;
 	zio_t *zio;
+	uint64_t best_txg = 0;
+	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE;
 	int l;
@@ -448,6 +462,7 @@ vdev_label_read_config(vdev_t *vd)
 
 retry:
 	for (l = 0; l < VDEV_LABELS; l++) {
+		nvlist_t *label = NULL;
 
 		zio = zio_root(spa, NULL, NULL, flags);
 
@@ -457,12 +472,31 @@ retry:
 
 		if (zio_wait(zio) == 0 &&
 		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
-		    &config, 0) == 0)
-			break;
+		    &label, 0) == 0) {
+			uint64_t label_txg = 0;
+
+			/*
+			 * Auxiliary vdevs won't have txg values in their
+			 * labels and newly added vdevs may not have been
+			 * completely initialized so just return the
+			 * configuration from the first valid label we
+			 * encounter.
+			 */
+			error = nvlist_lookup_uint64(label,
+			    ZPOOL_CONFIG_POOL_TXG, &label_txg);
+			if ((error || label_txg == 0) && !config) {
+				config = label;
+				break;
+			} else if (label_txg <= txg && label_txg > best_txg) {
+				best_txg = label_txg;
+				nvlist_free(config);
+				config = fnvlist_dup(label);
+			}
+		}
 
-		if (config != NULL) {
-			nvlist_free(config);
-			config = NULL;
+		if (label != NULL) {
+			nvlist_free(label);
+			label = NULL;
 		}
 	}
 
@@ -497,7 +531,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
 	/*
 	 * Read the label, if any, and perform some basic sanity checks.
 	 */
-	if ((label = vdev_label_read_config(vd)) == NULL)
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
 		return (B_FALSE);
 
 	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
@@ -838,7 +872,7 @@ retry:
  * come back up, we fail to see the uberblock for txg + 1 because, say,
  * it was on a mirrored device and the replica to which we wrote txg + 1
  * is now offline.  If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a new seconds we'll have two
+ * the missing replica comes back, then for a few seconds we'll have two
  * conflicting uberblocks on disk with the same txg.  The solution is simple:
  * among uberblocks with equal txg, choose the one with the latest timestamp.
  */
@@ -858,47 +892,49 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 	return (0);
 }
 
+struct ubl_cbdata {
+	uberblock_t	*ubl_ubbest;	/* Best uberblock */
+	vdev_t		*ubl_vd;	/* vdev associated with the above */
+};
+
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
+	vdev_t *vd = zio->io_vd;
 	spa_t *spa = zio->io_spa;
 	zio_t *rio = zio->io_private;
 	uberblock_t *ub = zio->io_data;
-	uberblock_t *ubbest = rio->io_private;
+	struct ubl_cbdata *cbp = rio->io_private;
 
-	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
 
 	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 		mutex_enter(&rio->io_lock);
 		if (ub->ub_txg <= spa->spa_load_max_txg &&
-		    vdev_uberblock_compare(ub, ubbest) > 0)
-			*ubbest = *ub;
+		    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+			/*
+			 * Keep track of the vdev in which this uberblock
+			 * was found. We will use this information later
+			 * to obtain the config nvlist associated with
+			 * this uberblock.
+			 */
+			*cbp->ubl_ubbest = *ub;
+			cbp->ubl_vd = vd;
+		}
 		mutex_exit(&rio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
 }
 
-void
-vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+    struct ubl_cbdata *cbp)
 {
-	spa_t *spa = vd->vdev_spa;
-	vdev_t *rvd = spa->spa_root_vdev;
-	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
-	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
 	int c, l, n;
 
-	if (vd == rvd) {
-		ASSERT(zio == NULL);
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		zio = zio_root(spa, NULL, ubbest, flags);
-		bzero(ubbest, sizeof (uberblock_t));
-	}
-
-	ASSERT(zio != NULL);
-
 	for (c = 0; c < vd->vdev_children; c++)
-		vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+		vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
 
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		for (l = 0; l < VDEV_LABELS; l++) {
@@ -911,11 +947,46 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
 			}
 		}
 	}
+}
 
-	if (vd == rvd) {
-		(void) zio_wait(zio);
-		spa_config_exit(spa, SCL_ALL, FTAG);
-	}
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+	zio_t *zio;
+	spa_t *spa = rvd->vdev_spa;
+	struct ubl_cbdata cb;
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+	ASSERT(ub);
+	ASSERT(config);
+
+	bzero(ub, sizeof (uberblock_t));
+	*config = NULL;
+
+	cb.ubl_ubbest = ub;
+	cb.ubl_vd = NULL;
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	zio = zio_root(spa, NULL, &cb, flags);
+	vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+	(void) zio_wait(zio);
+
+	/*
+	 * It's possible that the best uberblock was discovered on a label
+	 * that has a configuration which was written in a future txg.
+	 * Search all labels on this vdev to find the configuration that
+	 * matches the txg for our uberblock.
+	 */
+	if (cb.ubl_vd != NULL)
+		*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index fac54eab0..fd3021be6 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -946,6 +947,19 @@ fzap_prefetch(zap_name_t *zn)
  * Helper functions for consumers.
  */
 
+uint64_t
+zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+    const char *name, dmu_tx_t *tx)
+{
+	uint64_t new_obj;
+
+	VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
+	VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+	    tx) == 0);
+
+	return (new_obj);
+}
+
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name)
@@ -1080,6 +1094,16 @@ zap_add_int_key(objset_t *os, uint64_t obj,
 }
 
 int
+zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
 	char name[20];
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index d5b97dac9..3d8cae0d3 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zio.h>
@@ -461,7 +461,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
-		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
@@ -585,7 +585,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
-		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c
new file mode 100644
index 000000000..c09b32d17
--- /dev/null
+++ b/module/zfs/zfeature.c
@@ -0,0 +1,432 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfeature.h>
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include "zfeature_common.h"
+#include <sys/spa_impl.h>
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * guid rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature guids unique they should consist of the reverse dns name of the
+ * organization which implemented the feature and a short name for the feature,
+ * separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ *   1) If there is no reference count stored on disk the feature is disabled.
+ *   2) If the reference count is 0 a system administrator has enabled the
+ *      feature, but the feature has not been used yet, so no on-disk
+ *      format changes have been made.
+ *   3) If the reference count is greater than 0 the feature is active.
+ *      The format changes required by the feature are currently on disk.
+ *      Note that if the feature's format changes are reversed the feature
+ *      may choose to set its reference count back to 0.
+ *
+ * Feature flags makes no differentiation between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
+ * (5000). In order for this to work the pool is automatically upgraded to
+ * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk
+ * format changes will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature guid -> reference count
+ *    Features needed to open the pool for reading.
+ * 2) features_for_write: feature guid -> reference count
+ *    Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature guid -> descriptive string
+ *    A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these features
+ * before the pool is opened their names must be stored in the label in a
+ * new "features_for_read" entry (note that features that are only required
+ * to write to a pool never need to be stored in the label since the
+ * features_for_write ZAP object can be read before the pool is written to).
+ * To save space in the label features must be explicitly marked as needing to
+ * be written to the label. Also, reference counts are not stored in the label,
+ * instead any feature whose reference count drops to 0 is removed from the
+ * label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered it will appear as a "feature@<feature name>"
+ * property which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that store blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change. Later, after the destroyed dataset's
+ * blocks have all been asynchronously freed there is no longer any use for the
+ * bptree object, so it is destroyed and async_destroy's reference count is
+ * decremented back to 0 to indicate that it has undone its on-disk format
+ * changes.
+ */
+
+typedef enum {
+	FEATURE_ACTION_ENABLE,
+	FEATURE_ACTION_INCR,
+	FEATURE_ACTION_DECR,
+} feature_action_t;
+
+/*
+ * Checks that the features active in the specified object are supported by
+ * this software.  Adds each unsupported feature (name -> description) to
+ * the supplied nvlist.
+ */
+boolean_t
+feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj,
+    nvlist_t *unsup_feat, nvlist_t *enabled_feat)
+{
+	boolean_t supported;
+	zap_cursor_t *zc;
+	zap_attribute_t *za;
+	char *buf;
+
+	zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP);
+	za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP);
+	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+	supported = B_TRUE;
+	for (zap_cursor_init(zc, os, obj);
+	    zap_cursor_retrieve(zc, za) == 0;
+	    zap_cursor_advance(zc)) {
+		ASSERT(za->za_integer_length == sizeof (uint64_t) &&
+		    za->za_num_integers == 1);
+
+		if (NULL != enabled_feat) {
+			fnvlist_add_uint64(enabled_feat, za->za_name,
+			    za->za_first_integer);
+		}
+
+		if (za->za_first_integer != 0 &&
+		    !zfeature_is_supported(za->za_name)) {
+			supported = B_FALSE;
+
+			if (NULL != unsup_feat) {
+				char *desc = "";
+
+				if (zap_lookup(os, desc_obj, za->za_name,
+				    1, sizeof (buf), buf) == 0)
+					desc = buf;
+
+				VERIFY(nvlist_add_string(unsup_feat,
+				    za->za_name, desc) == 0);
+			}
+		}
+	}
+	zap_cursor_fini(zc);
+
+	kmem_free(buf, MAXPATHLEN);
+	kmem_free(za, sizeof(zap_attribute_t));
+	kmem_free(zc, sizeof(zap_cursor_t));
+
+	return (supported);
+}
+
+static int
+feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
+    zfeature_info_t *feature, uint64_t *res)
+{
+	int err;
+	uint64_t refcount;
+	uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+
+	/*
+	 * If the pool is currently being created, the feature objects may not
+	 * have been allocated yet.  Act as though all features are disabled.
+	 */
+	if (zapobj == 0)
+		return (ENOTSUP);
+
+	err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
+	    &refcount);
+	if (err != 0) {
+		if (err == ENOENT)
+			return (ENOTSUP);
+		else
+			return (err);
+	}
+	*res = refcount;
+	return (0);
+}
+
+static int
+feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj,
+    uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action,
+    dmu_tx_t *tx)
+{
+	int error;
+	uint64_t refcount;
+	uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+
+	ASSERT(0 != zapobj);
+	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+	error = zap_lookup(os, zapobj, feature->fi_guid,
+	    sizeof (uint64_t), 1, &refcount);
+
+	/*
+	 * If we can't ascertain the status of the specified feature, an I/O
+	 * error occurred.
+	 */
+	if (error != 0 && error != ENOENT)
+		return (error);
+
+	switch (action) {
+	case FEATURE_ACTION_ENABLE:
+		/*
+		 * If the feature is already enabled, ignore the request.
+		 */
+		if (error == 0)
+			return (0);
+		refcount = 0;
+		break;
+	case FEATURE_ACTION_INCR:
+		if (error == ENOENT)
+			return (ENOTSUP);
+		if (refcount == UINT64_MAX)
+			return (EOVERFLOW);
+		refcount++;
+		break;
+	case FEATURE_ACTION_DECR:
+		if (error == ENOENT)
+			return (ENOTSUP);
+		if (refcount == 0)
+			return (EOVERFLOW);
+		refcount--;
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	if (action == FEATURE_ACTION_ENABLE) {
+		int i;
+
+		for (i = 0; feature->fi_depends[i] != NULL; i++) {
+			zfeature_info_t *dep = feature->fi_depends[i];
+
+			error = feature_do_action(os, read_obj, write_obj,
+			    desc_obj, dep, FEATURE_ACTION_ENABLE, tx);
+			if (error != 0)
+				return (error);
+		}
+	}
+
+	error = zap_update(os, zapobj, feature->fi_guid,
+	    sizeof (uint64_t), 1, &refcount, tx);
+	if (error != 0)
+		return (error);
+
+	if (action == FEATURE_ACTION_ENABLE) {
+		error = zap_update(os, desc_obj,
+		    feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+		    feature->fi_desc, tx);
+		if (error != 0)
+			return (error);
+	}
+
+	if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) {
+		spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid);
+	}
+
+	if (action == FEATURE_ACTION_DECR && refcount == 0) {
+		spa_deactivate_mos_feature(dmu_objset_spa(os),
+		    feature->fi_guid);
+	}
+
+	return (0);
+}
+
+void
+spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
+{
+	/*
+	 * We create feature flags ZAP objects in two instances: during pool
+	 * creation and during pool upgrade.
+	 */
+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
+	    tx->tx_txg == TXG_INITIAL));
+
+	spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FEATURES_FOR_READ, tx);
+	spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FEATURES_FOR_WRITE, tx);
+	spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FEATURE_DESCRIPTIONS, tx);
+}
+
+/*
+ * Enable any required dependencies, then enable the requested feature.
+ */
+void
+spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
+	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
+}
+
+/*
+ * If the specified feature has not yet been enabled, this function returns
+ * ENOTSUP; otherwise, this function increments the feature's refcount (or
+ * returns EOVERFLOW if the refcount cannot be incremented). This function must
+ * be called from syncing context.
+ */
+void
+spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
+	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
+}
+
+/*
+ * If the specified feature has not yet been enabled, this function returns
+ * ENOTSUP; otherwise, this function decrements the feature's refcount (or
+ * returns EOVERFLOW if the refcount is already 0). This function must
+ * be called from syncing context.
+ */
+void
+spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
+	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
+}
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
+{
+	int err;
+	uint64_t refcount = 0;
+
+	if (spa_version(spa) < SPA_VERSION_FEATURES)
+		return (B_FALSE);
+
+	err = feature_get_refcount(spa->spa_meta_objset,
+	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+	    feature, &refcount);
+	ASSERT(err == 0 || err == ENOTSUP);
+	return (err == 0);
+}
+
+boolean_t
+spa_feature_is_active(spa_t *spa, zfeature_info_t *feature)
+{
+	int err;
+	uint64_t refcount = 0;
+
+	if (spa_version(spa) < SPA_VERSION_FEATURES)
+		return (B_FALSE);
+
+	err = feature_get_refcount(spa->spa_meta_objset,
+	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+	    feature, &refcount);
+	ASSERT(err == 0 || err == ENOTSUP);
+	return (err == 0 && refcount > 0);
+}
diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c
new file mode 100644
index 000000000..40066991b
--- /dev/null
+++ b/module/zfs/zfeature_common.c
@@ -0,0 +1,163 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <errno.h>
+#include <string.h>
+#endif
+#include <sys/debug.h>
+#include <sys/fs/zfs.h>
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include "zfeature_common.h"
+
+/*
+ * Set to disable all feature checks while opening pools, allowing pools with
+ * unsupported features to be opened. Set for testing only.
+ */
+boolean_t zfeature_checks_disable = B_FALSE;
+
+zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+/*
+ * Valid characters for feature guids. This list is mainly for aesthetic
+ * purposes and could be expanded in the future. There are different allowed
+ * characters in the guids reverse dns portion (before the colon) and its
+ * short name (after the colon).
+ */
+static int
+valid_char(char c, boolean_t after_colon)
+{
+	return ((c >= 'a' && c <= 'z') ||
+	    (c >= '0' && c <= '9') ||
+	    c == (after_colon ? '_' : '.'));
+}
+
+/*
+ * Every feature guid must contain exactly one colon which separates a reverse
+ * dns organization name from the feature's "short" name (e.g.
+ * "com.company:feature_name").
+ */
+boolean_t
+zfeature_is_valid_guid(const char *name)
+{
+	int i;
+	boolean_t has_colon = B_FALSE;
+
+	i = 0;
+	while (name[i] != '\0') {
+		char c = name[i++];
+		if (c == ':') {
+			if (has_colon)
+				return (B_FALSE);
+			has_colon = B_TRUE;
+			continue;
+		}
+		if (!valid_char(c, has_colon))
+			return (B_FALSE);
+	}
+
+	return (has_colon);
+}
+
+boolean_t
+zfeature_is_supported(const char *guid)
+{
+	if (zfeature_checks_disable)
+		return (B_TRUE);
+
+	return (0 == zfeature_lookup_guid(guid, NULL));
+}
+
+int
+zfeature_lookup_guid(const char *guid, zfeature_info_t **res)
+{
+	int i;
+
+	for (i = 0; i < SPA_FEATURES; i++) {
+		zfeature_info_t *feature = &spa_feature_table[i];
+		if (strcmp(guid, feature->fi_guid) == 0) {
+			if (res != NULL)
+				*res = feature;
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+int
+zfeature_lookup_name(const char *name, zfeature_info_t **res)
+{
+	int i;
+
+	for (i = 0; i < SPA_FEATURES; i++) {
+		zfeature_info_t *feature = &spa_feature_table[i];
+		if (strcmp(name, feature->fi_uname) == 0) {
+			if (res != NULL)
+				*res = feature;
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+static void
+zfeature_register(int fid, const char *guid, const char *name, const char *desc,
+    boolean_t readonly, boolean_t mos, zfeature_info_t **deps)
+{
+	zfeature_info_t *feature = &spa_feature_table[fid];
+	static zfeature_info_t *nodeps[] = { NULL };
+
+	ASSERT(name != NULL);
+	ASSERT(desc != NULL);
+	ASSERT(!readonly || !mos);
+	ASSERT3U(fid, <, SPA_FEATURES);
+	ASSERT(zfeature_is_valid_guid(guid));
+
+	if (deps == NULL)
+		deps = nodeps;
+
+	feature->fi_guid = guid;
+	feature->fi_uname = name;
+	feature->fi_desc = desc;
+	feature->fi_can_readonly = readonly;
+	feature->fi_mos = mos;
+	feature->fi_depends = deps;
+}
+
+void
+zpool_feature_init(void)
+{
+	zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
+	    "com.delphix:async_destroy", "async_destroy",
+	    "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
+	zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+	    "com.delphix:empty_bpobj", "empty_bpobj",
+	    "Snapshots use less space.", B_TRUE, B_FALSE, NULL);
+}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index c609203ea..98b1dd794 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska
@@ -1107,6 +1108,8 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp)
 /*
  * Find a zfs_sb_t for a mounted filesystem, or create our own, in which
  * case its z_sb will be NULL, and it will be opened as the owner.
+ * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
+ * which prevents all inode ops from running.
  */
 static int
 zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
@@ -1170,7 +1173,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
 
 		(void) nvlist_lookup_uint64(props,
 		    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
-		if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) {
+		if (!SPA_VERSION_IS_SUPPORTED(version)) {
 			error = EINVAL;
 			goto pool_props_bad;
 		}
@@ -1297,6 +1300,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name		name of the pool
+ *
+ * outputs:
+ * zc_cookie		real errno
+ * zc_nvlist_dst	config nvlist
+ * zc_nvlist_dst_size	size of config nvlist
+ */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
@@ -1398,7 +1410,8 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) {
+	if (zc->zc_cookie < spa_version(spa) ||
+	    !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
 		spa_close(spa, FTAG);
 		return (EINVAL);
 	}
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 220f2d79e..c9618c13e 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -480,6 +480,38 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
 }
 
 /*
+ * Called when we create in-memory log transactions so that we know
+ * to cleanup the itxs at the end of spa_sync().
+ */
+void
+zilog_dirty(zilog_t *zilog, uint64_t txg)
+{
+	dsl_pool_t *dp = zilog->zl_dmu_pool;
+	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+
+	if (dsl_dataset_is_snapshot(ds))
+		panic("dirtying snapshot!");
+
+	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) {
+		/* up the hold count until we can be written out */
+		dmu_buf_add_ref(ds->ds_dbuf, zilog);
+	}
+}
+
+boolean_t
+zilog_is_dirty(zilog_t *zilog)
+{
+	dsl_pool_t *dp = zilog->zl_dmu_pool;
+	int t;
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
  * Create an on-disk intent log.
  */
 static lwb_t *
@@ -601,14 +633,21 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
 	} else if (!keep_first) {
-		(void) zil_parse(zilog, zil_free_log_block,
-		    zil_free_log_record, tx, zh->zh_claim_txg);
+		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
+void
+zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
+	(void) zil_parse(zilog, zil_free_log_block,
+	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
+}
+
 int
 zil_claim(const char *osname, void *txarg)
 {
@@ -1042,6 +1081,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 		return (NULL);
 
 	ASSERT(lwb->lwb_buf != NULL);
+	ASSERT(zilog_is_dirty(zilog) ||
+	    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
 		dlen = P2ROUNDUP_TYPED(
@@ -1272,7 +1313,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
-	if (spa_freeze_txg(zilog->zl_spa) !=  UINT64_MAX)
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
@@ -1323,6 +1364,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 	}
 
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+	zilog_dirty(zilog, txg);
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
@@ -1332,7 +1374,10 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 
 /*
  * If there are any in-memory intent log transactions which have now been
- * synced then start up a taskq to free them.
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. txg has been comitted) so that
+ * don't inadvertently clean out in-memory log records that would be required
+ * by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
@@ -1837,6 +1882,7 @@ zil_close(zilog_t *zilog)
 	mutex_exit(&zilog->zl_lock);
 	if (txg)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	ASSERT(!zilog_is_dirty(zilog));
 
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index bfb817b78..638105a09 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -704,7 +704,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
-	    zp->zp_type < DMU_OT_NUMTYPES &&
+	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
 	    zp->zp_copies <= spa_max_replication(spa) &&
@@ -988,7 +988,7 @@ zio_read_bp_init(zio_t *zio)
 		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 	}
 
-	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
 	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
@@ -1248,7 +1248,7 @@ __zio_execute(zio_t *zio)
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
-		dsl_pool_t *dsl;
+		dsl_pool_t *dp;
 		boolean_t cut;
 		int rv;
 
@@ -1262,7 +1262,7 @@ __zio_execute(zio_t *zio)
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
-		dsl = spa_get_dsl(zio->io_spa);
+		dp = spa_get_dsl(zio->io_spa);
 		cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 		    zio_requeue_io_start_cut_in_line : B_FALSE;
 
@@ -1272,16 +1272,24 @@ __zio_execute(zio_t *zio)
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
-		 * If we are in the txg_sync_thread or being called
-		 * during pool init issue async to minimize stack depth.
-		 * Both of these call paths may be recursively called.
-		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
-		if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
-		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) ||
-		    (dsl != NULL && dsl_pool_sync_context(dsl))) {
+		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+			return;
+		}
+
+		/*
+		 * If we executing in the context of the tx_sync_thread,
+		 * or we are performing pool initialization outside of a
+		 * zio_taskq[ZIO_TASKQ_ISSUE] context.  Then issue the zio
+		 * async to minimize stack usage for these deep call paths.
+		 */
+		if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
+		    (dp && spa_is_initializing(dp->dp_spa) &&
+		    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE))) {
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
@@ -3131,6 +3139,48 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_done
 };
 
+/* dnp is the dnode for zb1->zb_object */
+boolean_t
+zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+    const zbookmark_t *zb2)
+{
+	uint64_t zb1nextL0, zb2thisobj;
+
+	ASSERT(zb1->zb_objset == zb2->zb_objset);
+	ASSERT(zb2->zb_level == 0);
+
+	/*
+	 * A bookmark in the deadlist is considered to be after
+	 * everything else.
+	 */
+	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+		return (B_TRUE);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	zb1nextL0 = (zb1->zb_blkid + 1) <<
+	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+	zb2thisobj = zb2->zb_object ? zb2->zb_object :
+	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+		uint64_t nextobj = zb1nextL0 *
+		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+		return (nextobj <= zb2thisobj);
+	}
+
+	if (zb1->zb_object < zb2thisobj)
+		return (B_TRUE);
+	if (zb1->zb_object > zb2thisobj)
+		return (B_FALSE);
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+		return (B_FALSE);
+	return (zb1nextL0 <= zb2->zb_blkid);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* Fault injection */
 EXPORT_SYMBOL(zio_injection_enabled);