8 files changed, 1331 insertions, 746 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 455e78a75..4edb8c0a6 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -102,7 +102,9 @@ usage(void)
 	(void) fprintf(stderr, "        -C cached pool configuration\n");
 	(void) fprintf(stderr, "	-i intent logs\n");
 	(void) fprintf(stderr, "	-b block statistics\n");
-	(void) fprintf(stderr, "	-c checksum all data blocks\n");
+	(void) fprintf(stderr, "	-m metaslabs\n");
+	(void) fprintf(stderr, "	-c checksum all metadata (twice for "
+	    "all data) blocks\n");
 	(void) fprintf(stderr, "	-s report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "	-S <user|all>:<cksum_alg|all> -- "
 	    "dump blkptr signatures\n");
@@ -125,6 +127,11 @@ usage(void)
 	exit(1);
 }
 
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
+ */
+
 static void
 fatal(const char *fmt, ...)
 {
@@ -136,7 +143,7 @@ fatal(const char *fmt, ...)
 	va_end(ap);
 	(void) fprintf(stderr, "\n");
 
-	abort();
+	exit(1);
 }
 
 static void
@@ -209,7 +216,7 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 	size_t nvsize = *(uint64_t *)data;
 	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
 
-	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed));
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
 
 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
 
@@ -435,7 +442,7 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 	alloc = 0;
 	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
 		VERIFY(0 == dmu_read(os, smo->smo_object, offset,
-		    sizeof (entry), &entry));
+		    sizeof (entry), &entry, DMU_READ_PREFETCH));
 		if (SM_DEBUG_DECODE(entry)) {
 			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
@@ -467,6 +474,21 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 }
 
 static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char maxbuf[5];
+	space_map_t *sm = &msp->ms_map;
+	avl_tree_t *t = sm->sm_pp_root;
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	nicenum(space_map_maxsize(sm), maxbuf);
+
+	(void) printf("\t %20s %10lu   %7s  %6s   %4s %4d%%\n",
+	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "freepct", free_pct);
+}
+
+static void
 dump_metaslab(metaslab_t *msp)
 {
 	char freebuf[5];
@@ -476,22 +498,28 @@ dump_metaslab(metaslab_t *msp)
 
 	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
 
-	if (dump_opt['d'] <= 5) {
-		(void) printf("\t%10llx   %10llu   %5s\n",
-		    (u_longlong_t)msp->ms_map.sm_start,
-		    (u_longlong_t)smo->smo_object,
-		    freebuf);
-		return;
-	}
-
 	(void) printf(
-	    "\tvdev %llu   offset %08llx   spacemap %4llu   free %5s\n",
+	    "\tvdev %5llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
 	    (u_longlong_t)smo->smo_object, freebuf);
 
-	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+	if (dump_opt['m'] > 1) {
+		mutex_enter(&msp->ms_lock);
+		VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
+		    SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+		dump_metaslab_stats(msp);
+		space_map_unload(&msp->ms_map);
+		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+		ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+
+		mutex_enter(&msp->ms_lock);
+		dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+		mutex_exit(&msp->ms_lock);
+	}
 
-	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
 }
 
 static void
@@ -506,14 +534,12 @@ dump_metaslabs(spa_t *spa)
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vd = rvd->vdev_child[c];
 
-		(void) printf("\n    vdev %llu\n\n", (u_longlong_t)vd->vdev_id);
+		(void) printf("\t%-10s   %-19s   %-15s   %-10s\n",
+		    "vdev", "offset", "spacemap", "free");
+		(void) printf("\t%10s   %19s   %15s   %10s\n",
+		    "----------", "-------------------",
+		    "---------------", "-------------");
 
-		if (dump_opt['d'] <= 5) {
-			(void) printf("\t%10s   %10s   %5s\n",
-			    "offset", "spacemap", "free");
-			(void) printf("\t%10s   %10s   %5s\n",
-			    "------", "--------", "----");
-		}
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
@@ -917,6 +943,7 @@ dump_uidgid(objset_t *os, znode_phys_t *zp)
 		/* first find the fuid object.  It lives in the master node */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &fuid_obj) == 0);
+		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		(void) zfs_fuid_table_load(os, fuid_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
@@ -1020,6 +1047,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
+	dump_zap,		/* ZFS user/group used		*/
+	dump_zap,		/* ZFS user/group quota		*/
 };
 
 static void
@@ -1083,6 +1112,14 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
 	}
 
 	if (verbosity >= 4) {
+		(void) printf("\tdnode flags: %s%s\n",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
+		    "USED_BYTES " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
+		    "USERUSED_ACCOUNTED " : "");
+		(void) printf("\tdnode maxblkid: %llu\n",
+		    (longlong_t)dn->dn_phys->dn_maxblkid);
+
 		object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
 		object_viewer[doi.doi_type](os, object, NULL, 0);
 		*print_header = 1;
@@ -1137,7 +1174,7 @@ dump_dir(objset_t *os)
 	uint64_t object, object_count;
 	uint64_t refdbytes, usedobjs, scratch;
 	char numbuf[8];
-	char blkbuf[BP_SPRINTF_LEN];
+	char blkbuf[BP_SPRINTF_LEN + 20];
 	char osname[MAXNAMELEN];
 	char *type = "UNKNOWN";
 	int verbosity = dump_opt['d'];
@@ -1163,8 +1200,8 @@ dump_dir(objset_t *os)
 	nicenum(refdbytes, numbuf);
 
 	if (verbosity >= 4) {
-		(void) strcpy(blkbuf, ", rootbp ");
-		sprintf_blkptr(blkbuf + strlen(blkbuf),
+		(void) sprintf(blkbuf + strlen(blkbuf), ", rootbp ");
+		(void) sprintf_blkptr(blkbuf + strlen(blkbuf),
 		    BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
@@ -1199,7 +1236,12 @@ dump_dir(objset_t *os)
 	}
 
 	dump_object(os, 0, verbosity, &print_header);
-	object_count = 1;
+	object_count = 0;
+	if (os->os->os_userused_dnode &&
+	    os->os->os_userused_dnode->dn_type != 0) {
+		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
+		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
+	}
 
 	object = 0;
 	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
@@ -1211,8 +1253,10 @@ dump_dir(objset_t *os)
 
 	(void) printf("\n");
 
-	if (error != ESRCH)
-		fatal("dmu_object_next() = %d", error);
+	if (error != ESRCH) {
+		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
+		abort();
+	}
 }
 
 static void
@@ -1395,7 +1439,8 @@ static space_map_ops_t zdb_space_map_ops = {
 	zdb_space_map_unload,
 	NULL,	/* alloc */
 	zdb_space_map_claim,
-	NULL	/* free */
+	NULL,	/* free */
+	NULL	/* maxsize */
 };
 
 static void
@@ -1505,13 +1550,25 @@ zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 {
 	zdb_cb_t *zcb = arg;
 	char blkbuf[BP_SPRINTF_LEN];
+	dmu_object_type_t type;
+	boolean_t is_l0_metadata;
 
 	if (bp == NULL)
 		return (0);
 
-	zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp));
+	type = BP_GET_TYPE(bp);
+
+	zdb_count_block(spa, zcb, bp, type);
 
-	if (dump_opt['c'] || dump_opt['S']) {
+	/*
+	 * if we do metadata-only checksumming there's no need to checksum
+	 * indirect blocks here because it is done during traverse
+	 */
+	is_l0_metadata = (BP_GET_LEVEL(bp) == 0 && type < DMU_OT_NUMTYPES &&
+	    dmu_ot[type].ot_metadata);
+
+	if (dump_opt['c'] > 1 || dump_opt['S'] ||
+	    (dump_opt['c'] && is_l0_metadata)) {
 		int ioerr, size;
 		void *data;
 
@@ -1523,7 +1580,7 @@ zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 		free(data);
 
 		/* We expect io errors on intent log */
-		if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) {
+		if (ioerr && type != DMU_OT_INTENT_LOG) {
 			zcb->zcb_haderrors = 1;
 			zcb->zcb_errors[ioerr]++;
 
@@ -1571,8 +1628,9 @@ dump_block_stats(spa_t *spa)
 	int c, e;
 
 	if (!dump_opt['S']) {
-		(void) printf("\nTraversing all blocks %s%s%s%s...\n",
+		(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
 		    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+		    (dump_opt['c'] == 1) ? "metadata " : "",
 		    dump_opt['c'] ? "checksums " : "",
 		    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 		    !dump_opt['L'] ? "nothing leaked " : "");
@@ -1772,14 +1830,17 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock);
 
-	if (dump_opt['d'] || dump_opt['i']) {
+	if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_bplist(dp->dp_meta_objset,
 			    spa->spa_sync_bplist_obj, "Deferred frees");
 			dump_dtl(spa->spa_root_vdev, 0);
-			dump_metaslabs(spa);
 		}
+
+		if (dump_opt['d'] >= 3 || dump_opt['m'])
+			dump_metaslabs(spa);
+
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL,
 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 	}
@@ -2255,13 +2316,14 @@ main(int argc, char **argv)
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) {
+	while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
 		switch (c) {
 		case 'u':
 		case 'd':
 		case 'i':
 		case 'b':
 		case 'c':
+		case 'm':
 		case 's':
 		case 'C':
 		case 'l':
@@ -2397,7 +2459,7 @@ main(int argc, char **argv)
 			}
 
 			if (error == 0)
-				error = spa_import_faulted(argv[0],
+				error = spa_import_verbatim(argv[0],
 				    exported_conf, nvl);
 
 			nvlist_free(nvl);
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 02d35a050..cc08ef514 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Print intent log header and statistics.
  */
@@ -345,8 +343,10 @@ dump_intent_log(zilog_t *zilog)
 	if (zh->zh_log.blk_birth == 0 || verbose < 2)
 		return;
 
-	(void) printf("\n    ZIL header: claim_txg %llu, seq %llu\n",
-	    (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_replay_seq);
+	(void) printf("\n    ZIL header: claim_txg %llu, claim_seq %llu",
+	    (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+	(void) printf(" replay_seq %llu, flags 0x%llx\n",
+	    (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
 
 	if (verbose >= 4)
 		print_log_bp(&zh->zh_log, "\n\tfirst block: ");
diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c
index a22370a02..ca5c2b232 100644
--- a/cmd/zfs/zfs_iter.c
+++ b/cmd/zfs/zfs_iter.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -53,11 +53,14 @@ typedef struct zfs_node {
 } zfs_node_t;
 
 typedef struct callback_data {
-	uu_avl_t	*cb_avl;
-	int		cb_flags;
-	zfs_type_t	cb_types;
-	zfs_sort_column_t *cb_sortcol;
-	zprop_list_t	**cb_proplist;
+	uu_avl_t		*cb_avl;
+	int			cb_flags;
+	zfs_type_t		cb_types;
+	zfs_sort_column_t	*cb_sortcol;
+	zprop_list_t		**cb_proplist;
+	int			cb_depth_limit;
+	int			cb_depth;
+	uint8_t			cb_props_table[ZFS_NUM_PROPS];
 } callback_data_t;
 
 uu_avl_pool_t *avl_pool;
@@ -98,10 +101,17 @@ zfs_callback(zfs_handle_t *zhp, void *data)
 		uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
 		if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
 		    &idx) == NULL) {
-			if (cb->cb_proplist &&
-			    zfs_expand_proplist(zhp, cb->cb_proplist) != 0) {
-				free(node);
-				return (-1);
+			if (cb->cb_proplist) {
+				if ((*cb->cb_proplist) &&
+				    !(*cb->cb_proplist)->pl_all)
+					zfs_prune_proplist(zhp,
+					    cb->cb_props_table);
+
+				if (zfs_expand_proplist(zhp, cb->cb_proplist)
+				    != 0) {
+					free(node);
+					return (-1);
+				}
 			}
 			uu_avl_insert(cb->cb_avl, node, idx);
 			dontclose = 1;
@@ -113,11 +123,15 @@ zfs_callback(zfs_handle_t *zhp, void *data)
 	/*
 	 * Recurse if necessary.
 	 */
-	if (cb->cb_flags & ZFS_ITER_RECURSE) {
+	if (cb->cb_flags & ZFS_ITER_RECURSE &&
+	    ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+	    cb->cb_depth < cb->cb_depth_limit)) {
+		cb->cb_depth++;
 		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
 			(void) zfs_iter_filesystems(zhp, zfs_callback, data);
 		if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps)
 			(void) zfs_iter_snapshots(zhp, zfs_callback, data);
+		cb->cb_depth--;
 	}
 
 	if (!dontclose)
@@ -325,10 +339,10 @@ zfs_sort(const void *larg, const void *rarg, void *data)
 
 int
 zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
-    zfs_sort_column_t *sortcol, zprop_list_t **proplist,
+    zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit,
     zfs_iter_f callback, void *data)
 {
-	callback_data_t cb;
+	callback_data_t cb = {0};
 	int ret = 0;
 	zfs_node_t *node;
 	uu_avl_walk_t *walk;
@@ -346,6 +360,45 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
 	cb.cb_flags = flags;
 	cb.cb_proplist = proplist;
 	cb.cb_types = types;
+	cb.cb_depth_limit = limit;
+	/*
+	 * If cb_proplist is provided then in the zfs_handles created  we
+	 * retain only those properties listed in cb_proplist and sortcol.
+	 * The rest are pruned. So, the caller should make sure that no other
+	 * properties other than those listed in cb_proplist/sortcol are
+	 * accessed.
+	 *
+	 * If cb_proplist is NULL then we retain all the properties.  We
+	 * always retain the zoned property, which some other properties
+	 * need (userquota & friends), and the createtxg property, which
+	 * we need to sort snapshots.
+	 */
+	if (cb.cb_proplist && *cb.cb_proplist) {
+		zprop_list_t *p = *cb.cb_proplist;
+
+		while (p) {
+			if (p->pl_prop >= ZFS_PROP_TYPE &&
+			    p->pl_prop < ZFS_NUM_PROPS) {
+				cb.cb_props_table[p->pl_prop] = B_TRUE;
+			}
+			p = p->pl_next;
+		}
+
+		while (sortcol) {
+			if (sortcol->sc_prop >= ZFS_PROP_TYPE &&
+			    sortcol->sc_prop < ZFS_NUM_PROPS) {
+				cb.cb_props_table[sortcol->sc_prop] = B_TRUE;
+			}
+			sortcol = sortcol->sc_next;
+		}
+
+		cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE;
+		cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE;
+	} else {
+		(void) memset(cb.cb_props_table, B_TRUE,
+		    sizeof (cb.cb_props_table));
+	}
+
 	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
 		(void) fprintf(stderr,
 		    gettext("internal error: out of memory\n"));
diff --git a/cmd/zfs/zfs_iter.h b/cmd/zfs/zfs_iter.h
index 76a11085a..a0290775b 100644
--- a/cmd/zfs/zfs_iter.h
+++ b/cmd/zfs/zfs_iter.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -41,9 +41,10 @@ typedef struct zfs_sort_column {
 #define	ZFS_ITER_RECURSE	   (1 << 0)
 #define	ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
 #define	ZFS_ITER_PROP_LISTSNAPS    (1 << 2)
+#define	ZFS_ITER_DEPTH_LIMIT	   (1 << 3)
 
 int zfs_for_each(int, char **, int options, zfs_type_t,
-    zfs_sort_column_t *, zprop_list_t **, zfs_iter_f, void *);
+    zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
 int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
 void zfs_free_sort_columns(zfs_sort_column_t *);
 
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index cf30df7d8..f28318ff5 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -39,12 +39,14 @@
 #include <unistd.h>
 #include <fcntl.h>
 #include <zone.h>
+#include <grp.h>
+#include <pwd.h>
 #include <sys/mkdev.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
-#include <sys/avl.h>
+#include <sys/fs/zfs.h>
 
 #include <libzfs.h>
 #include <libuutil.h>
@@ -56,6 +58,7 @@ libzfs_handle_t *g_zfs;
 
 static FILE *mnttab_file;
 static char history_str[HIS_MAX_RECORD_LEN];
+const char *pypath = "/usr/lib/zfs/pyzfs.py";
 
 static int zfs_do_clone(int argc, char **argv);
 static int zfs_do_create(int argc, char **argv);
@@ -75,8 +78,8 @@ static int zfs_do_unshare(int argc, char **argv);
 static int zfs_do_send(int argc, char **argv);
 static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
-static int zfs_do_allow(int argc, char **argv);
-static int zfs_do_unallow(int argc, char **argv);
+static int zfs_do_userspace(int argc, char **argv);
+static int zfs_do_python(int argc, char **argv);
 
 /*
  * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
@@ -116,7 +119,9 @@ typedef enum {
 	HELP_UNMOUNT,
 	HELP_UNSHARE,
 	HELP_ALLOW,
-	HELP_UNALLOW
+	HELP_UNALLOW,
+	HELP_USERSPACE,
+	HELP_GROUPSPACE
 } zfs_help_t;
 
 typedef struct zfs_command {
@@ -150,6 +155,8 @@ static zfs_command_t command_table[] = {
 	{ "get", 	zfs_do_get,		HELP_GET		},
 	{ "inherit",	zfs_do_inherit,		HELP_INHERIT		},
 	{ "upgrade",	zfs_do_upgrade,		HELP_UPGRADE		},
+	{ "userspace",	zfs_do_userspace,	HELP_USERSPACE		},
+	{ "groupspace",	zfs_do_userspace,	HELP_GROUPSPACE		},
 	{ NULL },
 	{ "mount",	zfs_do_mount,		HELP_MOUNT		},
 	{ "unmount",	zfs_do_unmount,		HELP_UNMOUNT		},
@@ -159,9 +166,9 @@ static zfs_command_t command_table[] = {
 	{ "send",	zfs_do_send,		HELP_SEND		},
 	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
 	{ NULL },
-	{ "allow",	zfs_do_allow,		HELP_ALLOW		},
+	{ "allow",	zfs_do_python,		HELP_ALLOW		},
 	{ NULL },
-	{ "unallow",	zfs_do_unallow,		HELP_UNALLOW		},
+	{ "unallow",	zfs_do_python,		HELP_UNALLOW		},
 };
 
 #define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
@@ -184,8 +191,8 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tdestroy [-rRf] "
 		    "<filesystem|volume|snapshot>\n"));
 	case HELP_GET:
-		return (gettext("\tget [-rHp] [-o field[,...]] "
-		    "[-s source[,...]]\n"
+		return (gettext("\tget [-rHp] [-d max] "
+		    "[-o field[,...]] [-s source[,...]]\n"
 		    "\t    <\"all\" | property[,...]> "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_INHERIT:
@@ -195,8 +202,8 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tupgrade [-v]\n"
 		    "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
 	case HELP_LIST:
-		return (gettext("\tlist [-rH] [-o property[,...]] "
-		    "[-t type[,...]] [-s property] ...\n"
+		return (gettext("\tlist [-rH][-d max] "
+		    "[-o property[,...]] [-t type[,...]] [-s property] ...\n"
 		    "\t    [-S property] ... "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
@@ -232,7 +239,8 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tunshare [-f] "
 		    "<-a | filesystem|mountpoint>\n"));
 	case HELP_ALLOW:
-		return (gettext("\tallow [-ldug] "
+		return (gettext("\tallow <filesystem|volume>\n"
+		    "\tallow [-ldug] "
 		    "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
 		    "\t    <filesystem|volume>\n"
 		    "\tallow [-ld] -e <perm|@setname>[,...] "
@@ -250,6 +258,14 @@ get_usage(zfs_help_t idx)
 		    "<filesystem|volume>\n"
 		    "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
 		    "<filesystem|volume>\n"));
+	case HELP_USERSPACE:
+		return (gettext("\tuserspace [-hniHp] [-o field[,...]] "
+		    "[-sS field] ... [-t type[,...]]\n"
+		    "\t    <filesystem|snapshot>\n"));
+	case HELP_GROUPSPACE:
+		return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] "
+		    "[-sS field] ... [-t type[,...]]\n"
+		    "\t    <filesystem|snapshot>\n"));
 	}
 
 	abort();
@@ -311,7 +327,6 @@ usage(boolean_t requested)
 {
 	int i;
 	boolean_t show_properties = B_FALSE;
-	boolean_t show_permissions = B_FALSE;
 	FILE *fp = requested ? stdout : stderr;
 
 	if (current_command == NULL) {
@@ -342,13 +357,7 @@ usage(boolean_t requested)
 	    strcmp(current_command->name, "list") == 0))
 		show_properties = B_TRUE;
 
-	if (current_command != NULL &&
-	    (strcmp(current_command->name, "allow") == 0 ||
-	    strcmp(current_command->name, "unallow") == 0))
-		show_permissions = B_TRUE;
-
 	if (show_properties) {
-
 		(void) fprintf(fp,
 		    gettext("\nThe following properties are supported:\n"));
 
@@ -359,16 +368,26 @@ usage(boolean_t requested)
 		(void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
 		    ZFS_TYPE_DATASET);
 
+		(void) fprintf(fp, "\t%-15s ", "userused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "groupused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "userquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "groupquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+
 		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
 		    "with standard units such as K, M, G, etc.\n"));
 		(void) fprintf(fp, gettext("\nUser-defined properties can "
 		    "be specified by using a name containing a colon (:).\n"));
-
-	} else if (show_permissions) {
-		(void) fprintf(fp,
-		    gettext("\nThe following permissions are supported:\n"));
-
-		zfs_deleg_permissions();
+		(void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ "
+		    "properties must be appended with\n"
+		    "a user or group specifier of one of these forms:\n"
+		    "    POSIX name      (eg: \"matt\")\n"
+		    "    POSIX id        (eg: \"126829\")\n"
+		    "    SMB name@domain (eg: \"matt@sun\")\n"
+		    "    SMB SID         (eg: \"S-1-234-567-89\")\n"));
 	} else {
 		(void) fprintf(fp,
 		    gettext("\nFor the property list, run: %s\n"),
@@ -415,6 +434,27 @@ parseprop(nvlist_t *props)
 	return (0);
 }
 
+static int
+parse_depth(char *opt, int *flags)
+{
+	char *tmp;
+	int depth;
+
+	depth = (int)strtol(opt, &tmp, 0);
+	if (*tmp) {
+		(void) fprintf(stderr,
+		    gettext("%s is not an integer\n"), optarg);
+		usage(B_FALSE);
+	}
+	if (depth < 0) {
+		(void) fprintf(stderr,
+		    gettext("Depth can not be negative.\n"));
+		usage(B_FALSE);
+	}
+	*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
+	return (depth);
+}
+
 /*
  * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
  *
@@ -1063,6 +1103,17 @@ get_callback(zfs_handle_t *zhp, void *data)
 			zprop_print_one_property(zfs_get_name(zhp), cbp,
 			    zfs_prop_to_name(pl->pl_prop),
 			    buf, sourcetype, source);
+		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+			sourcetype = ZPROP_SRC_LOCAL;
+
+			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+			    buf, sizeof (buf), cbp->cb_literal) != 0) {
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, buf, sourcetype, source);
 		} else {
 			if (nvlist_lookup_nvlist(userprop,
 			    pl->pl_user_prop, &propval) != 0) {
@@ -1102,6 +1153,7 @@ zfs_do_get(int argc, char **argv)
 	int i, c, flags = 0;
 	char *value, *fields;
 	int ret;
+	int limit = 0;
 	zprop_list_t fake_name = { 0 };
 
 	/*
@@ -1115,11 +1167,14 @@ zfs_do_get(int argc, char **argv)
 	cb.cb_type = ZFS_TYPE_DATASET;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) {
+	while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) {
 		switch (c) {
 		case 'p':
 			cb.cb_literal = B_TRUE;
 			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
 		case 'r':
 			flags |= ZFS_ITER_RECURSE;
 			break;
@@ -1250,7 +1305,7 @@ zfs_do_get(int argc, char **argv)
 
 	/* run for each object */
 	ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL,
-	    &cb.cb_proplist, get_callback, &cb);
+	    &cb.cb_proplist, limit, get_callback, &cb);
 
 	if (cb.cb_proplist == &fake_name)
 		zprop_free_list(fake_name.pl_next);
@@ -1363,10 +1418,10 @@ zfs_do_inherit(int argc, char **argv)
 
 	if (flags & ZFS_ITER_RECURSE) {
 		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
-		    NULL, NULL, inherit_recurse_cb, propname);
+		    NULL, NULL, 0, inherit_recurse_cb, propname);
 	} else {
 		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
-		    NULL, NULL, inherit_cb, propname);
+		    NULL, NULL, 0, inherit_cb, propname);
 	}
 
 	return (ret);
@@ -1435,21 +1490,30 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data)
 {
 	upgrade_cbdata_t *cb = data;
 	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
-
-	if (cb->cb_version >= ZPL_VERSION_FUID) {
-		int spa_version;
-
-		if (zfs_spa_version(zhp, &spa_version) < 0)
-			return (-1);
-
-		if (spa_version < SPA_VERSION_FUID) {
-			/* can't upgrade */
-			(void) printf(gettext("%s: can not be upgraded; "
-			    "the pool version needs to first be upgraded\nto "
-			    "version %d\n\n"),
-			    zfs_get_name(zhp), SPA_VERSION_FUID);
-			cb->cb_numfailed++;
-			return (0);
+	int i;
+	static struct { int zplver; int spaver; } table[] = {
+		{ZPL_VERSION_FUID, SPA_VERSION_FUID},
+		{ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+		{0, 0}
+	};
+
+
+	for (i = 0; table[i].zplver; i++) {
+		if (cb->cb_version >= table[i].zplver) {
+			int spa_version;
+
+			if (zfs_spa_version(zhp, &spa_version) < 0)
+				return (-1);
+
+			if (spa_version < table[i].spaver) {
+				/* can't upgrade */
+				(void) printf(gettext("%s: can not be "
+				    "upgraded; the pool version needs to first "
+				    "be upgraded\nto version %d\n\n"),
+				    zfs_get_name(zhp), table[i].spaver);
+				cb->cb_numfailed++;
+				return (0);
+			}
 		}
 	}
 
@@ -1550,6 +1614,8 @@ zfs_do_upgrade(int argc, char **argv)
 		(void) printf(gettext(" 2   Enhanced directory entries\n"));
 		(void) printf(gettext(" 3   Case insensitive and File system "
 		    "unique identifer (FUID)\n"));
+		(void) printf(gettext(" 4   userquota, groupquota "
+		    "properties\n"));
 		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
@@ -1561,7 +1627,7 @@ zfs_do_upgrade(int argc, char **argv)
 		if (cb.cb_version == 0)
 			cb.cb_version = ZPL_VERSION;
 		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
-		    NULL, NULL, upgrade_set_callback, &cb);
+		    NULL, NULL, 0, upgrade_set_callback, &cb);
 		(void) printf(gettext("%llu filesystems upgraded\n"),
 		    cb.cb_numupgraded);
 		if (cb.cb_numsamegraded) {
@@ -1579,14 +1645,14 @@ zfs_do_upgrade(int argc, char **argv)
 
 		flags |= ZFS_ITER_RECURSE;
 		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
-		    NULL, NULL, upgrade_list_callback, &cb);
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
 
 		found = cb.cb_foundone;
 		cb.cb_foundone = B_FALSE;
 		cb.cb_newer = B_TRUE;
 
 		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
-		    NULL, NULL, upgrade_list_callback, &cb);
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
 
 		if (!cb.cb_foundone && !found) {
 			(void) printf(gettext("All filesystems are "
@@ -1598,11 +1664,90 @@ zfs_do_upgrade(int argc, char **argv)
 }
 
 /*
- * list [-rH] [-o property[,property]...] [-t type[,type]...]
+ * zfs userspace
+ */
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+	zfs_userquota_prop_t *typep = arg;
+	zfs_userquota_prop_t p = *typep;
+	char *name = NULL;
+	char *ug, *propname;
+	char namebuf[32];
+	char sizebuf[32];
+
+	if (domain == NULL || domain[0] == '\0') {
+		if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) {
+			struct group *g = getgrgid(rid);
+			if (g)
+				name = g->gr_name;
+		} else {
+			struct passwd *p = getpwuid(rid);
+			if (p)
+				name = p->pw_name;
+		}
+	}
+
+	if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA)
+		ug = "group";
+	else
+		ug = "user";
+
+	if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED)
+		propname = "used";
+	else
+		propname = "quota";
+
+	if (name == NULL) {
+		(void) snprintf(namebuf, sizeof (namebuf),
+		    "%llu", (longlong_t)rid);
+		name = namebuf;
+	}
+	zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+
+	(void) printf("%s %s %s%c%s %s\n", propname, ug, domain,
+	    domain[0] ? '-' : ' ', name, sizebuf);
+
+	return (0);
+}
+
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	zfs_userquota_prop_t p;
+	int error;
+
+	/*
+	 * Try the python version.  If the execv fails, we'll continue
+	 * and do a simplistic implementation.
+	 */
+	(void) execv(pypath, argv-1);
+
+	(void) printf("internal error: %s not found\n"
+	    "falling back on built-in implementation, "
+	    "some features will not work\n", pypath);
+
+	if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL)
+		return (1);
+
+	(void) printf("PROP TYPE NAME VALUE\n");
+
+	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+		error = zfs_userspace(zhp, p, userspace_cb, &p);
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...]
  *      [-s property [-s property]...] [-S property [-S property]...]
  *      <dataset> ...
  *
  * 	-r	Recurse over all children
+ * 	-d	Limit recursion by depth.
  * 	-H	Scripted mode; elide headers and separate columns by tabs
  * 	-o	Control which fields to display.
  * 	-t	Control which object types to display.
@@ -1685,7 +1830,6 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
 			first = B_FALSE;
 		}
 
-		right_justify = B_FALSE;
 		if (pl->pl_prop != ZPROP_INVAL) {
 			if (zfs_prop_get(zhp, pl->pl_prop, property,
 			    sizeof (property), NULL, NULL, 0, B_FALSE) != 0)
@@ -1694,6 +1838,13 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
 				propstr = property;
 
 			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+			    property, sizeof (property), B_FALSE) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = B_TRUE;
 		} else {
 			if (nvlist_lookup_nvlist(userprops,
 			    pl->pl_user_prop, &propval) != 0)
@@ -1701,6 +1852,7 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
 			else
 				verify(nvlist_lookup_string(propval,
 				    ZPROP_VALUE, &propstr) == 0);
+			right_justify = B_FALSE;
 		}
 
 		width = pl->pl_width;
@@ -1752,16 +1904,20 @@ zfs_do_list(int argc, char **argv)
 	char *fields = NULL;
 	list_cbdata_t cb = { 0 };
 	char *value;
+	int limit = 0;
 	int ret;
 	zfs_sort_column_t *sortcol = NULL;
 	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) {
+	while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) {
 		switch (c) {
 		case 'o':
 			fields = optarg;
 			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
 		case 'r':
 			flags |= ZFS_ITER_RECURSE;
 			break;
@@ -1852,7 +2008,7 @@ zfs_do_list(int argc, char **argv)
 	cb.cb_first = B_TRUE;
 
 	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
-	    list_callback, &cb);
+	    limit, list_callback, &cb);
 
 	zprop_free_list(cb.cb_proplist);
 	zfs_free_sort_columns(sortcol);
@@ -2235,7 +2391,7 @@ zfs_do_set(int argc, char **argv)
 	}
 
 	ret = zfs_for_each(argc - 2, argv + 2, NULL,
-	    ZFS_TYPE_DATASET, NULL, NULL, set_callback, &cb);
+	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb);
 
 	return (ret);
 }
@@ -2495,390 +2651,6 @@ zfs_do_receive(int argc, char **argv)
 	return (err != 0);
 }
 
-typedef struct allow_cb {
-	int  a_permcnt;
-	size_t a_treeoffset;
-} allow_cb_t;
-
-static void
-zfs_print_perms(avl_tree_t *tree)
-{
-	zfs_perm_node_t *permnode;
-
-	permnode = avl_first(tree);
-	while (permnode != NULL) {
-		(void) printf("%s", permnode->z_pname);
-		permnode = AVL_NEXT(tree, permnode);
-		if (permnode)
-			(void) printf(",");
-		else
-			(void) printf("\n");
-	}
-}
-
-/*
- * Iterate over user/groups/everyone/... and the call perm_iter
- * function to print actual permission when tree has >0 nodes.
- */
-static void
-zfs_iter_perms(avl_tree_t *tree, const char *banner, allow_cb_t *cb)
-{
-	zfs_allow_node_t *item;
-	avl_tree_t *ptree;
-
-	item = avl_first(tree);
-	while (item) {
-		ptree = (void *)((char *)item + cb->a_treeoffset);
-		if (avl_numnodes(ptree)) {
-			if (cb->a_permcnt++ == 0)
-				(void) printf("%s\n", banner);
-			(void) printf("\t%s", item->z_key);
-			/*
-			 * Avoid an extra space being printed
-			 * for "everyone" which is keyed with a null
-			 * string
-			 */
-			if (item->z_key[0] != '\0')
-				(void) printf(" ");
-			zfs_print_perms(ptree);
-		}
-		item = AVL_NEXT(tree, item);
-	}
-}
-
-#define	LINES "-------------------------------------------------------------\n"
-static int
-zfs_print_allows(char *ds)
-{
-	zfs_allow_t *curperms, *perms;
-	zfs_handle_t *zhp;
-	allow_cb_t allowcb = { 0 };
-	char banner[MAXPATHLEN];
-
-	if (ds[0] == '-')
-		usage(B_FALSE);
-
-	if (strrchr(ds, '@')) {
-		(void) fprintf(stderr, gettext("Snapshots don't have 'allow'"
-		    " permissions\n"));
-		return (1);
-	}
-	if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL)
-		return (1);
-
-	if (zfs_perm_get(zhp, &perms)) {
-		(void) fprintf(stderr,
-		    gettext("Failed to retrieve 'allows' on %s\n"), ds);
-		zfs_close(zhp);
-		return (1);
-	}
-
-	zfs_close(zhp);
-
-	if (perms != NULL)
-		(void) printf("%s", LINES);
-	for (curperms = perms; curperms; curperms = curperms->z_next) {
-
-		(void) snprintf(banner, sizeof (banner),
-		    gettext("Permission sets on (%s)"), curperms->z_setpoint);
-		allowcb.a_treeoffset =
-		    offsetof(zfs_allow_node_t, z_localdescend);
-		allowcb.a_permcnt = 0;
-		zfs_iter_perms(&curperms->z_sets, banner, &allowcb);
-
-		(void) snprintf(banner, sizeof (banner),
-		    gettext("Create time permissions on (%s)"),
-		    curperms->z_setpoint);
-		allowcb.a_treeoffset =
-		    offsetof(zfs_allow_node_t, z_localdescend);
-		allowcb.a_permcnt = 0;
-		zfs_iter_perms(&curperms->z_crperms, banner, &allowcb);
-
-
-		(void) snprintf(banner, sizeof (banner),
-		    gettext("Local permissions on (%s)"), curperms->z_setpoint);
-		allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local);
-		allowcb.a_permcnt = 0;
-		zfs_iter_perms(&curperms->z_user, banner, &allowcb);
-		zfs_iter_perms(&curperms->z_group, banner, &allowcb);
-		zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
-
-		(void) snprintf(banner, sizeof (banner),
-		    gettext("Descendent permissions on (%s)"),
-		    curperms->z_setpoint);
-		allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend);
-		allowcb.a_permcnt = 0;
-		zfs_iter_perms(&curperms->z_user, banner, &allowcb);
-		zfs_iter_perms(&curperms->z_group, banner, &allowcb);
-		zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
-
-		(void) snprintf(banner, sizeof (banner),
-		    gettext("Local+Descendent permissions on (%s)"),
-		    curperms->z_setpoint);
-		allowcb.a_treeoffset =
-		    offsetof(zfs_allow_node_t, z_localdescend);
-		allowcb.a_permcnt = 0;
-		zfs_iter_perms(&curperms->z_user, banner, &allowcb);
-		zfs_iter_perms(&curperms->z_group, banner, &allowcb);
-		zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
-
-		(void) printf("%s", LINES);
-	}
-	zfs_free_allows(perms);
-	return (0);
-}
-
-#define	ALLOWOPTIONS "ldcsu:g:e"
-#define	UNALLOWOPTIONS "ldcsu:g:er"
-
-/*
- * Validate options, and build necessary datastructure to display/remove/add
- * permissions.
- * Returns 0 - If permissions should be added/removed
- * Returns 1 - If permissions should be displayed.
- * Returns -1 - on failure
- */
-int
-parse_allow_args(int *argc, char **argv[], boolean_t unallow,
-    char **ds, int *recurse, nvlist_t **zperms)
-{
-	int c;
-	char *options = unallow ? UNALLOWOPTIONS : ALLOWOPTIONS;
-	zfs_deleg_inherit_t deleg_type = ZFS_DELEG_NONE;
-	zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
-	char *who = NULL;
-	char *perms = NULL;
-	zfs_handle_t *zhp;
-
-	while ((c = getopt(*argc, *argv, options)) != -1) {
-		switch (c) {
-		case 'l':
-			if (who_type == ZFS_DELEG_CREATE ||
-			    who_type == ZFS_DELEG_NAMED_SET)
-				usage(B_FALSE);
-
-			deleg_type |= ZFS_DELEG_PERM_LOCAL;
-			break;
-		case 'd':
-			if (who_type == ZFS_DELEG_CREATE ||
-			    who_type == ZFS_DELEG_NAMED_SET)
-				usage(B_FALSE);
-
-			deleg_type |= ZFS_DELEG_PERM_DESCENDENT;
-			break;
-		case 'r':
-			*recurse = B_TRUE;
-			break;
-		case 'c':
-			if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-				usage(B_FALSE);
-			if (deleg_type)
-				usage(B_FALSE);
-			who_type = ZFS_DELEG_CREATE;
-			break;
-		case 's':
-			if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-				usage(B_FALSE);
-			if (deleg_type)
-				usage(B_FALSE);
-			who_type = ZFS_DELEG_NAMED_SET;
-			break;
-		case 'u':
-			if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-				usage(B_FALSE);
-			who_type = ZFS_DELEG_USER;
-			who = optarg;
-			break;
-		case 'g':
-			if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-				usage(B_FALSE);
-			who_type = ZFS_DELEG_GROUP;
-			who = optarg;
-			break;
-		case 'e':
-			if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-				usage(B_FALSE);
-			who_type = ZFS_DELEG_EVERYONE;
-			break;
-		default:
-			usage(B_FALSE);
-			break;
-		}
-	}
-
-	if (deleg_type == 0)
-		deleg_type = ZFS_DELEG_PERM_LOCALDESCENDENT;
-
-	*argc -= optind;
-	*argv += optind;
-
-	if (unallow == B_FALSE && *argc == 1) {
-		/*
-		 * Only print permissions if no options were processed
-		 */
-		if (optind == 1)
-			return (1);
-		else
-			usage(B_FALSE);
-	}
-
-	/*
-	 * initialize variables for zfs_build_perms based on number
-	 * of arguments.
-	 * 3 arguments ==>	zfs [un]allow joe perm,perm,perm <dataset> or
-	 *			zfs [un]allow -s @set1 perm,perm <dataset>
-	 * 2 arguments ==>	zfs [un]allow -c perm,perm <dataset> or
-	 *			zfs [un]allow -u|-g <name> perm <dataset> or
-	 *			zfs [un]allow -e perm,perm <dataset>
-	 *			zfs unallow joe <dataset>
-	 *			zfs unallow -s @set1 <dataset>
-	 * 1 argument  ==>	zfs [un]allow -e <dataset> or
-	 *			zfs [un]allow -c <dataset>
-	 */
-
-	switch (*argc) {
-	case 3:
-		perms = (*argv)[1];
-		who = (*argv)[0];
-		*ds = (*argv)[2];
-
-		/*
-		 * advance argc/argv for do_allow cases.
-		 * for do_allow case make sure who have a know who type
-		 * and its not a permission set.
-		 */
-		if (unallow == B_TRUE) {
-			*argc -= 2;
-			*argv += 2;
-		} else if (who_type != ZFS_DELEG_WHO_UNKNOWN &&
-		    who_type != ZFS_DELEG_NAMED_SET)
-			usage(B_FALSE);
-		break;
-
-	case 2:
-		if (unallow == B_TRUE && (who_type == ZFS_DELEG_EVERYONE ||
-		    who_type == ZFS_DELEG_CREATE || who != NULL)) {
-			perms = (*argv)[0];
-			*ds = (*argv)[1];
-		} else {
-			if (unallow == B_FALSE &&
-			    (who_type == ZFS_DELEG_WHO_UNKNOWN ||
-			    who_type == ZFS_DELEG_NAMED_SET))
-				usage(B_FALSE);
-			else if (who_type == ZFS_DELEG_WHO_UNKNOWN ||
-			    who_type == ZFS_DELEG_NAMED_SET)
-				who = (*argv)[0];
-			else if (who_type != ZFS_DELEG_NAMED_SET)
-				perms = (*argv)[0];
-			*ds = (*argv)[1];
-		}
-		if (unallow == B_TRUE) {
-			(*argc)--;
-			(*argv)++;
-		}
-		break;
-
-	case 1:
-		if (unallow == B_FALSE)
-			usage(B_FALSE);
-		if (who == NULL && who_type != ZFS_DELEG_CREATE &&
-		    who_type != ZFS_DELEG_EVERYONE)
-			usage(B_FALSE);
-		*ds = (*argv)[0];
-		break;
-
-	default:
-		usage(B_FALSE);
-	}
-
-	if (strrchr(*ds, '@')) {
-		(void) fprintf(stderr,
-		    gettext("Can't set or remove 'allow' permissions "
-		    "on snapshots.\n"));
-			return (-1);
-	}
-
-	if ((zhp = zfs_open(g_zfs, *ds, ZFS_TYPE_DATASET)) == NULL)
-		return (-1);
-
-	if ((zfs_build_perms(zhp, who, perms,
-	    who_type, deleg_type, zperms)) != 0) {
-		zfs_close(zhp);
-		return (-1);
-	}
-	zfs_close(zhp);
-	return (0);
-}
-
-static int
-zfs_do_allow(int argc, char **argv)
-{
-	char *ds;
-	nvlist_t *zperms = NULL;
-	zfs_handle_t *zhp;
-	int unused;
-	int ret;
-
-	if ((ret = parse_allow_args(&argc, &argv, B_FALSE, &ds,
-	    &unused, &zperms)) == -1)
-		return (1);
-
-	if (ret == 1)
-		return (zfs_print_allows(argv[0]));
-
-	if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL)
-		return (1);
-
-	if (zfs_perm_set(zhp, zperms)) {
-		zfs_close(zhp);
-		nvlist_free(zperms);
-		return (1);
-	}
-	nvlist_free(zperms);
-	zfs_close(zhp);
-
-	return (0);
-}
-
-static int
-unallow_callback(zfs_handle_t *zhp, void *data)
-{
-	nvlist_t *nvp = (nvlist_t *)data;
-	int error;
-
-	error = zfs_perm_remove(zhp, nvp);
-	if (error) {
-		(void) fprintf(stderr, gettext("Failed to remove permissions "
-		    "on %s\n"), zfs_get_name(zhp));
-	}
-	return (error);
-}
-
-static int
-zfs_do_unallow(int argc, char **argv)
-{
-	int recurse = B_FALSE;
-	char *ds;
-	int error;
-	nvlist_t *zperms = NULL;
-	int flags = 0;
-
-	if (parse_allow_args(&argc, &argv, B_TRUE,
-	    &ds, &recurse, &zperms) == -1)
-		return (1);
-
-	if (recurse)
-		flags |= ZFS_ITER_RECURSE;
-	error = zfs_for_each(argc, argv, flags,
-	    ZFS_TYPE_FILESYSTEM|ZFS_TYPE_VOLUME, NULL,
-	    NULL, unallow_callback, (void *)zperms);
-
-	if (zperms)
-		nvlist_free(zperms);
-
-	return (error);
-}
-
 typedef struct get_all_cbdata {
 	zfs_handle_t	**cb_handles;
 	size_t		cb_alloc;
@@ -3944,6 +3716,15 @@ zfs_do_unshare(int argc, char **argv)
 	return (unshare_unmount(OP_SHARE, argc, argv));
 }
 
+/* ARGSUSED */
+static int
+zfs_do_python(int argc, char **argv)
+{
+	(void) execv(pypath, argv-1);
+	(void) printf("internal error: %s not found\n", pypath);
+	return (-1);
+}
+
 /*
  * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint is
  * 'legacy'.  Otherwise, complain that use should be using 'zfs mount'.
@@ -4197,6 +3978,7 @@ main(int argc, char **argv)
 		/*
 		 * Run the appropriate command.
 		 */
+		libzfs_mnttab_cache(g_zfs, B_TRUE);
 		if (find_command_idx(cmdname, &i) == 0) {
 			current_command = &command_table[i];
 			ret = command_table[i].func(argc - 1, argv + 1);
@@ -4210,6 +3992,7 @@ main(int argc, char **argv)
 			usage(B_FALSE);
 			ret = 1;
 		}
+		libzfs_mnttab_cache(g_zfs, B_FALSE);
 	}
 
 	(void) fclose(mnttab_file);
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index 02dc18b9c..09c377ef8 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS Fault Injector
  *
@@ -224,7 +222,7 @@ usage(void)
 	    "\t\tClear the particular record (if given a numeric ID), or\n"
 	    "\t\tall records if 'all' is specificed.\n"
 	    "\n"
-	    "\tzinject -d device [-e errno] [-L <nvlist|uber>] pool\n"
+	    "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist' or 'uber'.\n"
 	    "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
@@ -516,7 +514,7 @@ main(int argc, char **argv)
 		return (0);
 	}
 
-	while ((c = getopt(argc, argv, ":ab:d:f:qhc:t:l:mr:e:uL:")) != -1) {
+	while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
@@ -553,6 +551,9 @@ main(int argc, char **argv)
 				return (1);
 			}
 			break;
+		case 'F':
+			record.zi_failfast = B_TRUE;
+			break;
 		case 'h':
 			usage();
 			return (0);
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index aeeff50fd..bcbee8b17 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -376,12 +376,11 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props,
 		}
 		normnm = zpool_prop_to_name(prop);
 	} else {
-		if ((fprop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
-			(void) fprintf(stderr, gettext("property '%s' is "
-			    "not a valid file system property\n"), propname);
-			return (2);
+		if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
+			normnm = zfs_prop_to_name(fprop);
+		} else {
+			normnm = propname;
 		}
-		normnm = zfs_prop_to_name(fprop);
 	}
 
 	if (nvlist_lookup_string(proplist, normnm, &strval) == 0 &&
@@ -979,14 +978,189 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
 	return (max);
 }
 
+typedef struct spare_cbdata {
+	uint64_t	cb_guid;
+	zpool_handle_t	*cb_zhp;
+} spare_cbdata_t;
+
+static boolean_t
+find_vdev(nvlist_t *nv, uint64_t search)
+{
+	uint64_t guid;
+	nvlist_t **child;
+	uint_t c, children;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+	    search == guid)
+		return (B_TRUE);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			if (find_vdev(child[c], search))
+				return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+static int
+find_spare(zpool_handle_t *zhp, void *data)
+{
+	spare_cbdata_t *cbp = data;
+	nvlist_t *config, *nvroot;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	if (find_vdev(nvroot, cbp->cb_guid)) {
+		cbp->cb_zhp = zhp;
+		return (1);
+	}
+
+	zpool_close(zhp);
+	return (0);
+}
+
+/*
+ * Print out configuration state as requested by status_callback.
+ */
+void
+print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
+    int namewidth, int depth, boolean_t isspare)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	vdev_stat_t *vs;
+	char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+	char *vname;
+	uint64_t notpresent;
+	spare_cbdata_t cb;
+	char *state;
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		children = 0;
+
+	state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+	if (isspare) {
+		/*
+		 * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
+		 * online drives.
+		 */
+		if (vs->vs_aux == VDEV_AUX_SPARED)
+			state = "INUSE";
+		else if (vs->vs_state == VDEV_STATE_HEALTHY)
+			state = "AVAIL";
+	}
+
+	(void) printf("\t%*s%-*s  %-8s", depth, "", namewidth - depth,
+	    name, state);
+
+	if (!isspare) {
+		zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+		zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+		zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
+		(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+	}
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+	    &notpresent) == 0) {
+		char *path;
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+		(void) printf("  was %s", path);
+	} else if (vs->vs_aux != 0) {
+		(void) printf("  ");
+
+		switch (vs->vs_aux) {
+		case VDEV_AUX_OPEN_FAILED:
+			(void) printf(gettext("cannot open"));
+			break;
+
+		case VDEV_AUX_BAD_GUID_SUM:
+			(void) printf(gettext("missing device"));
+			break;
+
+		case VDEV_AUX_NO_REPLICAS:
+			(void) printf(gettext("insufficient replicas"));
+			break;
+
+		case VDEV_AUX_VERSION_NEWER:
+			(void) printf(gettext("newer version"));
+			break;
+
+		case VDEV_AUX_SPARED:
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &cb.cb_guid) == 0);
+			if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
+				if (strcmp(zpool_get_name(cb.cb_zhp),
+				    zpool_get_name(zhp)) == 0)
+					(void) printf(gettext("currently in "
+					    "use"));
+				else
+					(void) printf(gettext("in use by "
+					    "pool '%s'"),
+					    zpool_get_name(cb.cb_zhp));
+				zpool_close(cb.cb_zhp);
+			} else {
+				(void) printf(gettext("currently in use"));
+			}
+			break;
+
+		case VDEV_AUX_ERR_EXCEEDED:
+			(void) printf(gettext("too many errors"));
+			break;
+
+		case VDEV_AUX_IO_FAILURE:
+			(void) printf(gettext("experienced I/O failures"));
+			break;
+
+		case VDEV_AUX_BAD_LOG:
+			(void) printf(gettext("bad intent log"));
+			break;
+
+		default:
+			(void) printf(gettext("corrupted data"));
+			break;
+		}
+	} else if (vs->vs_scrub_repaired != 0 && children == 0) {
+		/*
+		 * Report bytes resilvered/repaired on leaf devices.
+		 */
+		zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
+		(void) printf(gettext("  %s %s"), repaired,
+		    (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
+		    "resilvered" : "repaired");
+	}
+
+	(void) printf("\n");
+
+	for (c = 0; c < children; c++) {
+		uint64_t is_log = B_FALSE;
+
+		/* Don't print logs here */
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+		if (is_log)
+			continue;
+		vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+		print_status_config(zhp, vname, child[c],
+		    namewidth, depth + 2, isspare);
+		free(vname);
+	}
+}
+
 
 /*
  * Print the configuration of an exported pool.  Iterate over all vdevs in the
  * pool, printing out the name and status for each one.
  */
 void
-print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth,
-    boolean_t print_logs)
+print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
 {
 	nvlist_t **child;
 	uint_t c, children;
@@ -1043,12 +1217,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth,
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
 		    &is_log);
-		if ((is_log && !print_logs) || (!is_log && print_logs))
+		if (is_log)
 			continue;
 
 		vname = zpool_vdev_name(g_zfs, NULL, child[c]);
-		print_import_config(vname, child[c],
-		    namewidth, depth + 2, B_FALSE);
+		print_import_config(vname, child[c], namewidth, depth + 2);
 		free(vname);
 	}
 
@@ -1074,6 +1247,43 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth,
 }
 
 /*
+ * Print log vdevs.
+ * Logs are recorded as top level vdevs in the main pool child array
+ * but with "is_log" set to 1. We use either print_status_config() or
+ * print_import_config() to print the top level logs then any log
+ * children (eg mirrored slogs) are printed recursively - which
+ * works because only the top level vdev is marked "is_log"
+ */
+static void
+print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
+{
+	uint_t c, children;
+	nvlist_t **child;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) != 0)
+		return;
+
+	(void) printf(gettext("\tlogs\n"));
+
+	for (c = 0; c < children; c++) {
+		uint64_t is_log = B_FALSE;
+		char *name;
+
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+		if (!is_log)
+			continue;
+		name = zpool_vdev_name(g_zfs, zhp, child[c]);
+		if (verbose)
+			print_status_config(zhp, name, child[c], namewidth,
+			    2, B_FALSE);
+		else
+			print_import_config(name, child[c], namewidth, 2);
+		free(name);
+	}
+}
+/*
  * Display the status for the given pool.
  */
 static void
@@ -1241,11 +1451,9 @@ show_import(nvlist_t *config)
 	if (namewidth < 10)
 		namewidth = 10;
 
-	print_import_config(name, nvroot, namewidth, 0, B_FALSE);
-	if (num_logs(nvroot) > 0) {
-		(void) printf(gettext("\tlogs\n"));
-		print_import_config(name, nvroot, namewidth, 0, B_TRUE);
-	}
+	print_import_config(name, nvroot, namewidth, 0);
+	if (num_logs(nvroot) > 0)
+		print_logs(NULL, nvroot, namewidth, B_FALSE);
 
 	if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
 		(void) printf(gettext("\n\tAdditional devices are known to "
@@ -2427,10 +2635,14 @@ zpool_do_online(int argc, char **argv)
 	zpool_handle_t *zhp;
 	int ret = 0;
 	vdev_state_t newstate;
+	int flags = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "t")) != -1) {
+	while ((c = getopt(argc, argv, "et")) != -1) {
 		switch (c) {
+		case 'e':
+			flags |= ZFS_ONLINE_EXPAND;
+			break;
 		case 't':
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -2458,7 +2670,7 @@ zpool_do_online(int argc, char **argv)
 		return (1);
 
 	for (i = 1; i < argc; i++) {
-		if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) {
+		if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
 			if (newstate != VDEV_STATE_HEALTHY) {
 				(void) printf(gettext("warning: device '%s' "
 				    "onlined, but remains in faulted state\n"),
@@ -2715,181 +2927,6 @@ print_scrub_status(nvlist_t *nvroot)
 	    (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
 }
 
-typedef struct spare_cbdata {
-	uint64_t	cb_guid;
-	zpool_handle_t	*cb_zhp;
-} spare_cbdata_t;
-
-static boolean_t
-find_vdev(nvlist_t *nv, uint64_t search)
-{
-	uint64_t guid;
-	nvlist_t **child;
-	uint_t c, children;
-
-	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
-	    search == guid)
-		return (B_TRUE);
-
-	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children) == 0) {
-		for (c = 0; c < children; c++)
-			if (find_vdev(child[c], search))
-				return (B_TRUE);
-	}
-
-	return (B_FALSE);
-}
-
-static int
-find_spare(zpool_handle_t *zhp, void *data)
-{
-	spare_cbdata_t *cbp = data;
-	nvlist_t *config, *nvroot;
-
-	config = zpool_get_config(zhp, NULL);
-	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-	    &nvroot) == 0);
-
-	if (find_vdev(nvroot, cbp->cb_guid)) {
-		cbp->cb_zhp = zhp;
-		return (1);
-	}
-
-	zpool_close(zhp);
-	return (0);
-}
-
-/*
- * Print out configuration state as requested by status_callback.
- */
-void
-print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
-    int namewidth, int depth, boolean_t isspare, boolean_t print_logs)
-{
-	nvlist_t **child;
-	uint_t c, children;
-	vdev_stat_t *vs;
-	char rbuf[6], wbuf[6], cbuf[6], repaired[7];
-	char *vname;
-	uint64_t notpresent;
-	spare_cbdata_t cb;
-	char *state;
-
-	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
-	    (uint64_t **)&vs, &c) == 0);
-
-	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children) != 0)
-		children = 0;
-
-	state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
-	if (isspare) {
-		/*
-		 * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
-		 * online drives.
-		 */
-		if (vs->vs_aux == VDEV_AUX_SPARED)
-			state = "INUSE";
-		else if (vs->vs_state == VDEV_STATE_HEALTHY)
-			state = "AVAIL";
-	}
-
-	(void) printf("\t%*s%-*s  %-8s", depth, "", namewidth - depth,
-	    name, state);
-
-	if (!isspare) {
-		zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
-		zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
-		zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
-		(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
-	}
-
-	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-	    &notpresent) == 0) {
-		char *path;
-		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
-		(void) printf("  was %s", path);
-	} else if (vs->vs_aux != 0) {
-		(void) printf("  ");
-
-		switch (vs->vs_aux) {
-		case VDEV_AUX_OPEN_FAILED:
-			(void) printf(gettext("cannot open"));
-			break;
-
-		case VDEV_AUX_BAD_GUID_SUM:
-			(void) printf(gettext("missing device"));
-			break;
-
-		case VDEV_AUX_NO_REPLICAS:
-			(void) printf(gettext("insufficient replicas"));
-			break;
-
-		case VDEV_AUX_VERSION_NEWER:
-			(void) printf(gettext("newer version"));
-			break;
-
-		case VDEV_AUX_SPARED:
-			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
-			    &cb.cb_guid) == 0);
-			if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
-				if (strcmp(zpool_get_name(cb.cb_zhp),
-				    zpool_get_name(zhp)) == 0)
-					(void) printf(gettext("currently in "
-					    "use"));
-				else
-					(void) printf(gettext("in use by "
-					    "pool '%s'"),
-					    zpool_get_name(cb.cb_zhp));
-				zpool_close(cb.cb_zhp);
-			} else {
-				(void) printf(gettext("currently in use"));
-			}
-			break;
-
-		case VDEV_AUX_ERR_EXCEEDED:
-			(void) printf(gettext("too many errors"));
-			break;
-
-		case VDEV_AUX_IO_FAILURE:
-			(void) printf(gettext("experienced I/O failures"));
-			break;
-
-		case VDEV_AUX_BAD_LOG:
-			(void) printf(gettext("bad intent log"));
-			break;
-
-		default:
-			(void) printf(gettext("corrupted data"));
-			break;
-		}
-	} else if (vs->vs_scrub_repaired != 0 && children == 0) {
-		/*
-		 * Report bytes resilvered/repaired on leaf devices.
-		 */
-		zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
-		(void) printf(gettext("  %s %s"), repaired,
-		    (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
-		    "resilvered" : "repaired");
-	}
-
-	(void) printf("\n");
-
-	for (c = 0; c < children; c++) {
-		uint64_t is_log = B_FALSE;
-
-		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
-		    &is_log);
-		if ((is_log && !print_logs) || (!is_log && print_logs))
-			continue;
-		vname = zpool_vdev_name(g_zfs, zhp, child[c]);
-		print_status_config(zhp, vname, child[c],
-		    namewidth, depth + 2, isspare, B_FALSE);
-		free(vname);
-	}
-}
-
 static void
 print_error_log(zpool_handle_t *zhp)
 {
@@ -2940,7 +2977,7 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
 	for (i = 0; i < nspares; i++) {
 		name = zpool_vdev_name(g_zfs, zhp, spares[i]);
 		print_status_config(zhp, name, spares[i],
-		    namewidth, 2, B_TRUE, B_FALSE);
+		    namewidth, 2, B_TRUE);
 		free(name);
 	}
 }
@@ -2960,7 +2997,7 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
 	for (i = 0; i < nl2cache; i++) {
 		name = zpool_vdev_name(g_zfs, zhp, l2cache[i]);
 		print_status_config(zhp, name, l2cache[i],
-		    namewidth, 2, B_FALSE, B_FALSE);
+		    namewidth, 2, B_FALSE);
 		free(name);
 	}
 }
@@ -3190,11 +3227,10 @@ status_callback(zpool_handle_t *zhp, void *data)
 		(void) printf(gettext("\t%-*s  %-8s %5s %5s %5s\n"), namewidth,
 		    "NAME", "STATE", "READ", "WRITE", "CKSUM");
 		print_status_config(zhp, zpool_get_name(zhp), nvroot,
-		    namewidth, 0, B_FALSE, B_FALSE);
-		if (num_logs(nvroot) > 0)
-			print_status_config(zhp, "logs", nvroot, namewidth, 0,
-			    B_FALSE, B_TRUE);
+		    namewidth, 0, B_FALSE);
 
+		if (num_logs(nvroot) > 0)
+			print_logs(zhp, nvroot, namewidth, B_TRUE);
 		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 		    &l2cache, &nl2cache) == 0)
 			print_l2cache(zhp, l2cache, nl2cache, namewidth);
@@ -3418,7 +3454,7 @@ zpool_do_upgrade(int argc, char **argv)
 
 
 	/* check options */
-	while ((c = getopt(argc, argv, "avV:")) != -1) {
+	while ((c = getopt(argc, argv, ":avV:")) != -1) {
 		switch (c) {
 		case 'a':
 			cb.cb_all = B_TRUE;
@@ -3435,6 +3471,11 @@ zpool_do_upgrade(int argc, char **argv)
 				usage(B_FALSE);
 			}
 			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -3495,8 +3536,9 @@ zpool_do_upgrade(int argc, char **argv)
 		(void) printf(gettext(" 11  Improved scrub performance\n"));
 		(void) printf(gettext(" 12  Snapshot properties\n"));
 		(void) printf(gettext(" 13  snapused property\n"));
-		(void) printf(gettext(" 14  passthrough-x aclinherit "
-		    "support\n"));
+		(void) printf(gettext(" 14  passthrough-x aclinherit\n"));
+		(void) printf(gettext(" 15  user/group space accounting\n"));
+		(void) printf(gettext(" 16  stmf property support\n"));
 		(void) printf(gettext("For more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 56873ce3f..0566e3bf7 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -76,6 +76,7 @@
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
+#include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/dmu_objset.h>
 #include <sys/poll.h>
@@ -92,6 +93,7 @@
 #include <sys/vdev_file.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
 #include <sys/refcount.h>
 #include <stdio.h>
 #include <stdio_ext.h>
@@ -162,6 +164,7 @@ typedef void ztest_func_t(ztest_args_t *);
  * Note: these aren't static because we want dladdr() to work.
  */
 ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
 ztest_func_t ztest_zap;
@@ -170,6 +173,7 @@ ztest_func_t ztest_traverse;
 ztest_func_t ztest_dsl_prop_get_set;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_dsl_dataset_promote_busy;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
 ztest_func_t ztest_spa_rename;
@@ -196,6 +200,7 @@ uint64_t zopt_rarely = 60;		/* every 60 seconds */
 
 ztest_info_t ztest_info[] = {
 	{ ztest_dmu_read_write,			1,	&zopt_always	},
+	{ ztest_dmu_read_write_zcopy,		1,	&zopt_always	},
 	{ ztest_dmu_write_parallel,		30,	&zopt_always	},
 	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
 	{ ztest_zap,				30,	&zopt_always	},
@@ -208,6 +213,7 @@ ztest_info_t ztest_info[] = {
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
 	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
 	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
+	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
 	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
 	{ ztest_vdev_aux_add_remove,		1,	&zopt_vdevtime	},
 	{ ztest_scrub,				1,	&zopt_vdevtime	},
@@ -242,9 +248,11 @@ static ztest_shared_t *ztest_shared;
 static int ztest_random_fd;
 static int ztest_dump_core = 1;
 
+static uint64_t metaslab_sz;
 static boolean_t ztest_exiting;
 
 extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
 
 #define	ZTEST_DIROBJ		1
 #define	ZTEST_MICROZAP_OBJ	2
@@ -946,7 +954,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
 		 * of devices that have pending state changes.
 		 */
 		if (ztest_random(2) == 0)
-			(void) vdev_online(spa, guid, B_FALSE, NULL);
+			(void) vdev_online(spa, guid, 0, NULL);
 
 		error = spa_vdev_remove(spa, guid, B_FALSE);
 		if (error != 0 && error != EBUSY)
@@ -1024,7 +1032,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
 	}
 
 	oldguid = oldvd->vdev_guid;
-	oldsize = vdev_get_rsize(oldvd);
+	oldsize = vdev_get_min_asize(oldvd);
 	oldvd_is_log = oldvd->vdev_top->vdev_islog;
 	(void) strcpy(oldpath, oldvd->vdev_path);
 	pvd = oldvd->vdev_parent;
@@ -1060,7 +1068,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
 	}
 
 	if (newvd) {
-		newsize = vdev_get_rsize(newvd);
+		newsize = vdev_get_min_asize(newvd);
 	} else {
 		/*
 		 * Make newsize a little bigger or smaller than oldsize.
@@ -1136,49 +1144,202 @@ ztest_vdev_attach_detach(ztest_args_t *za)
 }
 
 /*
+ * Callback function which expands the physical size of the vdev.
+ */
+vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa = vd->vdev_spa;
+	size_t *newsize = arg;
+	size_t fsize;
+	int fd;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
+		return (vd);
+
+	fsize = lseek(fd, 0, SEEK_END);
+	(void) ftruncate(fd, *newsize);
+
+	if (zopt_verbose >= 6) {
+		(void) printf("%s grew from %lu to %lu bytes\n",
+		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
+	}
+	(void) close(fd);
+	return (NULL);
+}
+
+/*
+ * Callback function which expands a given vdev by calling vdev_online().
+ */
+/* ARGSUSED */
+vdev_t *
+online_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *tvd = vd->vdev_top;
+	vdev_t *pvd = vd->vdev_parent;
+	uint64_t guid = vd->vdev_guid;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	/* Calling vdev_online will initialize the new metaslabs */
+	spa_config_exit(spa, SCL_STATE, spa);
+	(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	/*
+	 * Since we dropped the lock we need to ensure that we're
+	 * still talking to the original vdev. It's possible this
+	 * vdev may have been detached/replaced while we were
+	 * trying to online it.
+	 */
+	if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) {
+		if (zopt_verbose >= 6) {
+			(void) printf("vdev %p has disappeared, was "
+			    "guid %llu\n", (void *)vd, (u_longlong_t)guid);
+		}
+		return (vd);
+	}
+	return (NULL);
+}
+
+/*
+ * Traverse the vdev tree calling the supplied function.
+ * We continue to walk the tree until we either have walked all
+ * children or we receive a non-NULL return from the callback.
+ * If a NULL callback is passed, then we just return back the first
+ * leaf vdev we encounter.
+ */
+vdev_t *
+vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
+{
+	if (vd->vdev_ops->vdev_op_leaf) {
+		if (func == NULL)
+			return (vd);
+		else
+			return (func(vd, arg));
+	}
+
+	for (uint_t c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+		if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
+			return (cvd);
+	}
+	return (NULL);
+}
+
+/*
  * Verify that dynamic LUN growth works as expected.
  */
 void
 ztest_vdev_LUN_growth(ztest_args_t *za)
 {
 	spa_t *spa = za->za_spa;
-	char dev_name[MAXPATHLEN];
-	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
-	uint64_t vdev;
-	size_t fsize;
-	int fd;
+	vdev_t *vd, *tvd = NULL;
+	size_t psize, newsize;
+	uint64_t spa_newsize, spa_cursize, ms_count;
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+	mutex_enter(&spa_namespace_lock);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	while (tvd == NULL || tvd->vdev_islog) {
+		uint64_t vdev;
+
+		vdev = ztest_random(spa->spa_root_vdev->vdev_children);
+		tvd = spa->spa_root_vdev->vdev_child[vdev];
+	}
 
 	/*
-	 * Pick a random leaf vdev.
+	 * Determine the size of the first leaf vdev associated with
+	 * our top-level device.
 	 */
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-	vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
-	spa_config_exit(spa, SCL_VDEV, FTAG);
+	vd = vdev_walk_tree(tvd, NULL, NULL);
+	ASSERT3P(vd, !=, NULL);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
-	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+	psize = vd->vdev_psize;
 
-	if ((fd = open(dev_name, O_RDWR)) != -1) {
-		/*
-		 * Determine the size.
-		 */
-		fsize = lseek(fd, 0, SEEK_END);
+	/*
+	 * We only try to expand the vdev if it's less than 4x its
+	 * original size and it has a valid psize.
+	 */
+	if (psize == 0 || psize >= 4 * zopt_vdev_size) {
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&spa_namespace_lock);
+		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		return;
+	}
+	ASSERT(psize > 0);
+	newsize = psize + psize / 8;
+	ASSERT3U(newsize, >, psize);
 
-		/*
-		 * If it's less than 2x the original size, grow by around 3%.
-		 */
-		if (fsize < 2 * zopt_vdev_size) {
-			size_t newsize = fsize + ztest_random(fsize / 32);
-			(void) ftruncate(fd, newsize);
-			if (zopt_verbose >= 6) {
-				(void) printf("%s grew from %lu to %lu bytes\n",
-				    dev_name, (ulong_t)fsize, (ulong_t)newsize);
-			}
+	if (zopt_verbose >= 6) {
+		(void) printf("Expanding vdev %s from %lu to %lu\n",
+		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+	}
+
+	spa_cursize = spa_get_space(spa);
+	ms_count = tvd->vdev_ms_count;
+
+	/*
+	 * Growing the vdev is a two step process:
+	 *	1). expand the physical size (i.e. relabel)
+	 *	2). online the vdev to create the new metaslabs
+	 */
+	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
+		if (zopt_verbose >= 5) {
+			(void) printf("Could not expand LUN because "
+			    "some vdevs were not healthy\n");
 		}
-		(void) close(fd);
+		(void) spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&spa_namespace_lock);
+		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		return;
 	}
 
+	(void) spa_config_exit(spa, SCL_STATE, spa);
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Expanding the LUN will update the config asynchronously,
+	 * thus we must wait for the async thread to complete any
+	 * pending tasks before proceeding.
+	 */
+	mutex_enter(&spa->spa_async_lock);
+	while (spa->spa_async_thread != NULL || spa->spa_async_tasks)
+		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+	mutex_exit(&spa->spa_async_lock);
+
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+	spa_newsize = spa_get_space(spa);
+
+	/*
+	 * Make sure we were able to grow the pool.
+	 */
+	if (ms_count >= tvd->vdev_ms_count ||
+	    spa_cursize >= spa_newsize) {
+		(void) printf("Top-level vdev metaslab count: "
+		    "before %llu, after %llu\n",
+		    (u_longlong_t)ms_count,
+		    (u_longlong_t)tvd->vdev_ms_count);
+		fatal(0, "LUN expansion failed: before %llu, "
+		    "after %llu\n", spa_cursize, spa_newsize);
+	} else if (zopt_verbose >= 5) {
+		char oldnumbuf[6], newnumbuf[6];
+
+		nicenum(spa_cursize, oldnumbuf);
+		nicenum(spa_newsize, newnumbuf);
+		(void) printf("%s grew from %s to %s\n",
+		    spa->spa_name, oldnumbuf, newnumbuf);
+	}
+	spa_config_exit(spa, SCL_STATE, spa);
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 }
 
@@ -1425,7 +1586,8 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
 	error = dmu_objset_destroy(snapname);
 	if (error != 0 && error != ENOENT)
 		fatal(0, "dmu_objset_destroy() = %d", error);
-	error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, FALSE);
+	error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
+	    NULL, FALSE);
 	if (error == ENOSPC)
 		ztest_record_enospc("dmu_take_snapshot");
 	else if (error != 0 && error != EEXIST)
@@ -1434,6 +1596,148 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
 }
 
 /*
+ * Cleanup non-standard snapshots and clones.
+ */
+void
+ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
+{
+	char snap1name[100];
+	char clone1name[100];
+	char snap2name[100];
+	char clone2name[100];
+	char snap3name[100];
+	int error;
+
+	(void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
+	(void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
+	(void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
+	(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
+	(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+
+	error = dmu_objset_destroy(clone2name);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
+	error = dmu_objset_destroy(snap3name);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
+	error = dmu_objset_destroy(snap2name);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
+	error = dmu_objset_destroy(clone1name);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
+	error = dmu_objset_destroy(snap1name);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
+}
+
+/*
+ * Verify dsl_dataset_promote handles EBUSY
+ */
+void
+ztest_dsl_dataset_promote_busy(ztest_args_t *za)
+{
+	int error;
+	objset_t *os = za->za_os;
+	objset_t *clone;
+	dsl_dataset_t *ds;
+	char snap1name[100];
+	char clone1name[100];
+	char snap2name[100];
+	char clone2name[100];
+	char snap3name[100];
+	char osname[MAXNAMELEN];
+	uint64_t curval = za->za_instance;
+
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+
+	dmu_objset_name(os, osname);
+	ztest_dsl_dataset_cleanup(osname, curval);
+
+	(void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
+	(void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
+	(void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
+	(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
+	(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+
+	error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
+	    NULL, FALSE);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_take_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
+	}
+
+	error = dmu_objset_open(snap1name, DMU_OST_OTHER,
+	    DS_MODE_USER | DS_MODE_READONLY, &clone);
+	if (error)
+		fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
+
+	error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0,
+	    NULL, NULL);
+	dmu_objset_close(clone);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_create");
+			goto out;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
+	}
+
+	error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
+	    NULL, FALSE);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_take_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
+	}
+
+	error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
+	    NULL, FALSE);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_take_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
+	}
+
+	error = dmu_objset_open(snap3name, DMU_OST_OTHER,
+	    DS_MODE_USER | DS_MODE_READONLY, &clone);
+	if (error)
+		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
+
+	error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0,
+	    NULL, NULL);
+	dmu_objset_close(clone);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_create");
+			goto out;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
+	}
+
+	error = dsl_dataset_own(snap1name, DS_MODE_READONLY, FTAG, &ds);
+	if (error)
+		fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
+	error = dsl_dataset_promote(clone2name);
+	if (error != EBUSY)
+		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
+		    error);
+	dsl_dataset_disown(ds, FTAG);
+
+out:
+	ztest_dsl_dataset_cleanup(osname, curval);
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
  * Verify that dmu_object_{alloc,free} work as expected.
  */
 void
@@ -1456,7 +1760,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
 	 * Create a batch object if necessary, and record it in the directory.
 	 */
 	VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &batchobj));
+	    sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
 	if (batchobj == 0) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
@@ -1481,7 +1785,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
 	 */
 	for (b = 0; b < batchsize; b++) {
 		VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
-		    sizeof (uint64_t), &object));
+		    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
 		if (object == 0)
 			continue;
 		/*
@@ -1516,7 +1820,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
 		 * We expect the word at endoff to be our object number.
 		 */
 		VERIFY(0 == dmu_read(os, object, endoff,
-		    sizeof (uint64_t), &temp));
+		    sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
 
 		if (temp != object) {
 			fatal(0, "bad data in %s, got %llu, expected %llu",
@@ -1701,7 +2005,7 @@ ztest_dmu_read_write(ztest_args_t *za)
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (dd), &dd));
+	    sizeof (dd), &dd, DMU_READ_PREFETCH));
 	if (dd.dd_chunk == 0) {
 		ASSERT(dd.dd_packobj == 0);
 		ASSERT(dd.dd_bigobj == 0);
@@ -1763,9 +2067,11 @@ ztest_dmu_read_write(ztest_args_t *za)
 	/*
 	 * Read the current contents of our objects.
 	 */
-	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
+	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
+	    DMU_READ_PREFETCH);
 	ASSERT3U(error, ==, 0);
-	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
+	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
+	    DMU_READ_PREFETCH);
 	ASSERT3U(error, ==, 0);
 
 	/*
@@ -1871,9 +2177,9 @@ ztest_dmu_read_write(ztest_args_t *za)
 		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 		VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
-		    packsize, packcheck));
+		    packsize, packcheck, DMU_READ_PREFETCH));
 		VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
-		    bigsize, bigcheck));
+		    bigsize, bigcheck, DMU_READ_PREFETCH));
 
 		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
@@ -1887,6 +2193,314 @@ ztest_dmu_read_write(ztest_args_t *za)
 }
 
 void
+compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
+    uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
+{
+	uint64_t i;
+	bufwad_t *pack;
+	bufwad_t *bigH;
+	bufwad_t *bigT;
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    pack->bw_txg, txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    pack->bw_index, n, i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		pack->bw_index = n + i;
+		pack->bw_txg = txg;
+		pack->bw_data = 1 + ztest_random(-2ULL);
+
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+}
+
+void
+ztest_dmu_read_write_zcopy(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	dmu_read_write_dir_t dd;
+	dmu_tx_t *tx;
+	uint64_t i;
+	int error;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf;
+	uint64_t packoff, packsize, bigoff, bigsize;
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 9;
+	dmu_buf_t *bonus_db;
+	arc_buf_t **bigbuf_arcbufs;
+	dmu_object_info_t *doi = &za->za_doi;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is set equal to bigobj block size so that
+	 * dmu_assign_arcbuf() can be tested for object updates.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+	    sizeof (dd), &dd, DMU_READ_PREFETCH));
+	if (dd.dd_chunk == 0) {
+		ASSERT(dd.dd_packobj == 0);
+		ASSERT(dd.dd_bigobj == 0);
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create r/w directory");
+			dmu_tx_abort(tx);
+			return;
+		}
+
+		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
+		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+
+		VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
+		ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
+		ASSERT(ISP2(doi->doi_data_block_size));
+		dd.dd_chunk = doi->doi_data_block_size;
+
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
+		    tx);
+		dmu_tx_commit(tx);
+	} else {
+		VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
+		VERIFY(ISP2(doi->doi_data_block_size));
+		VERIFY(dd.dd_chunk == doi->doi_data_block_size);
+		VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
+	}
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * dd.dd_chunk;
+	bigsize = s * dd.dd_chunk;
+
+	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
+
+	VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
+
+	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
+
+	/*
+	 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
+	 * Iteration 1 test zcopy to already referenced dbufs.
+	 * Iteration 2 test zcopy to dirty dbuf in the same txg.
+	 * Iteration 3 test zcopy to dbuf dirty in previous txg.
+	 * Iteration 4 test zcopy when dbuf is no longer dirty.
+	 * Iteration 5 test zcopy when it can't be done.
+	 * Iteration 6 one more zcopy write.
+	 */
+	for (i = 0; i < 7; i++) {
+		uint64_t j;
+		uint64_t off;
+
+		/*
+		 * In iteration 5 (i == 5) use arcbufs
+		 * that don't match bigobj blksz to test
+		 * dmu_assign_arcbuf() when it can't directly
+		 * assign an arcbuf to a dbuf.
+		 */
+		for (j = 0; j < s; j++) {
+			if (i != 5) {
+				bigbuf_arcbufs[j] =
+				    dmu_request_arcbuf(bonus_db,
+				    dd.dd_chunk);
+			} else {
+				bigbuf_arcbufs[2 * j] =
+				    dmu_request_arcbuf(bonus_db,
+				    dd.dd_chunk / 2);
+				bigbuf_arcbufs[2 * j + 1] =
+				    dmu_request_arcbuf(bonus_db,
+				    dd.dd_chunk / 2);
+			}
+		}
+
+		/*
+		 * Get a tx for the mods to both packobj and bigobj.
+		 */
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
+
+		if (ztest_random(100) == 0) {
+			error = -1;
+		} else {
+			error = dmu_tx_assign(tx, TXG_WAIT);
+		}
+
+		if (error) {
+			if (error != -1) {
+				ztest_record_enospc("dmu r/w range");
+			}
+			dmu_tx_abort(tx);
+			umem_free(packbuf, packsize);
+			umem_free(bigbuf, bigsize);
+			for (j = 0; j < s; j++) {
+				if (i != 5) {
+					dmu_return_arcbuf(bigbuf_arcbufs[j]);
+				} else {
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j]);
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j + 1]);
+				}
+			}
+			umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+			dmu_buf_rele(bonus_db, FTAG);
+			return;
+		}
+
+		txg = dmu_tx_get_txg(tx);
+
+		/*
+		 * 50% of the time don't read objects in the 1st iteration to
+		 * test dmu_assign_arcbuf() for the case when there're no
+		 * existing dbufs for the specified offsets.
+		 */
+		if (i != 0 || ztest_random(2) != 0) {
+			error = dmu_read(os, dd.dd_packobj, packoff,
+			    packsize, packbuf, DMU_READ_PREFETCH);
+			ASSERT3U(error, ==, 0);
+			error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
+			    bigbuf, DMU_READ_PREFETCH);
+			ASSERT3U(error, ==, 0);
+		}
+		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
+		    n, dd, txg);
+
+		/*
+		 * We've verified all the old bufwads, and made new ones.
+		 * Now write them out.
+		 */
+		dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+		if (zopt_verbose >= 6) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
+			dmu_buf_t *dbt;
+			if (i != 5) {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
+			} else {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[2 * j]->b_data,
+				    dd.dd_chunk / 2);
+				bcopy((caddr_t)bigbuf + (off - bigoff) +
+				    dd.dd_chunk / 2,
+				    bigbuf_arcbufs[2 * j + 1]->b_data,
+				    dd.dd_chunk / 2);
+			}
+
+			if (i == 1) {
+				VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
+				    FTAG, &dbt) == 0);
+			}
+			if (i != 5) {
+				dmu_assign_arcbuf(bonus_db, off,
+				    bigbuf_arcbufs[j], tx);
+			} else {
+				dmu_assign_arcbuf(bonus_db, off,
+				    bigbuf_arcbufs[2 * j], tx);
+				dmu_assign_arcbuf(bonus_db,
+				    off + dd.dd_chunk / 2,
+				    bigbuf_arcbufs[2 * j + 1], tx);
+			}
+			if (i == 1) {
+				dmu_buf_rele(dbt, FTAG);
+			}
+		}
+		dmu_tx_commit(tx);
+
+		/*
+		 * Sanity check the stuff we just wrote.
+		 */
+		{
+			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+			VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+			    packsize, packcheck, DMU_READ_PREFETCH));
+			VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+			    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+			umem_free(packcheck, packsize);
+			umem_free(bigcheck, bigsize);
+		}
+		if (i == 2) {
+			txg_wait_open(dmu_objset_pool(os), 0);
+		} else if (i == 3) {
+			txg_wait_synced(dmu_objset_pool(os), 0);
+		}
+	}
+
+	dmu_buf_rele(bonus_db, FTAG);
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+}
+
+void
 ztest_dmu_check_future_leak(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
@@ -1935,6 +2549,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 	uint64_t blkoff;
 	zbookmark_t zb;
 	dmu_tx_t *tx = dmu_tx_create(os);
+	dmu_buf_t *bonus_db;
+	arc_buf_t *abuf = NULL;
 
 	dmu_objset_name(os, osname);
 
@@ -1963,6 +2579,12 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 		}
 	}
 
+	if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
+	    ztest_random(8) == 0) {
+		VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
+		abuf = dmu_request_arcbuf(bonus_db, bs);
+	}
+
 	txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
 	error = dmu_tx_assign(tx, txg_how);
 	if (error) {
@@ -1973,6 +2595,10 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 			ztest_record_enospc("dmu write parallel");
 		}
 		dmu_tx_abort(tx);
+		if (abuf != NULL) {
+			dmu_return_arcbuf(abuf);
+			dmu_buf_rele(bonus_db, FTAG);
+		}
 		return;
 	}
 	txg = dmu_tx_get_txg(tx);
@@ -2027,8 +2653,12 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 		za->za_dbuf = NULL;
 	} else if (do_free) {
 		VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
-	} else {
+	} else if (abuf == NULL) {
 		dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
+	} else {
+		bcopy(wbt, abuf->b_data, btsize);
+		dmu_assign_arcbuf(bonus_db, off, abuf, tx);
+		dmu_buf_rele(bonus_db, FTAG);
 	}
 
 	(void) mutex_unlock(lp);
@@ -2064,16 +2694,20 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 	dmu_buf_rele(db, FTAG);
 	za->za_dbuf = NULL;
 
-	(void) mutex_unlock(lp);
-
-	if (error)
+	if (error) {
+		(void) mutex_unlock(lp);
 		return;
+	}
 
-	if (blk.blk_birth == 0)		/* concurrent free */
+	if (blk.blk_birth == 0)	{	/* concurrent free */
+		(void) mutex_unlock(lp);
 		return;
+	}
 
 	txg_suspend(dmu_objset_pool(os));
 
+	(void) mutex_unlock(lp);
+
 	ASSERT(blk.blk_fill == 1);
 	ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
 	ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
@@ -2146,7 +2780,7 @@ ztest_zap(ztest_args_t *za)
 	 * Create a new object if necessary, and record it in the directory.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &object));
+	    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
 
 	if (object == 0) {
 		tx = dmu_tx_create(os);
@@ -2799,7 +3433,7 @@ ztest_verify_blocks(char *pool)
 	isa = strdup(isa);
 	/* LINTED */
 	(void) sprintf(bin,
-	    "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s",
+	    "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s",
 	    isalen,
 	    isa,
 	    zopt_verbose >= 3 ? "s" : "",
@@ -2944,7 +3578,7 @@ ztest_resume(spa_t *spa)
 		spa_vdev_state_enter(spa);
 		vdev_clear(spa, NULL);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
-		zio_resume(spa);
+		(void) zio_resume(spa);
 	}
 }
 
@@ -3216,6 +3850,10 @@ ztest_run(char *pool)
 		(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
 		if (zopt_verbose >= 3)
 			(void) printf("Destroying %s to free up space\n", name);
+
+		/* Cleanup any non-standard clones and snapshots */
+		ztest_dsl_dataset_cleanup(name, za[d].za_instance);
+
 		(void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 		(void) rw_unlock(&ztest_shared->zs_name_lock);
@@ -3296,6 +3934,8 @@ ztest_init(char *pool)
 	if (error)
 		fatal(0, "spa_open() = %d", error);
 
+	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+
 	if (zopt_verbose >= 3)
 		show_pool_stats(spa);
 
@@ -3387,6 +4027,9 @@ main(int argc, char **argv)
 			zi->zi_call_time = 0;
 		}
 
+		/* Set the allocation switch size */
+		metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;
+
 		pid = fork();
 
 		if (pid == -1)