8 files changed, 167 insertions, 70 deletions
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index 7fe91bebe..edf362f7f 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -136,6 +136,7 @@ struct objset {
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_refresh_ownership(objset_t *os, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h
index ee0885a60..6442b20f7 100644
--- a/include/sys/dmu_send.h
+++ b/include/sys/dmu_send.h
@@ -55,12 +55,13 @@ typedef struct dmu_recv_cookie {
 	struct avl_tree *drc_guid_to_ds_map;
 	zio_cksum_t drc_cksum;
 	uint64_t drc_newsnapobj;
+	void *drc_owner;
 } dmu_recv_cookie_t;
 
 int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
     boolean_t force, char *origin, dmu_recv_cookie_t *drc);
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep);
-int dmu_recv_end(dmu_recv_cookie_t *drc);
+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
 
 #endif /* _DMU_SEND_H */
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h
index 866a8976c..65732adc5 100644
--- a/include/sys/dsl_dataset.h
+++ b/include/sys/dsl_dataset.h
@@ -248,7 +248,7 @@ void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
 
 int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
-    dsl_dataset_t *origin_head, boolean_t force);
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
 void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx);
 int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
@@ -265,7 +265,7 @@ int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
 int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx);
 void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx);
-int dsl_dataset_rollback(const char *fsname);
+int dsl_dataset_rollback(const char *fsname, void *owner);
 
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index f886254af..6a3b5f05e 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -517,6 +517,38 @@ dmu_objset_rele(objset_t *os, void *tag)
 	dsl_pool_rele(dp, tag);
 }
 
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag.  We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock to avoid intervening namespace or ownership
+ * changes may occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(objset_t *os, void *tag)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds, *newds;
+	char name[MAXNAMELEN];
+
+	ds = os->os_dsl_dataset;
+	VERIFY3P(ds, !=, NULL);
+	VERIFY3P(ds->ds_owner, ==, tag);
+	VERIFY(dsl_dataset_long_held(ds));
+
+	dsl_dataset_name(ds, name);
+	dp = dmu_objset_pool(os);
+	dsl_pool_config_enter(dp, FTAG);
+	dmu_objset_disown(os, tag);
+	VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
+	VERIFY3P(newds, ==, os->os_dsl_dataset);
+	dsl_pool_config_exit(dp, FTAG);
+}
+
 void
 dmu_objset_disown(objset_t *os, void *tag)
 {
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index c5eb2fb79..d38e2c512 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1612,7 +1612,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 			}
 		}
 		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
-		    origin_head, drc->drc_force);
+		    origin_head, drc->drc_force, drc->drc_owner, tx);
 		if (error != 0) {
 			dsl_dataset_rele(origin_head, FTAG);
 			return (error);
@@ -1685,6 +1685,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+		if (drc->drc_owner != NULL)
+			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 	} else {
 		dsl_dataset_t *ds = drc->drc_ds;
 
@@ -1787,8 +1790,10 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
 }
 
 int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
+	drc->drc_owner = owner;
+
 	if (drc->drc_newfs)
 		return (dmu_recv_new_end(drc));
 	else
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index b04ef69bb..ca0c4aae8 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -1669,16 +1669,52 @@ dsl_dataset_rename_snapshot(const char *fsname,
 	    dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
 }
 
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset.  We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here.  We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+	boolean_t held;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	if (owner != NULL) {
+		VERIFY3P(ds->ds_owner, ==, owner);
+		dsl_dataset_long_rele(ds, owner);
+	}
+
+	held = dsl_dataset_long_held(ds);
+
+	if (owner != NULL)
+		dsl_dataset_long_hold(ds, owner);
+
+	if (held)
+		return (SET_ERROR(EBUSY));
+
+	return (0);
+}
+
+typedef struct dsl_dataset_rollback_arg {
+	const char *ddra_fsname;
+	void *ddra_owner;
+} dsl_dataset_rollback_arg_t;
+
 static int
 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
-	const char *fsname = arg;
+	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int64_t unused_refres_delta;
 	int error;
 
-	error = dsl_dataset_hold(dp, fsname, FTAG, &ds);
+	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
@@ -1694,9 +1730,10 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 		return (SET_ERROR(EINVAL));
 	}
 
-	if (dsl_dataset_long_held(ds)) {
+	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
-		return (SET_ERROR(EBUSY));
+		return (error);
 	}
 
 	/*
@@ -1733,12 +1770,12 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 static void
 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 {
-	const char *fsname = arg;
+	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds, *clone;
 	uint64_t cloneobj;
 
-	VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
+	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
 
 	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
 	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
@@ -1754,11 +1791,26 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 	dsl_dataset_rele(ds, FTAG);
 }
 
+/*
+ * If owner != NULL:
+ *
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ *   succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted.  See
+ * notes above zfs_suspend_fs() for further details.
+ */
 int
-dsl_dataset_rollback(const char *fsname)
+dsl_dataset_rollback(const char *fsname, void *owner)
 {
+	dsl_dataset_rollback_arg_t ddra;
+
+	ddra.ddra_fsname = fsname;
+	ddra.ddra_owner = owner;
+
 	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
-	    dsl_dataset_rollback_sync, (void *)fsname, 1));
+	    dsl_dataset_rollback_sync, (void *)&ddra, 1));
 }
 
 struct promotenode {
@@ -2276,7 +2328,7 @@ dsl_dataset_promote(const char *name, char *conflsnap)
 
 int
 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
-    dsl_dataset_t *origin_head, boolean_t force)
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
 {
 	int64_t unused_refres_delta;
 
@@ -2305,7 +2357,7 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
 		return (SET_ERROR(ETXTBSY));
 
 	/* origin_head should have no long holds (e.g. is not mounted) */
-	if (dsl_dataset_long_held(origin_head))
+	if (dsl_dataset_handoff_check(origin_head, owner, tx))
 		return (SET_ERROR(EBUSY));
 
 	/* check amount of any unconsumed refreservation */
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index d6736c29a..556a0c940 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1349,7 +1349,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
 			/*
 			 * XXX we could probably try again, since the unmounting
 			 * thread should be just about to disassociate the
-			 * objset from the zfsvfs.
+			 * objset from the zsb.
 			 */
 			rrw_exit(&(*zsbp)->z_teardown_lock, tag);
 			return (SET_ERROR(EBUSY));
@@ -3504,13 +3504,13 @@ zfs_ioc_rollback(zfs_cmd_t *zc)
 		if (error == 0) {
 			int resume_err;
 
-			error = dsl_dataset_rollback(zc->zc_name);
+			error = dsl_dataset_rollback(zc->zc_name, zsb);
 			resume_err = zfs_resume_fs(zsb, zc->zc_name);
 			error = error ? error : resume_err;
 		}
 		deactivate_super(zsb->z_sb);
 	} else {
-		error = dsl_dataset_rollback(zc->zc_name);
+		error = dsl_dataset_rollback(zc->zc_name, NULL);
 	}
 	return (error);
 }
@@ -4038,13 +4038,13 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 			 * If the suspend fails, then the recv_end will
 			 * likely also fail, and clean up after itself.
 			 */
-			end_err = dmu_recv_end(&drc);
+			end_err = dmu_recv_end(&drc, zsb);
 			if (error == 0)
 				error = zfs_resume_fs(zsb, tofs);
 			error = error ? error : end_err;
 			deactivate_super(zsb->z_sb);
 		} else {
-			error = dmu_recv_end(&drc);
+			error = dmu_recv_end(&drc, NULL);
 		}
 	}
 
@@ -4528,8 +4528,11 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 			 * objset_phys_t).  Suspend/resume the fs will do that.
 			 */
 			error = zfs_suspend_fs(zsb);
-			if (error == 0)
+			if (error == 0) {
+				dmu_objset_refresh_ownership(zsb->z_os,
+				    zsb);
 				error = zfs_resume_fs(zsb, zc->zc_name);
+			}
 		}
 		if (error == 0)
 			error = dmu_objset_userspace_upgrade(zsb->z_os);
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index 06a5affa4..9fc6c6fe1 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1453,7 +1453,9 @@ EXPORT_SYMBOL(zfs_vget);
  * Block out VFS ops and close zfs_sb_t
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held.
+ * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
  */
 int
 zfs_suspend_fs(zfs_sb_t *zsb)
@@ -1463,8 +1465,6 @@ zfs_suspend_fs(zfs_sb_t *zsb)
 	if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0)
 		return (error);
 
-	dmu_objset_disown(zsb->z_os, zsb);
-
 	return (0);
 }
 EXPORT_SYMBOL(zfs_suspend_fs);
@@ -1476,66 +1476,69 @@ int
 zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
 {
 	int err;
+	znode_t *zp;
+	uint64_t sa_obj = 0;
 
 	ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
 
-	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os);
-	if (err) {
-		zsb->z_os = NULL;
-	} else {
-		znode_t *zp;
-		uint64_t sa_obj = 0;
+	/*
+	 * We already own this, so just hold and rele it to update the
+	 * objset_t, as the one we had before may have been evicted.
+	 */
+	VERIFY0(dmu_objset_hold(osname, zsb, &zsb->z_os));
+	VERIFY3P(zsb->z_os->os_dsl_dataset->ds_owner, ==, zsb);
+	VERIFY(dsl_dataset_long_held(zsb->z_os->os_dsl_dataset));
+	dmu_objset_rele(zsb->z_os, zsb);
 
-		/*
-		 * Make sure version hasn't changed
-		 */
+	/*
+	 * Make sure version hasn't changed
+	 */
 
-		err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION,
-		    &zsb->z_version);
+	err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION,
+	    &zsb->z_version);
 
-		if (err)
-			goto bail;
+	if (err)
+		goto bail;
 
-		err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ,
-		    ZFS_SA_ATTRS, 8, 1, &sa_obj);
+	err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ,
+	    ZFS_SA_ATTRS, 8, 1, &sa_obj);
 
-		if (err && zsb->z_version >= ZPL_VERSION_SA)
-			goto bail;
+	if (err && zsb->z_version >= ZPL_VERSION_SA)
+		goto bail;
 
-		if ((err = sa_setup(zsb->z_os, sa_obj,
-		    zfs_attr_table,  ZPL_END, &zsb->z_attr_table)) != 0)
-			goto bail;
+	if ((err = sa_setup(zsb->z_os, sa_obj,
+	    zfs_attr_table,  ZPL_END, &zsb->z_attr_table)) != 0)
+		goto bail;
 
-		if (zsb->z_version >= ZPL_VERSION_SA)
-			sa_register_update_callback(zsb->z_os,
-			    zfs_sa_upgrade);
+	if (zsb->z_version >= ZPL_VERSION_SA)
+		sa_register_update_callback(zsb->z_os,
+		    zfs_sa_upgrade);
 
-		VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
+	VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
 
-		zfs_set_fuid_feature(zsb);
-		zsb->z_rollback_time = jiffies;
+	zfs_set_fuid_feature(zsb);
+	zsb->z_rollback_time = jiffies;
 
-		/*
-		 * Attempt to re-establish all the active inodes with their
-		 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
-		 * and mark it stale.  This prevents a collision if a new
-		 * inode/object is created which must use the same inode
-		 * number.  The stale inode will be be released when the
-		 * VFS prunes the dentry holding the remaining references
-		 * on the stale inode.
-		 */
-		mutex_enter(&zsb->z_znodes_lock);
-		for (zp = list_head(&zsb->z_all_znodes); zp;
-		    zp = list_next(&zsb->z_all_znodes, zp)) {
-			err2 = zfs_rezget(zp);
-			if (err2) {
-				remove_inode_hash(ZTOI(zp));
-				zp->z_is_stale = B_TRUE;
-			}
+	/*
+	 * Attempt to re-establish all the active inodes with their
+	 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
+	 * and mark it stale.  This prevents a collision if a new
+	 * inode/object is created which must use the same inode
+	 * number.  The stale inode will be be released when the
+	 * VFS prunes the dentry holding the remaining references
+	 * on the stale inode.
+	 */
+	mutex_enter(&zsb->z_znodes_lock);
+	for (zp = list_head(&zsb->z_all_znodes); zp;
+	    zp = list_next(&zsb->z_all_znodes, zp)) {
+		err = zfs_rezget(zp);
+		if (err) {
+			remove_inode_hash(ZTOI(zp));
+			zp->z_is_stale = B_TRUE;
 		}
-		mutex_exit(&zsb->z_znodes_lock);
 	}
+	mutex_exit(&zsb->z_znodes_lock);
 
 bail:
 	/* release the VFS ops */
@@ -1544,8 +1547,8 @@ bail:
 
 	if (err) {
 		/*
-		 * Since we couldn't reopen zfs_sb_t or, or
-		 * setup the sa framework force unmount this file system.
+		 * Since we couldn't setup the sa framework, try to force
+		 * unmount this file system.
 		 */
 		if (zsb->z_os)
 			(void) zfs_umount(zsb->z_sb);