summary refs log tree commit diff stats
path: root/module/zfs
diff options
context:
space:
mode:
author Keith M Wesolowski <[email protected]> 2013-07-27 10:50:07 -0700
committer Brian Behlendorf <[email protected]> 2013-11-04 11:27:41 -0800
commit 831baf06efb3023ddee7ed41800d3b44521bf2ee (patch)
tree 01347f77efc0f3a717c3f143d960bf9e5db3d065 /module/zfs
parent 19580676295b4e271da63dce145bb17c3731d069 (diff)
Illumos #3875
3875 panic in zfs_root() after failed rollback

Reviewed by: Jerry Jelinek <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Approved by: Gordon Ross <[email protected]>

References:
  https://www.illumos.org/issues/3875
  illumos/illumos-gate@91948b51b8e978ddc88a36b2bc3ae83c20cdc9aa

Ported-by: Richard Yao <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #1775
Diffstat (limited to 'module/zfs')
-rw-r--r-- module/zfs/dmu_objset.c 32
-rw-r--r-- module/zfs/dmu_send.c 9
-rw-r--r-- module/zfs/dsl_dataset.c 72
-rw-r--r-- module/zfs/zfs_ioctl.c 15
-rw-r--r-- module/zfs/zfs_vfsops.c 101
5 files changed, 162 insertions, 67 deletions
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index f886254af..6a3b5f05e 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -517,6 +517,38 @@ dmu_objset_rele(objset_t *os, void *tag)
dsl_pool_rele(dp, tag);
}
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag. We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock, so that no intervening namespace or
+ * ownership changes can occur.
+ *
+ * The preconditions above are enforced with VERIFY. The dataset name is
+ * captured before ownership is dropped and is then used to own the dataset
+ * again; it is likewise VERIFYed that re-owning succeeds and that the dataset
+ * obtained by name is the very one 'os' already refers to.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(objset_t *os, void *tag)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds, *newds;
+ char name[MAXNAMELEN];
+
+ ds = os->os_dsl_dataset;
+ VERIFY3P(ds, !=, NULL);
+ VERIFY3P(ds->ds_owner, ==, tag);
+ VERIFY(dsl_dataset_long_held(ds));
+
+ dsl_dataset_name(ds, name);
+ dp = dmu_objset_pool(os);
+ dsl_pool_config_enter(dp, FTAG);
+ dmu_objset_disown(os, tag);
+ VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
+ VERIFY3P(newds, ==, os->os_dsl_dataset);
+ dsl_pool_config_exit(dp, FTAG);
+}
+
void
dmu_objset_disown(objset_t *os, void *tag)
{
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index c5eb2fb79..d38e2c512 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1612,7 +1612,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
}
}
error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
- origin_head, drc->drc_force);
+ origin_head, drc->drc_force, drc->drc_owner, tx);
if (error != 0) {
dsl_dataset_rele(origin_head, FTAG);
return (error);
@@ -1685,6 +1685,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(origin_head, FTAG);
dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+ if (drc->drc_owner != NULL)
+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
} else {
dsl_dataset_t *ds = drc->drc_ds;
@@ -1787,8 +1790,10 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
}
int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
+ drc->drc_owner = owner;
+
if (drc->drc_newfs)
return (dmu_recv_new_end(drc));
else
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index b04ef69bb..ca0c4aae8 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -1669,16 +1669,52 @@ dsl_dataset_rename_snapshot(const char *fsname,
dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
}
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset. We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here. We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ *
+ * 'owner' is the expected ds_owner of a dataset being handed off, or NULL
+ * when no handoff is taking place; with a NULL owner any long hold at all
+ * makes the dataset busy. With a non-NULL owner, that owner's long hold is
+ * dropped only for the duration of the dsl_dataset_long_held() test and is
+ * retaken before returning, whatever the outcome.
+ *
+ * Returns 0 if tx is not syncing or if no other long hold was found, and
+ * EBUSY otherwise.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+ boolean_t held;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ if (owner != NULL) {
+ VERIFY3P(ds->ds_owner, ==, owner);
+ dsl_dataset_long_rele(ds, owner);
+ }
+
+ held = dsl_dataset_long_held(ds);
+
+ if (owner != NULL)
+ dsl_dataset_long_hold(ds, owner);
+
+ if (held)
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+typedef struct dsl_dataset_rollback_arg {
+ const char *ddra_fsname; /* name of the dataset to roll back */
+ void *ddra_owner; /* owner handed to the handoff check, or NULL */
+} dsl_dataset_rollback_arg_t;
+
static int
dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
{
- const char *fsname = arg;
+ dsl_dataset_rollback_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int64_t unused_refres_delta;
int error;
- error = dsl_dataset_hold(dp, fsname, FTAG, &ds);
+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
if (error != 0)
return (error);
@@ -1694,9 +1730,10 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EINVAL));
}
- if (dsl_dataset_long_held(ds)) {
+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+ if (error != 0) {
dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EBUSY));
+ return (error);
}
/*
@@ -1733,12 +1770,12 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
static void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
- const char *fsname = arg;
+ dsl_dataset_rollback_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds, *clone;
uint64_t cloneobj;
- VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
@@ -1754,11 +1791,26 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG);
}
+/*
+ * If owner != NULL:
+ *
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ * succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted. See
+ * notes above zfs_suspend_fs() for further details.
+ */
int
-dsl_dataset_rollback(const char *fsname)
+dsl_dataset_rollback(const char *fsname, void *owner)
{
+ dsl_dataset_rollback_arg_t ddra;
+
+ ddra.ddra_fsname = fsname;
+ ddra.ddra_owner = owner;
+
return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
- dsl_dataset_rollback_sync, (void *)fsname, 1));
+ dsl_dataset_rollback_sync, (void *)&ddra, 1));
}
struct promotenode {
@@ -2276,7 +2328,7 @@ dsl_dataset_promote(const char *name, char *conflsnap)
int
dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, boolean_t force)
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
{
int64_t unused_refres_delta;
@@ -2305,7 +2357,7 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
return (SET_ERROR(ETXTBSY));
/* origin_head should have no long holds (e.g. is not mounted) */
- if (dsl_dataset_long_held(origin_head))
+ if (dsl_dataset_handoff_check(origin_head, owner, tx))
return (SET_ERROR(EBUSY));
/* check amount of any unconsumed refreservation */
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index d6736c29a..556a0c940 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1349,7 +1349,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
/*
* XXX we could probably try again, since the unmounting
* thread should be just about to disassociate the
- * objset from the zfsvfs.
+ * objset from the zsb.
*/
rrw_exit(&(*zsbp)->z_teardown_lock, tag);
return (SET_ERROR(EBUSY));
@@ -3504,13 +3504,13 @@ zfs_ioc_rollback(zfs_cmd_t *zc)
if (error == 0) {
int resume_err;
- error = dsl_dataset_rollback(zc->zc_name);
+ error = dsl_dataset_rollback(zc->zc_name, zsb);
resume_err = zfs_resume_fs(zsb, zc->zc_name);
error = error ? error : resume_err;
}
deactivate_super(zsb->z_sb);
} else {
- error = dsl_dataset_rollback(zc->zc_name);
+ error = dsl_dataset_rollback(zc->zc_name, NULL);
}
return (error);
}
@@ -4038,13 +4038,13 @@ zfs_ioc_recv(zfs_cmd_t *zc)
* If the suspend fails, then the recv_end will
* likely also fail, and clean up after itself.
*/
- end_err = dmu_recv_end(&drc);
+ end_err = dmu_recv_end(&drc, zsb);
if (error == 0)
error = zfs_resume_fs(zsb, tofs);
error = error ? error : end_err;
deactivate_super(zsb->z_sb);
} else {
- error = dmu_recv_end(&drc);
+ error = dmu_recv_end(&drc, NULL);
}
}
@@ -4528,8 +4528,11 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
* objset_phys_t). Suspend/resume the fs will do that.
*/
error = zfs_suspend_fs(zsb);
- if (error == 0)
+ if (error == 0) {
+ dmu_objset_refresh_ownership(zsb->z_os,
+ zsb);
error = zfs_resume_fs(zsb, zc->zc_name);
+ }
}
if (error == 0)
error = dmu_objset_userspace_upgrade(zsb->z_os);
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index 06a5affa4..9fc6c6fe1 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1453,7 +1453,9 @@ EXPORT_SYMBOL(zfs_vget);
* Block out VFS ops and close zfs_sb_t
*
* Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held.
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
*/
int
zfs_suspend_fs(zfs_sb_t *zsb)
@@ -1463,8 +1465,6 @@ zfs_suspend_fs(zfs_sb_t *zsb)
if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0)
return (error);
- dmu_objset_disown(zsb->z_os, zsb);
-
return (0);
}
EXPORT_SYMBOL(zfs_suspend_fs);
@@ -1476,66 +1476,69 @@ int
zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
{
int err;
+ znode_t *zp;
+ uint64_t sa_obj = 0;
ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
- err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os);
- if (err) {
- zsb->z_os = NULL;
- } else {
- znode_t *zp;
- uint64_t sa_obj = 0;
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ VERIFY0(dmu_objset_hold(osname, zsb, &zsb->z_os));
+ VERIFY3P(zsb->z_os->os_dsl_dataset->ds_owner, ==, zsb);
+ VERIFY(dsl_dataset_long_held(zsb->z_os->os_dsl_dataset));
+ dmu_objset_rele(zsb->z_os, zsb);
- /*
- * Make sure version hasn't changed
- */
+ /*
+ * Make sure version hasn't changed
+ */
- err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION,
- &zsb->z_version);
+ err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION,
+ &zsb->z_version);
- if (err)
- goto bail;
+ if (err)
+ goto bail;
- err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ,
- ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj);
- if (err && zsb->z_version >= ZPL_VERSION_SA)
- goto bail;
+ if (err && zsb->z_version >= ZPL_VERSION_SA)
+ goto bail;
- if ((err = sa_setup(zsb->z_os, sa_obj,
- zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0)
- goto bail;
+ if ((err = sa_setup(zsb->z_os, sa_obj,
+ zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0)
+ goto bail;
- if (zsb->z_version >= ZPL_VERSION_SA)
- sa_register_update_callback(zsb->z_os,
- zfs_sa_upgrade);
+ if (zsb->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(zsb->z_os,
+ zfs_sa_upgrade);
- VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
+ VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
- zfs_set_fuid_feature(zsb);
- zsb->z_rollback_time = jiffies;
+ zfs_set_fuid_feature(zsb);
+ zsb->z_rollback_time = jiffies;
- /*
- * Attempt to re-establish all the active inodes with their
- * dbufs. If a zfs_rezget() fails, then we unhash the inode
- * and mark it stale. This prevents a collision if a new
- * inode/object is created which must use the same inode
- * number. The stale inode will be be released when the
- * VFS prunes the dentry holding the remaining references
- * on the stale inode.
- */
- mutex_enter(&zsb->z_znodes_lock);
- for (zp = list_head(&zsb->z_all_znodes); zp;
- zp = list_next(&zsb->z_all_znodes, zp)) {
- err2 = zfs_rezget(zp);
- if (err2) {
- remove_inode_hash(ZTOI(zp));
- zp->z_is_stale = B_TRUE;
- }
+ /*
+ * Attempt to re-establish all the active inodes with their
+ * dbufs. If a zfs_rezget() fails, then we unhash the inode
+ * and mark it stale. This prevents a collision if a new
+ * inode/object is created which must use the same inode
+ * number. The stale inode will be released when the
+ * VFS prunes the dentry holding the remaining references
+ * on the stale inode.
+ *
+ * A zfs_rezget() failure is fully handled right here, so its
+ * result is deliberately kept out of 'err': a non-zero 'err'
+ * left behind by the last inode visited would make the cleanup
+ * after the 'bail' label force an unmount of the file system.
+ */
+ mutex_enter(&zsb->z_znodes_lock);
+ for (zp = list_head(&zsb->z_all_znodes); zp;
+ zp = list_next(&zsb->z_all_znodes, zp)) {
+ if (zfs_rezget(zp) != 0) {
+ remove_inode_hash(ZTOI(zp));
+ zp->z_is_stale = B_TRUE;
}
- mutex_exit(&zsb->z_znodes_lock);
}
+ mutex_exit(&zsb->z_znodes_lock);
bail:
/* release the VFS ops */
@@ -1544,8 +1547,8 @@ bail:
if (err) {
/*
- * Since we couldn't reopen zfs_sb_t or, or
- * setup the sa framework force unmount this file system.
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
*/
if (zsb->z_os)
(void) zfs_umount(zsb->z_sb);