Linux 3.18 compat: Snapshot auto-mounting

Re-factor the .zfs/snapshot auto-mounting code to take into account changes made to the upstream kernels, and to lay the groundwork for enabling access to .zfs snapshots via NFS clients. This patch makes the following core improvements. * All actively auto-mounted snapshots are now tracked in two global trees which are indexed by snapshot name and objset id respectively. This allows for fast lookups of any auto-mounted snapshot without needing access to the parent dataset. * Snapshot entries are added to the tree in zfsctl_snapshot_mount(). However, they are now removed from the tree in the context of the unmount process. This eliminates the need for complicated error logic in zfsctl_snapshot_unmount() to handle unmount failures. * References are now taken on the snapshot entries in the tree to ensure they always remain valid while a task is outstanding. * The MNT_SHRINKABLE flag is set on the snapshot vfsmount_t right after the auto-mount succeeds. This allows the kernel to unmount idle auto-mounted snapshots if needed, removing the need for the zfsctl_unmount_snapshots() function. * Snapshots in active use will not be automatically unmounted. As long as at least one dentry is revalidated every zfs_expire_snapshot/2 seconds the auto-unmount expiration timer will be extended. * Commit torvalds/linux@bafc9b7 caused snapshots auto-mounted by ZFS to be immediately unmounted when the dentry was revalidated. This was a consequence of ZFS invalidating all snapdir dentries to ensure that negative dentries didn't mask new snapshots. This patch modifies the behavior such that only negative dentries are invalidated. This solves the issue and may result in a performance improvement. Signed-off-by: Brian Behlendorf <[email protected]> Closes #3589 Closes #3344 Closes #3295 Closes #3257 Closes #3243 Closes #3030 Closes #2841
author: Brian Behlendorf <[email protected]> 2015-04-24 16:21:13 -0700
committer: Brian Behlendorf <[email protected]> 2015-08-31 13:54:39 -0700
commit: 278bee9319ba5947b995673d2c76e0333f2d33d4 (patch)
tree: a621142a2d5314b460cbf1a8bc6b6092f7a46424
parent: b23975cbe0f249671c131b0d6e4ae1bb10594440 (diff)
13 files changed, 496 insertions, 419 deletions
diff --git a/config/kernel-follow-down-one.m4 b/config/kernel-follow-down-one.m4
new file mode 100644
index 000000000..63fa779d8
--- /dev/null
+++ b/config/kernel-follow-down-one.m4
@@ -0,0 +1,20 @@
+dnl #
+dnl # 2.6.38 API change
+dnl # follow_down() renamed follow_down_one().  The original follow_down()
+dnl # symbol still exists but will traverse down all the layers.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_DOWN_ONE], [
+	AC_MSG_CHECKING([whether follow_down_one() is available])
+	ZFS_LINUX_TRY_COMPILE([
+		#include <linux/namei.h>
+	],[
+		struct path *p = NULL;
+		follow_down_one(p);
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_FOLLOW_DOWN_ONE, 1,
+		    [follow_down_one() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 380ca973f..f84612dbf 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -101,6 +101,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_VFS_ITERATE
 	ZFS_AC_KERNEL_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
+	ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
 
 	AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
 		KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h
index 40832d9dd..bcec1146a 100644
--- a/include/linux/vfs_compat.h
+++ b/include/linux/vfs_compat.h
@@ -352,4 +352,15 @@ static inline struct inode *file_inode(const struct file *f)
 }
 #endif /* HAVE_FILE_INODE */
 
+/*
+ * 2.6.38 API change
+ */
+#ifdef HAVE_FOLLOW_DOWN_ONE
+#define	zpl_follow_down_one(path)		follow_down_one(path)
+#define	zpl_follow_up(path)			follow_up(path)
+#else
+#define	zpl_follow_down_one(path)		follow_down(path)
+#define	zpl_follow_up(path)			follow_up(path)
+#endif
+
 #endif /* _ZFS_VFS_H */
diff --git a/include/sys/zfs_ctldir.h b/include/sys/zfs_ctldir.h
index 1ff23a298..960a9a629 100644
--- a/include/sys/zfs_ctldir.h
+++ b/include/sys/zfs_ctldir.h
@@ -32,6 +32,7 @@
 #define	_ZFS_CTLDIR_H
 
 #include <sys/vnode.h>
+#include <sys/pathname.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 
@@ -46,23 +47,16 @@
 	(zfs_has_ctldir(zdp) && \
 	(ZTOZSB(zdp)->z_show_ctldir))
 
-typedef struct {
-	char			*se_name;
-	char			*se_path;
-	struct inode		*se_inode;
-	taskqid_t		se_taskqid;
-	avl_node_t		se_node;
-} zfs_snapentry_t;
+extern int zfs_expire_snapshot;
 
 /* zfsctl generic functions */
-extern int snapentry_compare(const void *a, const void *b);
-extern boolean_t zfsctl_is_node(struct inode *ip);
-extern boolean_t zfsctl_is_snapdir(struct inode *ip);
-extern void zfsctl_inode_inactive(struct inode *ip);
-extern void zfsctl_inode_destroy(struct inode *ip);
 extern int zfsctl_create(zfs_sb_t *zsb);
 extern void zfsctl_destroy(zfs_sb_t *zsb);
 extern struct inode *zfsctl_root(znode_t *zp);
+extern void zfsctl_init(void);
+extern void zfsctl_fini(void);
+extern boolean_t zfsctl_is_node(struct inode *ip);
+extern boolean_t zfsctl_is_snapdir(struct inode *ip);
 extern int zfsctl_fid(struct inode *ip, fid_t *fidp);
 
 /* zfsctl '.zfs' functions */
@@ -81,9 +75,9 @@ extern int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr,
 extern int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
     struct inode **ipp, cred_t *cr, int flags);
 extern void zfsctl_snapdir_inactive(struct inode *ip);
-extern int zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags);
-extern int zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count);
-extern int zfsctl_mount_snapshot(struct path *path, int flags);
+extern int zfsctl_snapshot_mount(struct path *path, int flags);
+extern int zfsctl_snapshot_unmount(char *snapname, int flags);
+extern int zfsctl_snapshot_unmount_delay(uint64_t objsetid, int delay);
 extern int zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid,
     zfs_sb_t **zsb);
 
@@ -92,10 +86,6 @@ extern int zfsctl_shares_lookup(struct inode *dip, char *name,
     struct inode **ipp, int flags, cred_t *cr, int *direntflags,
     pathname_t *realpnp);
 
-/* zfsctl_init/fini functions */
-extern void zfsctl_init(void);
-extern void zfsctl_fini(void);
-
 /*
  * These inodes numbers are reserved for the .zfs control directory.
  * It is important that they be no larger that 48-bits because only
diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h
index 4073d82f8..af99686a6 100644
--- a/include/sys/zfs_vfsops.h
+++ b/include/sys/zfs_vfsops.h
@@ -72,11 +72,10 @@ typedef struct zfs_sb {
 	list_t		z_all_znodes;	/* all znodes in the fs */
 	uint64_t	z_nr_znodes;	/* number of znodes in the fs */
 	unsigned long	z_rollback_time; /* last online rollback time */
+	unsigned long	z_snap_defer_time; /* last snapshot unmount deferal */
 	kmutex_t	z_znodes_lock;	/* lock for z_all_znodes */
 	arc_prune_t	*z_arc_prune;	/* called by ARC to prune caches */
 	struct inode	*z_ctldir;	/* .zfs directory inode */
-	avl_tree_t	z_ctldir_snaps;	/* .zfs/snapshot entries */
-	kmutex_t	z_ctldir_lock;	/* .zfs ctldir lock */
 	boolean_t	z_show_ctldir;	/* expose .zfs in the root dir */
 	boolean_t	z_issnap;	/* true if this is a snapshot */
 	boolean_t	z_vscan;	/* virus scan on/off */
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index efa6cfa0a..f0aff7b45 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -88,6 +88,22 @@
 #include "zfs_namecheck.h"
 
 /*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots.  Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ *   - be attached to both trees, and
+ *   - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid.  This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static kmutex_t zfs_snapshot_lock;
+
+/*
  * Control Directory Tunables (.zfs)
  */
 int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
@@ -97,45 +113,116 @@ int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
  */
 static taskq_t *zfs_expire_taskq;
 
+typedef struct {
+	char		*se_name;	/* full snapshot name */
+	char		*se_path;	/* full mount path */
+	uint64_t	se_objsetid;	/* snapshot objset id */
+	struct dentry   *se_root_dentry; /* snapshot root dentry */
+	taskqid_t	se_taskqid;	/* scheduled unmount taskqid */
+	avl_node_t	se_node_name;	/* zfs_snapshots_by_name link */
+	avl_node_t	se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+	refcount_t	se_refcount;	/* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t being careful to make a copy of the
+ * snapshot name and provided mount point.  No reference is taken.
+ */
 static zfs_snapentry_t *
-zfsctl_sep_alloc(void)
+zfsctl_snapshot_alloc(char *full_name, char *full_path, uint64_t objsetid,
+    struct dentry *root_dentry)
 {
-	return (kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP));
+	zfs_snapentry_t *se;
+
+	se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+	se->se_name = strdup(full_name);
+	se->se_path = strdup(full_path);
+	se->se_objsetid = objsetid;
+	se->se_root_dentry = root_dentry;
+	se->se_taskqid = -1;
+
+	refcount_create(&se->se_refcount);
+
+	return (se);
 }
 
-void
-zfsctl_sep_free(zfs_snapentry_t *sep)
+/*
+ * Free a zfs_snapentry_t, the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
 {
-	kmem_free(sep->se_name, MAXNAMELEN);
-	kmem_free(sep->se_path, PATH_MAX);
-	kmem_free(sep, sizeof (zfs_snapentry_t));
+	refcount_destroy(&se->se_refcount);
+	strfree(se->se_name);
+	strfree(se->se_path);
+
+	kmem_free(se, sizeof (zfs_snapentry_t));
 }
 
 /*
- * Attempt to expire an automounted snapshot, unmounts are attempted every
- * 'zfs_expire_snapshot' seconds until they succeed.  The work request is
- * responsible for rescheduling itself and freeing the zfs_expire_snapshot_t.
+ * Hold a reference on the zfs_snapentry_t.
  */
 static void
-zfsctl_expire_snapshot(void *data)
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
 {
-	zfs_snapentry_t *sep = (zfs_snapentry_t *)data;
-	zfs_sb_t *zsb = ITOZSB(sep->se_inode);
-	int error;
+	refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t.  When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+	if (refcount_remove(&se->se_refcount, NULL) == 0)
+		zfsctl_snapshot_free(se);
+}
 
-	error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE);
-	if (error == EBUSY)
-		sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
-		    zfsctl_expire_snapshot, sep, TQ_SLEEP,
-		    ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+	ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+	refcount_add(&se->se_refcount, NULL);
+	avl_add(&zfs_snapshots_by_name, se);
+	avl_add(&zfs_snapshots_by_objsetid, se);
 }
 
-int
-snapentry_compare(const void *a, const void *b)
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+	ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+	avl_remove(&zfs_snapshots_by_name, se);
+	avl_remove(&zfs_snapshots_by_objsetid, se);
+	zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
 {
-	const zfs_snapentry_t *sa = a;
-	const zfs_snapentry_t *sb = b;
-	int ret = strcmp(sa->se_name, sb->se_name);
+	const zfs_snapentry_t *se_a = a;
+	const zfs_snapentry_t *se_b = b;
+	int ret;
+
+	ret = strcmp(se_a->se_name, se_b->se_name);
 
 	if (ret < 0)
 		return (-1);
@@ -145,12 +232,199 @@ snapentry_compare(const void *a, const void *b)
 		return (0);
 }
 
+/*
+ * Objset id comparison function for the zfs_snapshots_by_objsetid tree.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+	const zfs_snapentry_t *se_a = a;
+	const zfs_snapentry_t *se_b = b;
+
+	if (se_a->se_objsetid < se_b->se_objsetid)
+		return (-1);
+	else if (se_a->se_objsetid > se_b->se_objsetid)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure.  The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele().  If the snapname is not found
+ * NULL will be returned.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(char *snapname)
+{
+	zfs_snapentry_t *se, search;
+
+	ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+	search.se_name = snapname;
+	se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+	if (se)
+		refcount_add(&se->se_refcount, NULL);
+
+	return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname.  In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(uint64_t objsetid)
+{
+	zfs_snapentry_t *se, search;
+
+	ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+	search.se_objsetid = objsetid;
+	se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+	if (se)
+		refcount_add(&se->se_refcount, NULL);
+
+	return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name.  The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
+{
+	zfs_snapentry_t *se;
+
+	ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+	se = zfsctl_snapshot_find_by_name(old_snapname);
+	if (se == NULL)
+		return (ENOENT);
+
+	zfsctl_snapshot_remove(se);
+	strfree(se->se_name);
+	se->se_name = strdup(new_snapname);
+	zfsctl_snapshot_add(se);
+	zfsctl_snapshot_rele(se);
+
+	return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+	zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+	uint64_t objsetid = se->se_objsetid;
+
+	se->se_taskqid = -1;
+	(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+	zfsctl_snapshot_rele(se);
+
+	/*
+	 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+	 * This can occur when the snapshot is busy.
+	 */
+	mutex_enter(&zfs_snapshot_lock);
+	if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+		zfsctl_snapshot_rele(se);
+	}
+	mutex_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname.  This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken
+ * during dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+	ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
+
+	if (taskq_cancel_id(zfs_expire_taskq, se->se_taskqid) == 0) {
+		se->se_taskqid = -1;
+		zfsctl_snapshot_rele(se);
+	}
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+	ASSERT3S(se->se_taskqid, ==, -1);
+
+	se->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
+	    snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+	zfsctl_snapshot_hold(se);
+}
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now.  Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline.  A reference is taken at dispatch time and held until
+ * the outstanding task is handled or cancelled.
+ */
+int
+zfsctl_snapshot_unmount_delay(uint64_t objsetid, int delay)
+{
+	zfs_snapentry_t *se;
+	int error = ENOENT;
+
+	mutex_enter(&zfs_snapshot_lock);
+	if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+		zfsctl_snapshot_unmount_cancel(se);
+		zfsctl_snapshot_unmount_delay_impl(se, delay);
+		zfsctl_snapshot_rele(se);
+		error = 0;
+	}
+	mutex_exit(&zfs_snapshot_lock);
+
+	return (error);
+}
+
+/*
+ * Check if snapname is currently mounted.  Returns B_TRUE when mounted
+ * and B_FALSE when unmounted.
+ */
+static boolean_t
+zfsctl_snapshot_ismounted(char *snapname)
+{
+	zfs_snapentry_t *se;
+	boolean_t ismounted = B_FALSE;
+
+	mutex_enter(&zfs_snapshot_lock);
+	if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+		zfsctl_snapshot_rele(se);
+		ismounted = B_TRUE;
+	}
+	mutex_exit(&zfs_snapshot_lock);
+
+	return (ismounted);
+}
+
+/*
+ * Check if the given inode is a part of the virtual .zfs directory.
+ */
 boolean_t
 zfsctl_is_node(struct inode *ip)
 {
 	return (ITOZ(ip)->z_is_ctldir);
 }
 
+/*
+ * Check if the given inode is a .zfs/snapshot/snapname directory.
+ */
 boolean_t
 zfsctl_is_snapdir(struct inode *ip)
 {
@@ -250,24 +524,6 @@ zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id,
 }
 
 /*
- * Free zfsctl inode specific structures, currently there are none.
- */
-void
-zfsctl_inode_destroy(struct inode *ip)
-{
-}
-
-/*
- * An inode is being evicted from the cache.
- */
-void
-zfsctl_inode_inactive(struct inode *ip)
-{
-	if (zfsctl_is_snapdir(ip))
-		zfsctl_snapdir_inactive(ip);
-}
-
-/*
  * Create the '.zfs' directory.  This directory is cached as part of the VFS
  * structure.  This results in a hold on the zfs_sb_t.  The code in zfs_umount()
  * therefore checks against a vfs_count of 2 instead of 1.  This reference
@@ -295,13 +551,27 @@ zfsctl_create(zfs_sb_t *zsb)
 }
 
 /*
- * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
+ * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
+ * Only called when the filesystem is unmounted.
  */
 void
 zfsctl_destroy(zfs_sb_t *zsb)
 {
-	iput(zsb->z_ctldir);
-	zsb->z_ctldir = NULL;
+	if (zsb->z_issnap) {
+		zfs_snapentry_t *se;
+		uint64_t objsetid = dmu_objset_id(zsb->z_os);
+
+		mutex_enter(&zfs_snapshot_lock);
+		if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+			zfsctl_snapshot_unmount_cancel(se);
+			zfsctl_snapshot_remove(se);
+			zfsctl_snapshot_rele(se);
+		}
+		mutex_exit(&zfs_snapshot_lock);
+	} else if (zsb->z_ctldir) {
+		iput(zsb->z_ctldir);
+		zsb->z_ctldir = NULL;
+	}
 }
 
 /*
@@ -316,7 +586,6 @@ zfsctl_root(znode_t *zp)
 	return (ZTOZSB(zp)->z_ctldir);
 }
 
-/*ARGSUSED*/
 int
 zfsctl_fid(struct inode *ip, fid_t *fidp)
 {
@@ -349,31 +618,33 @@ zfsctl_fid(struct inode *ip, fid_t *fidp)
 	return (0);
 }
 
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
 static int
-zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
+zfsctl_snapshot_name(zfs_sb_t *zsb, const char *snap_name, int len,
+    char *full_name)
 {
-	objset_t *os = ITOZSB(ip)->z_os;
+	objset_t *os = zsb->z_os;
 
-	if (zfs_component_namecheck(name, NULL, NULL) != 0)
+	if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
 		return (SET_ERROR(EILSEQ));
 
-	dmu_objset_name(os, zname);
-	if ((strlen(zname) + 1 + strlen(name)) >= len)
+	dmu_objset_name(os, full_name);
+	if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
 		return (SET_ERROR(ENAMETOOLONG));
 
-	(void) strcat(zname, "@");
-	(void) strcat(zname, name);
+	(void) strcat(full_name, "@");
+	(void) strcat(full_name, snap_name);
 
 	return (0);
 }
 
 /*
- * Gets the full dataset name that corresponds to the given snapshot name
- * Example:
- * 	zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
  */
 static int
-zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
+zfsctl_snapshot_path(struct path *path, int len, char *full_path)
 {
 	char *path_buffer, *path_ptr;
 	int path_len, error = 0;
@@ -392,8 +663,8 @@ zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
 		goto out;
 	}
 
-	memcpy(zpath, path_ptr, path_len);
-	zpath[path_len] = '\0';
+	memcpy(full_path, path_ptr, path_len);
+	full_path[path_len] = '\0';
 out:
 	kmem_free(path_buffer, len);
 
@@ -403,7 +674,6 @@ out:
 /*
  * Special case the handling of "..".
  */
-/* ARGSUSED */
 int
 zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
@@ -438,7 +708,6 @@ zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
  * snapshot if it exist, creating the pseudo filesystem inode as necessary.
  * Perform a mount of the associated dataset on top of the inode.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
@@ -457,49 +726,24 @@ zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
 
 	*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
 	    &simple_dir_operations, &simple_dir_inode_operations);
-	if (*ipp) {
-#ifdef HAVE_AUTOMOUNT
-		(*ipp)->i_flags |= S_AUTOMOUNT;
-#endif /* HAVE_AUTOMOUNT */
-	} else {
+	if (*ipp == NULL)
 		error = SET_ERROR(ENOENT);
-	}
 
 	ZFS_EXIT(zsb);
 
 	return (error);
 }
 
-static void
-zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
-{
-	avl_index_t where;
-
-	ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock));
-	ASSERT(sep != NULL);
-
-	/*
-	 * Change the name in the AVL tree.
-	 */
-	avl_remove(&zsb->z_ctldir_snaps, sep);
-	(void) strcpy(sep->se_name, name);
-	VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL);
-	avl_insert(&zsb->z_ctldir_snaps, sep, where);
-}
-
 /*
  * Renaming a directory under '.zfs/snapshot' will automatically trigger
  * a rename of the snapshot to the new given name.  The rename is confined
  * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
  */
-/*ARGSUSED*/
 int
 zfsctl_snapdir_rename(struct inode *sdip, char *snm,
     struct inode *tdip, char *tnm, cred_t *cr, int flags)
 {
 	zfs_sb_t *zsb = ITOZSB(sdip);
-	zfs_snapentry_t search, *sep;
-	avl_index_t where;
 	char *to, *from, *real, *fsname;
 	int error;
 
@@ -522,9 +766,9 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm,
 
 	dmu_objset_name(zsb->z_os, fsname);
 
-	error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from);
+	error = zfsctl_snapshot_name(ITOZSB(sdip), snm, MAXNAMELEN, from);
 	if (error == 0)
-		error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to);
+		error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, MAXNAMELEN, to);
 	if (error == 0)
 		error = zfs_secpolicy_rename_perms(from, to, cr);
 	if (error != 0)
@@ -546,19 +790,13 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm,
 		goto out;
 	}
 
-	mutex_enter(&zsb->z_ctldir_lock);
+	mutex_enter(&zfs_snapshot_lock);
 
 	error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
-	if (error)
-		goto out_unlock;
-
-	search.se_name = (char *)snm;
-	sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
-	if (sep)
-		zfsctl_rename_snap(zsb, sep, tnm);
+	if (error == 0)
+		(void) zfsctl_snapshot_rename(snm, tnm);
 
-out_unlock:
-	mutex_exit(&zsb->z_ctldir_lock);
+	mutex_exit(&zfs_snapshot_lock);
 out:
 	kmem_free(from, MAXNAMELEN);
 	kmem_free(to, MAXNAMELEN);
@@ -574,7 +812,6 @@ out:
  * Removing a directory under '.zfs/snapshot' will automatically trigger
  * the removal of the snapshot with the given name.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 {
@@ -597,13 +834,13 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 		}
 	}
 
-	error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
+	error = zfsctl_snapshot_name(ITOZSB(dip), name, MAXNAMELEN, snapname);
 	if (error == 0)
 		error = zfs_secpolicy_destroy_perms(snapname, cr);
 	if (error != 0)
 		goto out;
 
-	error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE);
+	error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
 	if ((error == 0) || (error == ENOENT))
 		error = dsl_destroy_snapshot(snapname, B_FALSE);
 out:
@@ -619,7 +856,6 @@ out:
  * Creating a directory under '.zfs/snapshot' will automatically trigger
  * the creation of a new snapshot with the given name.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
 	struct inode **ipp, cred_t *cr, int flags)
@@ -656,36 +892,6 @@ out:
 }
 
 /*
- * When a .zfs/snapshot/<snapshot> inode is evicted they must be removed
- * from the snapshot list.  This will normally happen as part of the auto
- * unmount, however in the case of a manual snapshot unmount this will be
- * the only notification we receive.
- */
-void
-zfsctl_snapdir_inactive(struct inode *ip)
-{
-	zfs_sb_t *zsb = ITOZSB(ip);
-	zfs_snapentry_t *sep, *next;
-
-	mutex_enter(&zsb->z_ctldir_lock);
-
-	sep = avl_first(&zsb->z_ctldir_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
-
-		if (sep->se_inode == ip) {
-			avl_remove(&zsb->z_ctldir_snaps, sep);
-			taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
-			zfsctl_sep_free(sep);
-			break;
-		}
-		sep = next;
-	}
-
-	mutex_exit(&zsb->z_ctldir_lock);
-}
-
-/*
  * Attempt to unmount a snapshot by making a call to user space.
  * There is no assurance that this can or will succeed, is just a
  * best effort.  In the case where it does fail, perhaps because
@@ -697,18 +903,29 @@ zfsctl_snapdir_inactive(struct inode *ip)
 	"     2>/dev/null; " \
 	"umount -t zfs -n %s'%s'"
 
-static int
-__zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
+int
+zfsctl_snapshot_unmount(char *snapname, int flags)
 {
 	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
 	char *envp[] = { NULL };
+	zfs_snapentry_t *se;
 	int error;
 
+	mutex_enter(&zfs_snapshot_lock);
+	if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+		mutex_exit(&zfs_snapshot_lock);
+		return (ENOENT);
+	}
+	mutex_exit(&zfs_snapshot_lock);
+
 	argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
-	    flags & MNT_FORCE ? "-f " : "", sep->se_path);
+	    flags & MNT_FORCE ? "-f " : "", se->se_path);
+	dprintf("unmount; path=%s\n", se->se_path);
+	/* Drop the hold only after the final use of se->se_path above. */
+	zfsctl_snapshot_rele(se);
 	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	strfree(argv[2]);
 
 	/*
 	 * The umount system utility will return 256 on error.  We must
 	 * assume this error is because the file system is busy so it is
@@ -717,91 +934,10 @@ __zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
 	if (error)
 		error = SET_ERROR(EBUSY);
 
-	/*
-	 * This was the result of a manual unmount, cancel the delayed work
-	 * to prevent zfsctl_expire_snapshot() from attempting a unmount.
-	 */
-	if ((error == 0) && !(flags & MNT_EXPIRE))
-		taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
-
-
-	return (error);
-}
-
-int
-zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags)
-{
-	zfs_snapentry_t search;
-	zfs_snapentry_t *sep;
-	int error = 0;
-
-	mutex_enter(&zsb->z_ctldir_lock);
-
-	search.se_name = name;
-	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
-	if (sep) {
-		avl_remove(&zsb->z_ctldir_snaps, sep);
-		mutex_exit(&zsb->z_ctldir_lock);
-
-		error = __zfsctl_unmount_snapshot(sep, flags);
-
-		mutex_enter(&zsb->z_ctldir_lock);
-		if (error == EBUSY)
-			avl_add(&zsb->z_ctldir_snaps, sep);
-		else
-			zfsctl_sep_free(sep);
-	} else {
-		error = SET_ERROR(ENOENT);
-	}
-
-	mutex_exit(&zsb->z_ctldir_lock);
-	ASSERT3S(error, >=, 0);
-
 	return (error);
 }
 
-/*
- * Traverse all mounted snapshots and attempt to unmount them.  This
- * is best effort, on failure EEXIST is returned and count will be set
- * to the number of file snapshots which could not be unmounted.
- */
-int
-zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
-{
-	zfs_snapentry_t *sep, *next;
-	int error = 0;
-
-	*count = 0;
-
-	ASSERT(zsb->z_ctldir != NULL);
-	mutex_enter(&zsb->z_ctldir_lock);
-
-	sep = avl_first(&zsb->z_ctldir_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
-		avl_remove(&zsb->z_ctldir_snaps, sep);
-		mutex_exit(&zsb->z_ctldir_lock);
-
-		error = __zfsctl_unmount_snapshot(sep, flags);
-
-		mutex_enter(&zsb->z_ctldir_lock);
-		if (error == EBUSY) {
-			avl_add(&zsb->z_ctldir_snaps, sep);
-			(*count)++;
-		} else {
-			zfsctl_sep_free(sep);
-		}
-
-		sep = next;
-	}
-
-	mutex_exit(&zsb->z_ctldir_lock);
-
-	return ((*count > 0) ? EEXIST : 0);
-}
-
 #define	MOUNT_BUSY 0x80		/* Mount failed due to EBUSY (from mntent.h) */
-
 #define	SET_MOUNT_CMD \
 	"exec 0</dev/null " \
 	"     1>/dev/null " \
@@ -809,32 +945,46 @@ zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
 	"mount -t zfs -n '%s' '%s'"
 
 int
-zfsctl_mount_snapshot(struct path *path, int flags)
+zfsctl_snapshot_mount(struct path *path, int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *ip = dentry->d_inode;
-	zfs_sb_t *zsb = ITOZSB(ip);
+	zfs_sb_t *zsb;
+	zfs_sb_t *snap_zsb;
+	zfs_snapentry_t *se;
 	char *full_name, *full_path;
-	zfs_snapentry_t *sep;
-	zfs_snapentry_t search;
 	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
 	char *envp[] = { NULL };
 	int error;
 
+	if (ip == NULL)
+		return (EISDIR);
+
+	zsb = ITOZSB(ip);
 	ZFS_ENTER(zsb);
 
 	full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
-	full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);
+	full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 
-	error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
+	error = zfsctl_snapshot_name(zsb, dname(dentry),
+	    MAXNAMELEN, full_name);
 	if (error)
 		goto error;
 
-	error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
+	error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path);
 	if (error)
 		goto error;
 
 	/*
+	 * Multiple concurrent automounts of a snapshot are never allowed.
+	 * The snapshot may be manually mounted as many times as desired.
+	 */
+	if (zfsctl_snapshot_ismounted(full_name)) {
+		error = SET_ERROR(EISDIR);
+		goto error;
+	}
+
+	/*
 	 * Attempt to mount the snapshot from user space.  Normally this
 	 * would be done using the vfs_kern_mount() function, however that
 	 * function is marked GPL-only and cannot be used.  On error we
@@ -846,48 +996,37 @@ zfsctl_mount_snapshot(struct path *path, int flags)
 	 * Take note that if the program was executed successfully the return
 	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
 	 */
+	dprintf("mount; name=%s path=%s\n", full_name, full_path);
 	argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
 	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	strfree(argv[2]);
 	if (error && !(error & MOUNT_BUSY << 8)) {
-		printk("ZFS: Unable to automount %s at %s: %d\n",
-		    full_name, full_path, error);
+		cmn_err(CE_WARN, "Unable to automount %s/%s: %d",
+		    full_path, full_name, error);
 		error = SET_ERROR(EISDIR);
 		goto error;
 	}
 
-	error = 0;
-	mutex_enter(&zsb->z_ctldir_lock);
-
 	/*
-	 * Ensure a previous entry does not exist, if it does safely remove
-	 * it any cancel the outstanding expiration.  This can occur when a
-	 * snapshot is manually unmounted and then an automount is triggered.
+	 * Follow down into the mounted snapshot and set MNT_SHRINKABLE
+	 * to identify this as an automounted filesystem.
 	 */
-	search.se_name = full_name;
-	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
-	if (sep) {
-		avl_remove(&zsb->z_ctldir_snaps, sep);
-		taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
-		zfsctl_sep_free(sep);
-	}
-
-	sep = zfsctl_sep_alloc();
-	sep->se_name = full_name;
-	sep->se_path = full_path;
-	sep->se_inode = ip;
-	avl_add(&zsb->z_ctldir_snaps, sep);
-
-	sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
-	    zfsctl_expire_snapshot, sep, TQ_SLEEP,
-	    ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+	zpl_follow_down_one(path);
+	snap_zsb = ITOZSB(path->dentry->d_inode);
+	dentry = path->dentry;
+	path->mnt->mnt_flags |= MNT_SHRINKABLE;
+	zpl_follow_up(path);
+	error = 0;
 
-	mutex_exit(&zsb->z_ctldir_lock);
+	mutex_enter(&zfs_snapshot_lock);
+	se = zfsctl_snapshot_alloc(full_name, full_path,
+	    dmu_objset_id(snap_zsb->z_os), dentry);
+	zfsctl_snapshot_add(se);
+	zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+	mutex_exit(&zfs_snapshot_lock);
 error:
-	if (error) {
-		kmem_free(full_name, MAXNAMELEN);
-		kmem_free(full_path, PATH_MAX);
-	}
+	kmem_free(full_name, MAXNAMELEN);
+	kmem_free(full_path, MAXPATHLEN);
 
 	ZFS_EXIT(zsb);
 
@@ -895,82 +1034,35 @@ error:
 }
 
 /*
- * Check if this super block has a matching objset id.
- */
-static int
-zfsctl_test_super(struct super_block *sb, void *objsetidp)
-{
-	zfs_sb_t *zsb = sb->s_fs_info;
-	uint64_t objsetid = *(uint64_t *)objsetidp;
-
-	return (dmu_objset_id(zsb->z_os) == objsetid);
-}
-
-/*
- * Prevent a new super block from being allocated if an existing one
- * could not be located.  We only want to preform a lookup operation.
+ * Given the objset id of the snapshot return its zfs_sb_t as zsbp.
  */
-static int
-zfsctl_set_super(struct super_block *sb, void *objsetidp)
-{
-	return (-EEXIST);
-}
-
 int
 zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
 {
-	zfs_sb_t *zsb = sb->s_fs_info;
-	struct super_block *sbp;
-	zfs_snapentry_t *sep;
-	uint64_t id;
+	zfs_snapentry_t *se;
 	int error;
 
-	ASSERT(zsb->z_ctldir != NULL);
-
-	mutex_enter(&zsb->z_ctldir_lock);
-
 	/*
-	 * Verify that the snapshot is mounted.
+	 * Verify that the snapshot is mounted, then look up the mounted root
+	 * rather than the covered mount point.  This may fail if the
+	 * snapshot has just been unmounted by an unrelated user space
+	 * process.  This race cannot occur to an expired mount point
+	 * because we hold the zfs_snapshot_lock to prevent the race.
 	 */
-	sep = avl_first(&zsb->z_ctldir_snaps);
-	while (sep != NULL) {
-		error = dmu_snapshot_lookup(zsb->z_os, sep->se_name, &id);
-		if (error)
-			goto out;
-
-		if (id == objsetid)
-			break;
-
-		sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
-	}
-
-	if (sep != NULL) {
-		/*
-		 * Lookup the mounted root rather than the covered mount
-		 * point.  This may fail if the snapshot has just been
-		 * unmounted by an unrelated user space process.  This
-		 * race cannot occur to an expired mount point because
-		 * we hold the zsb->z_ctldir_lock to prevent the race.
-		 */
-		sbp = zpl_sget(&zpl_fs_type, zfsctl_test_super,
-		    zfsctl_set_super, 0, &id);
-		if (IS_ERR(sbp)) {
-			error = -PTR_ERR(sbp);
-		} else {
-			*zsbp = sbp->s_fs_info;
-			deactivate_super(sbp);
-		}
+	mutex_enter(&zfs_snapshot_lock);
+	if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
+		*zsbp = ITOZSB(se->se_root_dentry->d_inode);
+		ASSERT3U(dmu_objset_id((*zsbp)->z_os), ==, objsetid);
+		zfsctl_snapshot_rele(se);
+		error = SET_ERROR(0);
 	} else {
-		error = SET_ERROR(EINVAL);
+		error = SET_ERROR(ENOENT);
 	}
-out:
-	mutex_exit(&zsb->z_ctldir_lock);
-	ASSERT3S(error, >=, 0);
+	mutex_exit(&zfs_snapshot_lock);
 
 	return (error);
 }
 
-/* ARGSUSED */
 int
 zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
@@ -1009,6 +1101,14 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
 void
 zfsctl_init(void)
 {
+	avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+	    se_node_name));
+	avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+	    se_node_objsetid));
+	mutex_init(&zfs_snapshot_lock, NULL, MUTEX_DEFAULT, NULL);
+
 	zfs_expire_taskq = taskq_create("z_unmount", 1, defclsyspri,
 	    1, 8, TASKQ_PREPOPULATE);
 }
@@ -1021,6 +1121,10 @@ void
 zfsctl_fini(void)
 {
 	taskq_destroy(zfs_expire_taskq);
+
+	avl_destroy(&zfs_snapshots_by_name);
+	avl_destroy(&zfs_snapshots_by_objsetid);
+	mutex_destroy(&zfs_snapshot_lock);
 }
 
 module_param(zfs_expire_snapshot, int, 0644);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 51382e8b6..ba695ddbe 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3410,37 +3410,20 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
  * This function is best-effort.  Callers must deal gracefully if it
  * remains mounted (or is remounted after this call).
  *
- * XXX: This function should detect a failure to unmount a snapdir of a dataset
- * and return the appropriate error code when it is mounted. Its Illumos and
- * FreeBSD counterparts do this. We do not do this on Linux because there is no
- * clear way to access the mount information that FreeBSD and Illumos use to
- * distinguish between things with mounted snapshot directories, and things
- * without mounted snapshot directories, which include zvols. Returning a
- * failure for the latter causes `zfs destroy` to fail on zvol snapshots.
+ * Returns 0 if the argument is not a snapshot, or it is not currently a
+ * filesystem, or we were able to unmount it.  Returns error code otherwise.
  */
 int
 zfs_unmount_snap(const char *snapname)
 {
-	zfs_sb_t *zsb = NULL;
-	char *dsname;
-	char *fullname;
-	char *ptr;
+	int err;
 
-	if ((ptr = strchr(snapname, '@')) == NULL)
+	if (strchr(snapname, '@') == NULL)
 		return (0);
 
-	dsname = kmem_alloc(ptr - snapname + 1, KM_SLEEP);
-	strlcpy(dsname, snapname, ptr - snapname + 1);
-	fullname = strdup(snapname);
-
-	if (zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE) == 0) {
-		ASSERT(!dsl_pool_config_held(dmu_objset_pool(zsb->z_os)));
-		(void) zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE);
-		zfs_sb_rele(zsb, FTAG);
-	}
-
-	kmem_free(dsname, ptr - snapname + 1);
-	strfree(fullname);
+	err = zfsctl_snapshot_unmount((char *)snapname, MNT_FORCE);
+	if (err != 0 && err != ENOENT)
+		return (SET_ERROR(err));
 
 	return (0);
 }
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index a7005a2a1..f94073cbb 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -780,10 +780,6 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-	avl_create(&zsb->z_ctldir_snaps, snapentry_compare,
-	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-	mutex_init(&zsb->z_ctldir_lock, NULL, MUTEX_DEFAULT, NULL);
-
 	*zsbp = zsb;
 	return (0);
 
@@ -896,8 +892,6 @@ zfs_sb_free(zfs_sb_t *zsb)
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zsb->z_hold_mtx[i]);
 	vmem_free(zsb->z_hold_mtx, sizeof (kmutex_t) * ZFS_OBJ_MTX_SZ);
-	mutex_destroy(&zsb->z_ctldir_lock);
-	avl_destroy(&zsb->z_ctldir_snaps);
 	kmem_free(zsb, sizeof (zfs_sb_t));
 }
 EXPORT_SYMBOL(zfs_sb_free);
@@ -1373,6 +1367,7 @@ zfs_domount(struct super_block *sb, void *data, int silent)
 		acltype_changed_cb(zsb, pval);
 		zsb->z_issnap = B_TRUE;
 		zsb->z_os->os_sync = ZFS_SYNC_DISABLED;
+		zsb->z_snap_defer_time = jiffies;
 
 		mutex_enter(&zsb->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zsb->z_os, zsb);
@@ -1422,8 +1417,8 @@ zfs_preumount(struct super_block *sb)
 {
 	zfs_sb_t *zsb = sb->s_fs_info;
 
-	if (zsb != NULL && zsb->z_ctldir != NULL)
-		zfsctl_destroy(zsb);
+	if (zsb)
+		zfsctl_destroy(sb->s_fs_info);
 }
 EXPORT_SYMBOL(zfs_preumount);
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 5e5f3c8db..2292ff652 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -4097,11 +4097,6 @@ zfs_inactive(struct inode *ip)
 	zfs_sb_t *zsb = ITOZSB(ip);
 	int error;
 
-	if (zfsctl_is_node(ip)) {
-		zfsctl_inode_inactive(ip);
-		return;
-	}
-
 	rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl == NULL) {
 		rw_exit(&zsb->z_teardown_inactive_lock);
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index d72015c34..d39743de9 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -274,9 +274,6 @@ zfs_inode_destroy(struct inode *ip)
 	znode_t *zp = ITOZ(ip);
 	zfs_sb_t *zsb = ZTOZSB(zp);
 
-	if (zfsctl_is_node(ip))
-		zfsctl_inode_destroy(ip);
-
 	mutex_enter(&zsb->z_znodes_lock);
 	if (list_link_active(&zp->z_link_node)) {
 		list_remove(&zsb->z_all_znodes, zp);
diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c
index d93d900aa..dd02e9e99 100644
--- a/module/zfs/zpl_ctldir.c
+++ b/module/zfs/zpl_ctldir.c
@@ -160,19 +160,9 @@ const struct inode_operations zpl_ops_root = {
 static struct vfsmount *
 zpl_snapdir_automount(struct path *path)
 {
-	struct dentry *dentry = path->dentry;
 	int error;
 
-	/*
-	 * We must briefly disable automounts for this dentry because the
-	 * user space mount utility will trigger another lookup on this
-	 * directory.  That will result in zpl_snapdir_automount() being
-	 * called repeatedly.  The DCACHE_NEED_AUTOMOUNT flag can be
-	 * safely reset once the mount completes.
-	 */
-	dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
-	error = -zfsctl_mount_snapshot(path, 0);
-	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+	error = -zfsctl_snapshot_mount(path, 0);
 	if (error)
 		return (ERR_PTR(error));
 
@@ -188,8 +178,10 @@ zpl_snapdir_automount(struct path *path)
 #endif /* HAVE_AUTOMOUNT */
 
 /*
- * Revalidate any dentry in the snapshot directory on lookup, since a snapshot
- * having the same name have been created or destroyed since it was cached.
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted.  Normal dentries should be kept because
+ * as of the 3.18 kernel revalidating the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
  */
 static int
 #ifdef HAVE_D_REVALIDATE_NAMEIDATA
@@ -198,7 +190,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
 zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
 #endif
 {
-	return (0);
+	return (!!dentry->d_inode);
 }
 
 dentry_operations_t zpl_dops_snapdirs = {
@@ -245,6 +237,9 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
 	ASSERT(error == 0 || ip == NULL);
 	d_clear_d_op(dentry);
 	d_set_d_op(dentry, &zpl_dops_snapdirs);
+#ifdef HAVE_AUTOMOUNT
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+#endif
 
 	return (d_splice_alias(ip, dentry));
 }
@@ -373,7 +368,7 @@ zpl_snapdir_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
 	ZFS_ENTER(zsb);
 	error = simple_getattr(mnt, dentry, stat);
-	stat->nlink = stat->size = avl_numnodes(&zsb->z_ctldir_snaps) + 2;
+	stat->nlink = stat->size = 2;
 	stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zsb->z_os);
 	stat->atime = CURRENT_TIME;
 	ZFS_EXIT(zsb);
diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
index e81a3cd04..6475c72d7 100644
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@@ -24,6 +24,7 @@
  */
 
 
+#include <sys/zfs_ctldir.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_znode.h>
@@ -240,21 +241,9 @@ zpl_rmdir(struct inode * dir, struct dentry *dentry)
 static int
 zpl_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
-	boolean_t issnap = ITOZSB(dentry->d_inode)->z_issnap;
 	int error;
 	fstrans_cookie_t cookie;
 
-	/*
-	 * Ensure MNT_SHRINKABLE is set on snapshots to ensure they are
-	 * unmounted automatically with the parent file system.  This
-	 * is done on the first getattr because it's not easy to get the
-	 * vfsmount structure at mount time.  This call path is explicitly
-	 * marked unlikely to avoid any performance impact.  FWIW, ext4
-	 * resorts to a similar trick for sysadmin convenience.
-	 */
-	if (unlikely(issnap && !(mnt->mnt_flags & MNT_SHRINKABLE)))
-		mnt->mnt_flags |= MNT_SHRINKABLE;
-
 	cookie = spl_fstrans_mark();
 	error = -zfs_getattr_fast(dentry->d_inode, stat);
 	spl_fstrans_unmark(cookie);
@@ -504,6 +493,19 @@ zpl_revalidate(struct dentry *dentry, unsigned int flags)
 		return (-ECHILD);
 
 	/*
+	 * Automounted snapshots rely on periodic dentry revalidation
+	 * to defer snapshots from being automatically unmounted.
+	 */
+	if (zsb->z_issnap) {
+		if (time_after(jiffies, zsb->z_snap_defer_time +
+		    MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+			zsb->z_snap_defer_time = jiffies;
+			zfsctl_snapshot_unmount_delay(
+			    dmu_objset_id(zsb->z_os), zfs_expire_snapshot);
+		}
+	}
+
+	/*
 	 * After a rollback negative dentries created before the rollback
 	 * time must be invalidated.  Otherwise they can obscure files which
 	 * are only present in the rolled back dataset.
diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
index a8d26ec1c..ecdbc36d8 100644
--- a/module/zfs/zpl_super.c
+++ b/module/zfs/zpl_super.c
@@ -198,20 +198,6 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data)
 	return (error);
 }
 
-static void
-zpl_umount_begin(struct super_block *sb)
-{
-	zfs_sb_t *zsb = sb->s_fs_info;
-	int count;
-
-	/*
-	 * Best effort to unmount snapshots in .zfs/snapshot/.  Normally this
-	 * isn't required because snapshots have the MNT_SHRINKABLE flag set.
-	 */
-	if (zsb->z_ctldir)
-		(void) zfsctl_unmount_snapshots(zsb, MNT_FORCE, &count);
-}
-
 /*
  * ZFS specific features must be explicitly handled here, the VFS will
  * automatically handled the following generic functionality.
@@ -359,7 +345,6 @@ const struct super_operations zpl_super_operations = {
 	.sync_fs		= zpl_sync_fs,
 	.statfs			= zpl_statfs,
 	.remount_fs		= zpl_remount_fs,
-	.umount_begin		= zpl_umount_begin,
 	.show_options		= zpl_show_options,
 	.show_stats		= NULL,
 #ifdef HAVE_NR_CACHED_OBJECTS
author	Brian Behlendorf <[email protected]>	2015-04-24 16:21:13 -0700
committer	Brian Behlendorf <[email protected]>	2015-08-31 13:54:39 -0700
commit	278bee9319ba5947b995673d2c76e0333f2d33d4 (patch)
tree	a621142a2d5314b460cbf1a8bc6b6092f7a46424
parent	b23975cbe0f249671c131b0d6e4ae1bb10594440 (diff)