Add .zfs control directory

Add support for the .zfs control directory. This was accomplished by leveraging as much of the existing ZFS infrastructure as possible and updating it for Linux as required. The bulk of the core functionality is now all there with the following limitations. *) The .zfs/snapshot directory automount support requires a 2.6.37 or newer kernel. The exception is RHEL6.2 which has backported the d_automount patches. *) Creating/destroying/renaming snapshots with mkdir/rmdir/mv in the .zfs/snapshot directory works as expected. However, this functionality is only available to root until zfs delegations are finished. * mkdir - create a snapshot * rmdir - destroy a snapshot * mv - rename a snapshot The following issues are known deficiencies, but we expect them to be addressed by future commits. *) Add automount support for kernels older than 2.6.37. This should be possible using follow_link() which is what Linux did before. *) Accessing the .zfs/snapshot directory via NFS is not yet possible. The majority of the ground work for this is complete. However, finishing this work will require resolving some lingering integration issues with the Linux NFS kernel server. *) The .zfs/shares directory exists but no further smb functionality has yet been implemented. Contributions-by: Rohan Puri <[email protected]> Contributions-by: Andrew Barnes <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #173
author: Brian Behlendorf <[email protected]> 2011-11-11 12:45:53 +0530
committer: Brian Behlendorf <[email protected]> 2012-03-22 13:03:47 -0700
commit: ebe7e575eae1e03b1faa545a424f008faeac589d (patch)
tree: 8699359f0f50019b3c2f49b46f0ff06a874c34e4 /module/zfs
parent: 49be0ccf1fdc2ce852271d4d2f8b7a9c2c4be6db (diff)
13 files changed, 1675 insertions, 83 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index b303168c8..5ec75a03a 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -64,6 +64,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_debug.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_dir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fm.o
@@ -83,6 +84,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/zio_checksum.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zio_compress.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zio_inject.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zle.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_ctldir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_export.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_file.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_inode.o
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 0703a9466..1d0b4619f 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1584,6 +1584,41 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
 	return (0);
 }
 
+/*
+ * Determine the objset id for a given snapshot name.
+ */
+int
+dmu_snapshot_id(objset_t *os, const char *snapname, uint64_t *idp)
+{
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+	int error;
+
+	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+		return (ENOENT);
+
+	zap_cursor_init(&cursor, ds->ds_dir->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj);
+
+	error = zap_cursor_move_to_key(&cursor, snapname, MT_EXACT);
+	if (error) {
+		zap_cursor_fini(&cursor);
+		return (error);
+	}
+
+	error = zap_cursor_retrieve(&cursor, &attr);
+	if (error) {
+		zap_cursor_fini(&cursor);
+		return (error);
+	}
+
+	*idp = attr.za_first_integer;
+	zap_cursor_fini(&cursor);
+
+	return (0);
+}
+
 int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 718c3ad52..2deec8cf1 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2373,8 +2373,7 @@ dsl_snapshot_rename_one(const char *name, void *arg)
 		return (err == ENOENT ? 0 : err);
 	}
 
-/* XXX: Ignore for SPL version until mounting the FS is supported */
-#if defined(_KERNEL) && !defined(HAVE_SPL)
+#ifdef _KERNEL
 	/*
 	 * For all filesystems undergoing rename, we'll need to unmount it.
 	 */
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
new file mode 100644
index 000000000..01bf52fe9
--- /dev/null
+++ b/module/zfs/zfs_ctldir.c
@@ -0,0 +1,984 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <[email protected]>
+ *   Brian Behlendorf <[email protected]>
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future.  The elements are built dynamically, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab.  We have three
+ * types of objects:
+ *
+ *	ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding inode.
+ *
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure.  Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfs_sb_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfs_sb_t.  However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfs_sb_t to make NFS happy.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dsl_deleg.h>
+#include <sys/mount.h>
+#include <sys/zpl.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+
+static zfs_snapentry_t *
+zfsctl_sep_alloc(void)
+{
+	return kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+}
+
+void
+zfsctl_sep_free(zfs_snapentry_t *sep)
+{
+	kmem_free(sep->se_name, MAXNAMELEN);
+	kmem_free(sep->se_path, PATH_MAX);
+	kmem_free(sep, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Delayed-work handler which attempts to unmount an automounted snapshot.
+ * When zfsctl_unmount_snapshot() reports EBUSY the work item reschedules
+ * itself to retry in 'zfs_expire_snapshot' seconds; otherwise it is done.
+ */
+static void
+zfsctl_expire_snapshot(void *data)
+{
+	zfs_snapentry_t *sep;
+	zfs_sb_t *zsb;
+	int error;
+
+	sep = spl_get_work_data(data, zfs_snapentry_t, se_work.work);
+	zsb = ITOZSB(sep->se_inode);
+
+	error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE);
+	if (error == EBUSY)
+		schedule_delayed_work(&sep->se_work, zfs_expire_snapshot * HZ);
+}
+
+int
+snapentry_compare(const void *a, const void *b)
+{
+	const zfs_snapentry_t *sa = a;
+	const zfs_snapentry_t *sb = b;
+	int ret = strcmp(sa->se_name, sb->se_name);
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+boolean_t
+zfsctl_is_node(struct inode *ip)
+{
+	return (ITOZ(ip)->z_is_ctldir);
+}
+
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
+{
+	return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
+}
+
+/*
+ * Allocate a new inode with the passed id and ops.
+ */
+static struct inode *
+zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
+{
+	struct timespec now = current_fs_time(zsb->z_sb);
+	struct inode *ip;
+	znode_t *zp;
+
+	ip = new_inode(zsb->z_sb);
+	if (ip == NULL)
+		return (NULL);
+
+	zp = ITOZ(ip);
+	ASSERT3P(zp->z_dirlocks, ==, NULL);
+	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+	zp->z_id = id;
+	zp->z_unlinked = 0;
+	zp->z_atime_dirty = 0;
+	zp->z_zn_prefetch = 0;
+	zp->z_moved = 0;
+	zp->z_sa_hdl = NULL;
+	zp->z_blksz = 0;
+	zp->z_seq = 0;
+	zp->z_mapcnt = 0;
+	zp->z_gen = 0;
+	zp->z_size = 0;
+	zp->z_atime[0] = 0;
+	zp->z_atime[1] = 0;
+	zp->z_links = 0;
+	zp->z_pflags = 0;
+	zp->z_uid = 0;
+	zp->z_gid = 0;
+	zp->z_mode = 0;
+	zp->z_sync_cnt = 0;
+	zp->z_is_zvol = B_FALSE;
+	zp->z_is_mapped = B_FALSE;
+	zp->z_is_ctldir = B_TRUE;
+	zp->z_is_sa = B_FALSE;
+	ip->i_ino = id;
+	ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
+	ip->i_uid = 0;
+	ip->i_gid = 0;
+	ip->i_blkbits = SPA_MINBLOCKSHIFT;
+	ip->i_atime = now;
+	ip->i_mtime = now;
+	ip->i_ctime = now;
+	ip->i_fop = fops;
+	ip->i_op = ops;
+
+	if (insert_inode_locked(ip)) {
+		unlock_new_inode(ip);
+		iput(ip);
+		return (NULL);
+	}
+
+	mutex_enter(&zsb->z_znodes_lock);
+	list_insert_tail(&zsb->z_all_znodes, zp);
+	membar_producer();
+	mutex_exit(&zsb->z_znodes_lock);
+
+	unlock_new_inode(ip);
+
+	return (ip);
+}
+
+/*
+ * Lookup the inode with given id, it will be allocated if needed.
+ */
+static struct inode *
+zfsctl_inode_lookup(zfs_sb_t *zsb, unsigned long id,
+    const struct file_operations *fops, const struct inode_operations *ops)
+{
+	struct inode *ip = NULL;
+
+	while (ip == NULL) {
+		ip = ilookup(zsb->z_sb, id);
+		if (ip)
+			break;
+
+		/* May fail due to concurrent zfsctl_inode_alloc() */
+		ip = zfsctl_inode_alloc(zsb, id, fops, ops);
+	}
+
+	return (ip);
+}
+
+/*
+ * Free zfsctl inode specific structures, currently there are none.
+ */
+void
+zfsctl_inode_destroy(struct inode *ip)
+{
+	return;
+}
+
+/*
+ * An inode is being evicted from the cache.
+ */
+void
+zfsctl_inode_inactive(struct inode *ip)
+{
+	if (zfsctl_is_snapdir(ip))
+		zfsctl_snapdir_inactive(ip);
+}
+
+/*
+ * Create the '.zfs' directory.  This directory is cached as part of the VFS
+ * structure.  This results in a hold on the zfs_sb_t.  The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1.  This reference
+ * is removed when the ctldir is destroyed in the unmount.  All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ */
+int
+zfsctl_create(zfs_sb_t *zsb)
+{
+	ASSERT(zsb->z_ctldir == NULL);
+
+	zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT,
+	    &zpl_fops_root, &zpl_ops_root);
+	if (zsb->z_ctldir == NULL)
+		return (ENOENT);
+
+	return (0);
+}
+
+/*
+ * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
+ */
+void
+zfsctl_destroy(zfs_sb_t *zsb)
+{
+	iput(zsb->z_ctldir);
+	zsb->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * A reference is taken on the inode with igrab() before it is returned.
+ */
+struct inode *
+zfsctl_root(znode_t *zp)
+{
+	ASSERT(zfs_has_ctldir(zp));
+	igrab(ZTOZSB(zp)->z_ctldir);
+	return (ZTOZSB(zp)->z_ctldir);
+}
+
+/*ARGSUSED*/
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfs_sb_t	*zsb = ITOZSB(ip);
+	uint64_t	object = zp->z_id;
+	zfid_short_t	*zfid;
+	int		i;
+
+	ZFS_ENTER(zsb);
+
+	if (fidp->fid_len < SHORT_FID_LEN) {
+		fidp->fid_len = SHORT_FID_LEN;
+		ZFS_EXIT(zsb);
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs znodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	ZFS_EXIT(zsb);
+	return (0);
+}
+
+static int
+zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
+{
+	objset_t *os = ITOZSB(ip)->z_os;
+
+	if (snapshot_namecheck(name, NULL, NULL) != 0)
+		return (EILSEQ);
+
+	dmu_objset_name(os, zname);
+	if ((strlen(zname) + 1 + strlen(name)) >= len)
+		return (ENAMETOOLONG);
+
+	(void) strcat(zname, "@");
+	(void) strcat(zname, name);
+
+	return (0);
+}
+
+static int
+zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
+{
+	char *path_buffer, *path_ptr;
+	int path_len, error = 0;
+
+	path_buffer = kmem_alloc(len, KM_SLEEP);
+
+	path_ptr = d_path(path, path_buffer, len);
+	if (IS_ERR(path_ptr)) {
+		error = -PTR_ERR(path_ptr);
+		goto out;
+	}
+
+	path_len = path_buffer + len - 1 - path_ptr;
+	if (path_len > len) {
+		error = EFAULT;
+		goto out;
+	}
+
+	memcpy(zpath, path_ptr, path_len);
+	zpath[path_len] = '\0';
+out:
+	kmem_free(path_buffer, len);
+
+	return (error);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfs_sb_t *zsb = ITOZSB(dip);
+	int error = 0;
+
+	ZFS_ENTER(zsb);
+
+	if (strcmp(name, "..") == 0) {
+		*ipp = dip->i_sb->s_root->d_inode;
+	} else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+		*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR,
+		    &zpl_fops_snapdir, &zpl_ops_snapdir);
+	} else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+		*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES,
+		    &zpl_fops_shares, &zpl_ops_shares);
+	} else {
+		*ipp = NULL;
+	}
+
+	if (*ipp == NULL)
+		error = ENOENT;
+
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory.  Resolve the snapshot
+ * name to its objset id and find or create the matching pseudo inode.  When
+ * HAVE_AUTOMOUNT is defined the inode is flagged S_AUTOMOUNT.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfs_sb_t *zsb = ITOZSB(dip);
+	uint64_t id;
+	int error;
+
+	ZFS_ENTER(zsb);
+
+	error = dmu_snapshot_id(zsb->z_os, name, &id);
+	if (error) {
+		ZFS_EXIT(zsb);
+		return (error);
+	}
+
+	*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
+	    &simple_dir_operations, &simple_dir_inode_operations);
+	if (*ipp) {
+#ifdef HAVE_AUTOMOUNT
+		(*ipp)->i_flags |= S_AUTOMOUNT;
+#endif /* HAVE_AUTOMOUNT */
+	} else {
+		error = ENOENT;
+	}
+
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+static void
+zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
+{
+	avl_index_t where;
+
+	ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock));
+	ASSERT(sep != NULL);
+
+	/*
+	 * Change the name in the AVL tree.
+	 */
+	avl_remove(&zsb->z_ctldir_snaps, sep);
+	(void) strcpy(sep->se_name, name);
+	VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL);
+	avl_insert(&zsb->z_ctldir_snaps, sep, where);
+}
+
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name.  The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
+ */
+/*ARGSUSED*/
+int
+zfsctl_snapdir_rename(struct inode *sdip, char *sname,
+    struct inode *tdip, char *tname, cred_t *cr, int flags)
+{
+	zfs_sb_t *zsb = ITOZSB(sdip);
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	char *to, *from, *real;
+	int error;
+
+	ZFS_ENTER(zsb);
+
+	to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	from = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+
+	if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
+		error = dmu_snapshot_realname(zsb->z_os, sname, real,
+		    MAXNAMELEN, NULL);
+		if (error == 0) {
+			sname = real;
+		} else if (error != ENOTSUP) {
+			goto out;
+		}
+	}
+
+	error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from);
+	if (!error)
+		error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to);
+	if (!error)
+		error = zfs_secpolicy_rename_perms(from, to, cr);
+	if (error)
+		goto out;
+
+	/* Cannot move snapshots out of the snapdir. */
+	if (sdip != tdip) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* No-op when names are identical. */
+	if (strcmp(sname, tname) == 0) {
+		error = 0;
+		goto out;
+	}
+
+	mutex_enter(&zsb->z_ctldir_lock);
+
+	error = dmu_objset_rename(from, to, B_FALSE);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Mounted snapshots are tracked by their full "dataset@snapshot"
+	 * name (see zfsctl_mount_snapshot), so search and re-key with it.
+	 */
+	search.se_name = from;
+	sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
+	if (sep)
+		zfsctl_rename_snap(zsb, sep, to);
+
+out_unlock:
+	mutex_exit(&zsb->z_ctldir_lock);
+out:
+	kmem_free(from, MAXNAMELEN);
+	kmem_free(to, MAXNAMELEN);
+	kmem_free(real, MAXNAMELEN);
+
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
+{
+	zfs_sb_t *zsb = ITOZSB(dip);
+	char *snapname, *real;
+	int error;
+
+	ZFS_ENTER(zsb);
+
+	snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+
+	if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
+		error = dmu_snapshot_realname(zsb->z_os, name, real,
+		    MAXNAMELEN, NULL);
+		if (error == 0) {
+			name = real;
+		} else if (error != ENOTSUP) {
+			goto out;
+		}
+	}
+
+	error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
+	if (!error)
+		error = zfs_secpolicy_destroy_perms(snapname, cr);
+	if (error)
+		goto out;
+	/* Mounted snapshots are keyed by their full "dataset@snap" name. */
+	error = zfsctl_unmount_snapshot(zsb, snapname, MNT_FORCE);
+	if ((error == 0) || (error == ENOENT))
+		error = dmu_objset_destroy(snapname, B_FALSE);
+out:
+	kmem_free(snapname, MAXNAMELEN);
+	kmem_free(real, MAXNAMELEN);
+
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+/*
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+	struct inode **ipp, cred_t *cr, int flags)
+{
+	zfs_sb_t *zsb = ITOZSB(dip);
+	char *dsname;
+	int error;
+
+	dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+
+	if (snapshot_namecheck(dirname, NULL, NULL) != 0) {
+		error = EILSEQ;
+		goto out;
+	}
+
+	dmu_objset_name(zsb->z_os, dsname);
+
+	error = zfs_secpolicy_snapshot_perms(dsname, cr);
+	if (error)
+		goto out;
+
+	if (error == 0) {
+		error = dmu_objset_snapshot(dsname, dirname,
+		    NULL, NULL, B_FALSE, B_FALSE, -1);
+		if (error)
+			goto out;
+
+		error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+		    0, cr, NULL, NULL);
+	}
+out:
+	kmem_free(dsname, MAXNAMELEN);
+
+	return (error);
+}
+
+/*
+ * When a .zfs/snapshot/<snapshot> inode is evicted its entry must be removed
+ * from the snapshot list.  This will normally happen as part of the auto
+ * unmount, however in the case of a manual snapshot unmount this will be
+ * the only notification we receive.
+ */
+void
+zfsctl_snapdir_inactive(struct inode *ip)
+{
+	zfs_sb_t *zsb = ITOZSB(ip);
+	zfs_snapentry_t *sep, *next;
+
+	mutex_enter(&zsb->z_ctldir_lock);
+
+	sep = avl_first(&zsb->z_ctldir_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
+
+		if (sep->se_inode == ip) {
+			avl_remove(&zsb->z_ctldir_snaps, sep);
+			cancel_delayed_work_sync(&sep->se_work);
+			zfsctl_sep_free(sep);
+			break;
+		}
+		sep = next;
+	}
+
+	mutex_exit(&zsb->z_ctldir_lock);
+}
+
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed, is just a
+ * best effort.  In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+#define SET_UNMOUNT_CMD \
+	"exec 0</dev/null " \
+	"     1>/dev/null " \
+	"     2>/dev/null; " \
+	"umount -t zfs -n %s%s"
+
+static int
+__zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
+{
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+	int error;
+
+	argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
+	    flags & MNT_FORCE ? "-f " : "", sep->se_path);
+	error = call_usermodehelper(argv[0], argv, envp, 1);
+	strfree(argv[2]);
+
+	/*
+	 * The umount system utility will return 256 on error.  We must
+	 * assume this error is because the file system is busy so it is
+	 * converted to the more sensible EBUSY.
+	 */
+	if (error)
+		error = EBUSY;
+
+	/*
+	 * This was the result of a manual unmount, cancel the delayed work
+	 * to prevent zfsctl_expire_snapshot() from attempting a unmount.
+	 */
+	if ((error == 0) && !(flags & MNT_EXPIRE))
+		cancel_delayed_work(&sep->se_work);
+
+	return (error);
+}
+
+int
+zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags)
+{
+	zfs_snapentry_t search;
+	zfs_snapentry_t *sep;
+	int error = 0;
+
+	mutex_enter(&zsb->z_ctldir_lock);
+
+	search.se_name = name;
+	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
+	if (sep) {
+		avl_remove(&zsb->z_ctldir_snaps, sep);
+		error = __zfsctl_unmount_snapshot(sep, flags);
+		if (error == EBUSY)
+			avl_add(&zsb->z_ctldir_snaps, sep);
+		else
+			zfsctl_sep_free(sep);
+	} else {
+		error = ENOENT;
+	}
+
+	mutex_exit(&zsb->z_ctldir_lock);
+	ASSERT3S(error, >=, 0);
+
+	return (error);
+}
+
+/*
+ * Traverse all mounted snapshots and attempt to unmount them.  This
+ * is best effort, on failure EEXIST is returned and count will be set
+ * to the number of file snapshots which could not be unmounted.
+ */
+int
+zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
+{
+	zfs_snapentry_t *sep, *next;
+	int error = 0;
+
+	*count = 0;
+
+	ASSERT(zsb->z_ctldir != NULL);
+	mutex_enter(&zsb->z_ctldir_lock);
+
+	sep = avl_first(&zsb->z_ctldir_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
+		avl_remove(&zsb->z_ctldir_snaps, sep);
+		error = __zfsctl_unmount_snapshot(sep, flags);
+		if (error == EBUSY) {
+			avl_add(&zsb->z_ctldir_snaps, sep);
+			(*count)++;
+		} else {
+			zfsctl_sep_free(sep);
+		}
+
+		sep = next;
+	}
+
+	mutex_exit(&zsb->z_ctldir_lock);
+
+	return ((*count > 0) ? EEXIST : 0);
+}
+
+#define SET_MOUNT_CMD \
+	"exec 0</dev/null " \
+	"     1>/dev/null " \
+	"     2>/dev/null; " \
+	"mount -t zfs -n %s %s"
+
+int
+zfsctl_mount_snapshot(struct path *path, int flags)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *ip = dentry->d_inode;
+	zfs_sb_t *zsb = ITOZSB(ip);
+	char *full_name, *full_path;
+	zfs_snapentry_t *sep;
+	zfs_snapentry_t search;
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+	int error;
+
+	ZFS_ENTER(zsb);
+
+	full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
+	full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);
+
+	error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
+	if (error)
+		goto error;
+
+	error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
+	if (error)
+		goto error;
+
+	/*
+	 * Attempt to mount the snapshot from user space.  Normally this
+	 * would be done using the vfs_kern_mount() function, however that
+	 * function is marked GPL-only and cannot be used.  On error we are
+	 * careful to log the real error to the console and return EISDIR
+	 * to safely abort the automount.  This should be very rare.
+	 */
+	argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
+	error = call_usermodehelper(argv[0], argv, envp, 1);
+	strfree(argv[2]);
+	if (error) {
+		printk("ZFS: Unable to automount %s at %s: %d\n",
+		    full_name, full_path, error);
+		error = EISDIR;
+		goto error;
+	}
+
+	mutex_enter(&zsb->z_ctldir_lock);
+
+	/*
+	 * Ensure a previous entry does not exist, if it does safely remove
+	 * it and cancel the outstanding expiration.  This can occur when a
+	 * snapshot is manually unmounted and then an automount is triggered.
+	 */
+	search.se_name = full_name;
+	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
+	if (sep) {
+		avl_remove(&zsb->z_ctldir_snaps, sep);
+		cancel_delayed_work_sync(&sep->se_work);
+		zfsctl_sep_free(sep);
+	}
+
+	sep = zfsctl_sep_alloc();
+	sep->se_name = full_name;
+	sep->se_path = full_path;
+	sep->se_inode = ip;
+	avl_add(&zsb->z_ctldir_snaps, sep);
+
+        spl_init_delayed_work(&sep->se_work, zfsctl_expire_snapshot, sep);
+	schedule_delayed_work(&sep->se_work, zfs_expire_snapshot * HZ);
+
+	mutex_exit(&zsb->z_ctldir_lock);
+error:
+	if (error) {
+		kmem_free(full_name, MAXNAMELEN);
+		kmem_free(full_path, PATH_MAX);
+	}
+
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+/*
+ * Check if this super block has a matching objset id.
+ */
+static int
+zfsctl_test_super(struct super_block *sb, void *objsetidp)
+{
+	zfs_sb_t *zsb = sb->s_fs_info;
+	uint64_t objsetid = *(uint64_t *)objsetidp;
+
+	return (dmu_objset_id(zsb->z_os) == objsetid);
+}
+
+/*
+ * Prevent a new super block from being allocated if an existing one
+ * could not be located.  We only want to perform a lookup operation.
+ */
+static int
+zfsctl_set_super(struct super_block *sb, void *objsetidp)
+{
+	return (-EEXIST);
+}
+
+int
+zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
+{
+	zfs_sb_t *zsb = sb->s_fs_info;
+	struct super_block *sbp;
+	zfs_snapentry_t *sep;
+	uint64_t id;
+	int error;
+
+	ASSERT(zsb->z_ctldir != NULL);
+
+	mutex_enter(&zsb->z_ctldir_lock);
+
+	/*
+	 * Verify that the snapshot is mounted.
+	 */
+	sep = avl_first(&zsb->z_ctldir_snaps);
+	while (sep != NULL) {
+		error = dmu_snapshot_id(zsb->z_os, sep->se_name, &id);
+		if (error)
+			goto out;
+
+		if (id == objsetid)
+			break;
+
+		sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
+	}
+
+	if (sep != NULL) {
+		/*
+		 * Lookup the mounted root rather than the covered mount
+		 * point.  This may fail if the snapshot has just been
+		 * unmounted by an unrelated user space process.  This
+		 * race cannot occur to an expired mount point because
+		 * we hold the zsb->z_ctldir_lock to prevent the race.
+		 */
+		sbp = sget(&zpl_fs_type, zfsctl_test_super,
+		    zfsctl_set_super, &id);
+		if (IS_ERR(sbp)) {
+			error = -PTR_ERR(sbp);
+		} else {
+			*zsbp = sbp->s_fs_info;
+			deactivate_super(sbp);
+		}
+	} else {
+		error = EINVAL;
+	}
+out:
+	mutex_exit(&zsb->z_ctldir_lock);
+	ASSERT3S(error, >=, 0);
+
+	return (error);
+}
+
+/*
+ * Lookup 'name' in the shares directory object (zsb->z_shares_dir) and
+ * return the result through 'ipp'.  Errors are positive errno values.
+ */
+/* ARGSUSED */
+int
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfs_sb_t *zsb = ITOZSB(dip);
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zsb);
+	if (zsb->z_shares_dir == 0) {
+		ZFS_EXIT(zsb);
+		return (ENOTSUP);
+	}
+
+	error = zfs_zget(zsb, zsb->z_shares_dir, &dzp);
+	if (error) {
+		ZFS_EXIT(zsb);
+		return (error);
+	}
+
+	error = zfs_lookup(ZTOI(dzp), name, ipp, 0, cr, NULL, NULL);
+	iput(ZTOI(dzp));
+	ZFS_EXIT(zsb);
+	return (error);
+}
+
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories.  Currently this is unused but available.
+ */
+void
+zfsctl_init(void)
+{
+}
+
+/*
+ * Cleanup the various pieces we needed for .zfs directories.  Currently
+ * there is nothing to tear down, so this is an empty placeholder.
+ */
+void
+zfsctl_fini(void)
+{
+}
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index 8f1a0c2cc..6cd9c8508 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -50,6 +50,7 @@
 #include <sys/zap.h>
 #include <sys/dmu.h>
 #include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
@@ -415,28 +416,24 @@ zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags,
 
 		/*
 		 * If we are a snapshot mounted under .zfs, return
-		 * the vp for the snapshot directory.
+		 * the inode pointer for the snapshot directory.
 		 */
 		if ((error = sa_lookup(dzp->z_sa_hdl,
 		    SA_ZPL_PARENT(zsb), &parent, sizeof (parent))) != 0)
 			return (error);
-#ifdef HAVE_SNAPSHOT
+
 		if (parent == dzp->z_id && zsb->z_parent != zsb) {
 			error = zfsctl_root_lookup(zsb->z_parent->z_ctldir,
-			    "snapshot", ipp, NULL, 0, NULL, kcred,
-			    NULL, NULL, NULL);
+			    "snapshot", ipp, 0, kcred, NULL, NULL);
 			return (error);
 		}
-#endif /* HAVE_SNAPSHOT */
 		rw_enter(&dzp->z_parent_lock, RW_READER);
 		error = zfs_zget(zsb, parent, &zp);
 		if (error == 0)
 			*ipp = ZTOI(zp);
 		rw_exit(&dzp->z_parent_lock);
-#ifdef HAVE_SNAPSHOT
 	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
 		*ipp = zfsctl_root(dzp);
-#endif /* HAVE_SNAPSHOT */
 	} else {
 		int zf;
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 532f17aa1..d2ad1af71 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -58,6 +58,7 @@
 #include <sys/mount.h>
 #include <sys/sdt.h>
 #include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
@@ -2690,33 +2691,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc)
 	return (error);
 }
 
-#ifdef HAVE_SNAPSHOT
-/*
- * Search the vfs list for a specified resource.  Returns a pointer to it
- * or NULL if no suitable entry is found. The caller of this routine
- * is responsible for releasing the returned vfs pointer.
- */
-static vfs_t *
-zfs_get_vfs(const char *resource)
-{
-	struct vfs *vfsp;
-	struct vfs *vfs_found = NULL;
-
-	vfs_list_read_lock();
-	vfsp = rootvfs;
-	do {
-		if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
-			mntget(vfsp);
-			vfs_found = vfsp;
-			break;
-		}
-		vfsp = vfsp->vfs_next;
-	} while (vfsp != rootvfs);
-	vfs_list_unlock();
-	return (vfs_found);
-}
-#endif /* HAVE_SNAPSHOT */
-
 /* ARGSUSED */
 static void
 zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
@@ -3067,38 +3041,52 @@ out:
 	return (error);
 }
 
+/*
+ * Unmount the named snapshot, passing MNT_FORCE to
+ * zfsctl_unmount_snapshot().
+ *
+ * inputs:
+ * name		dataset name, or when 'arg == NULL' the full snapshot name
+ * arg		short snapshot name (i.e. part after the '@')
+ *
+ * Returns 0 when 'arg == NULL' and 'name' contains no '@', and also when
+ * zfsctl_unmount_snapshot() reports ENOENT.  Any other error returned by
+ * zfs_sb_hold() or zfsctl_unmount_snapshot() is passed back to the caller.
+ */
 int
 zfs_unmount_snap(const char *name, void *arg)
 {
-#ifdef HAVE_SNAPSHOT
-	vfs_t *vfsp = NULL;
+	zfs_sb_t *zsb = NULL;
+	char *dsname;
+	char *snapname;
+	char *fullname;
+	char *ptr;
+	int error;
 
 	if (arg) {
-		char *snapname = arg;
-		char *fullname = kmem_asprintf("%s@%s", name, snapname);
-		vfsp = zfs_get_vfs(fullname);
-		strfree(fullname);
-	} else if (strchr(name, '@')) {
-		vfsp = zfs_get_vfs(name);
+		dsname = strdup(name);
+		snapname = strdup(arg);
+	} else {
+		ptr = strchr(name, '@');
+		if (ptr) {
+			dsname = strdup(name);
+			dsname[ptr - name] = '\0';
+			snapname = strdup(ptr + 1);
+		} else {
+			return (0);
+		}
 	}
 
-	if (vfsp) {
-		/*
-		 * Always force the unmount for snapshots.
-		 */
-		int flag = MS_FORCE;
-		int err;
+	fullname = kmem_asprintf("%s@%s", dsname, snapname);
 
-		if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
-			mntput(vfsp);
-			return (err);
-		}
-		mntput(vfsp);
-		if ((err = dounmount(vfsp, flag, kcred)) != 0)
-			return (err);
+	error = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE);
+	if (error == 0) {
+		error = zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE);
+		zfs_sb_rele(zsb, FTAG);
+
+		/* Allow ENOENT for consistency with upstream */
+		if (error == ENOENT)
+			error = 0;
 	}
-#endif /* HAVE_SNAPSHOT */
-	return (0);
+
+	strfree(dsname);
+	strfree(snapname);
+	strfree(fullname);
+
+	return (error);
 }
 
 /*
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index fb319a547..8f1c713c0 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -56,6 +56,7 @@
 #include <sys/modctl.h>
 #include <sys/refstr.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/bootconf.h>
 #include <sys/sunddi.h>
@@ -710,6 +711,10 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
+	avl_create(&zsb->z_ctldir_snaps, snapentry_compare,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+	mutex_init(&zsb->z_ctldir_lock, NULL, MUTEX_DEFAULT, NULL);
+
 	*zsbp = zsb;
 	return (0);
 
@@ -819,6 +824,8 @@ zfs_sb_free(zfs_sb_t *zsb)
 	rw_destroy(&zsb->z_fuid_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zsb->z_hold_mtx[i]);
+	mutex_destroy(&zsb->z_ctldir_lock);
+	avl_destroy(&zsb->z_ctldir_snaps);
 	kmem_free(zsb, sizeof (zfs_sb_t));
 }
 EXPORT_SYMBOL(zfs_sb_free);
@@ -1183,9 +1190,6 @@ zfs_domount(struct super_block *sb, void *data, int silent)
 		mutex_exit(&zsb->z_os->os_user_ptr_lock);
 	} else {
 		error = zfs_sb_setup(zsb, B_TRUE);
-#ifdef HAVE_SNAPSHOT
-		(void) zfs_snap_create(zsb);
-#endif /* HAVE_SNAPSHOT */
 	}
 
 	/* Allocate a root inode for the filesystem. */
@@ -1202,6 +1206,9 @@ zfs_domount(struct super_block *sb, void *data, int silent)
 		error = ENOMEM;
 		goto out;
 	}
+
+	if (!zsb->z_issnap)
+		zfsctl_create(zsb);
 out:
 	if (error) {
 		dmu_objset_disown(zsb->z_os, zsb);
@@ -1212,6 +1219,27 @@ out:
 }
 EXPORT_SYMBOL(zfs_domount);
 
+/*
+ * Called when an unmount is requested and certain sanity checks have
+ * already passed.  At this point no dentries or inodes have been reclaimed
+ * from their respective caches.  We drop the extra reference on the .zfs
+ * control directory to allow everything to be reclaimed.  All snapshots
+ * must already have been unmounted to reach this point.
+ *
+ * zpl_kill_sb() calls this unconditionally, including for a super block
+ * whose mount never completed.  In that case sb->s_fs_info may still be
+ * NULL and there is nothing to tear down, so it must be checked before
+ * it is dereferenced.
+ */
+void
+zfs_preumount(struct super_block *sb)
+{
+	zfs_sb_t *zsb = sb->s_fs_info;
+
+	if (zsb != NULL && zsb->z_ctldir != NULL)
+		zfsctl_destroy(zsb);
+}
+EXPORT_SYMBOL(zfs_preumount);
+
+/*
+ * Called once all other unmount related tear down has occurred.
+ * It is our responsibility to release any remaining infrastructure.
+ */
 /*ARGSUSED*/
 int
 zfs_umount(struct super_block *sb)
@@ -1288,11 +1316,10 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
 
 		ZFS_EXIT(zsb);
 
-#ifdef HAVE_SNAPSHOT
-		err = zfsctl_lookup_objset(vfsp, objsetid, &zsb);
+		err = zfsctl_lookup_objset(sb, objsetid, &zsb);
 		if (err)
 			return (EINVAL);
-#endif /* HAVE_SNAPSHOT */
+
 		ZFS_ENTER(zsb);
 	}
 
@@ -1309,22 +1336,20 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
 		return (EINVAL);
 	}
 
-#ifdef HAVE_SNAPSHOT
 	/* A zero fid_gen means we are in the .zfs control directories */
 	if (fid_gen == 0 &&
 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
 		*ipp = zsb->z_ctldir;
 		ASSERT(*ipp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
-			VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, NULL,
-			    0, NULL, NULL, NULL, NULL, NULL) == 0);
+			VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+			    0, kcred, NULL, NULL) == 0);
 		} else {
 			igrab(*ipp);
 		}
 		ZFS_EXIT(zsb);
 		return (0);
 	}
-#endif /* HAVE_SNAPSHOT */
 
 	gen_mask = -1ULL >> (64 - 8 * i);
 
@@ -1550,6 +1575,7 @@ EXPORT_SYMBOL(zfs_get_zplprop);
 void
 zfs_init(void)
 {
+	zfsctl_init();
 	zfs_znode_init();
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 	register_filesystem(&zpl_fs_type);
@@ -1561,4 +1587,5 @@ zfs_fini(void)
 {
 	unregister_filesystem(&zpl_fs_type);
 	zfs_znode_fini();
+	zfsctl_fini();
 }
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 74b96b8d7..2da5fec86 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -63,6 +63,7 @@
 #include <sys/sid.h>
 #include <sys/mode.h>
 #include "fs/fs_subr.h"
+#include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_vnops.h>
@@ -2045,7 +2046,7 @@ zfs_readdir(struct inode *ip, void *dirent, filldir_t filldir,
 			dmu_prefetch(os, objnum, 0, 0);
 		}
 
-		if (*pos >= 2) {
+		if (*pos > 2 || (*pos == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			*pos = zap_cursor_serialize(&zc);
 		} else {
@@ -3876,9 +3877,10 @@ zfs_inactive(struct inode *ip)
 	zfs_sb_t *zsb = ITOZSB(ip);
 	int error;
 
-#ifdef HAVE_SNAPSHOT
-	/* Early return for snapshot inode? */
-#endif /* HAVE_SNAPSHOT */
+	if (zfsctl_is_node(ip)) {
+		zfsctl_inode_inactive(ip);
+		return;
+	}
 
 	rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl == NULL) {
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 709ae74f8..3a6872f3e 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -52,6 +52,7 @@
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/kidmap.h>
@@ -267,6 +268,9 @@ zfs_inode_destroy(struct inode *ip)
 	znode_t *zp = ITOZ(ip);
 	zfs_sb_t *zsb = ZTOZSB(zp);
 
+	if (zfsctl_is_node(ip))
+		zfsctl_inode_destroy(ip);
+
 	mutex_enter(&zsb->z_znodes_lock);
 	list_remove(&zsb->z_all_znodes, zp);
 	zsb->z_nr_znodes--;
@@ -353,6 +357,8 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
 
 	zp = ITOZ(ip);
 	ASSERT(zp->z_dirlocks == NULL);
+	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT3P(zp->z_xattr_cached, ==, NULL);
 	zp->z_moved = 0;
 	zp->z_sa_hdl = NULL;
 	zp->z_unlinked = 0;
@@ -362,7 +368,9 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_is_zvol = 0;
+	zp->z_is_zvol = B_FALSE;
+	zp->z_is_mapped = B_FALSE;
+	zp->z_is_ctldir = B_FALSE;
 
 	zfs_znode_sa_init(zsb, zp, db, obj_type, hdl);
 
@@ -434,6 +442,10 @@ zfs_inode_update(znode_t *zp)
 	zsb = ZTOZSB(zp);
 	ip = ZTOI(zp);
 
+	/* Skip .zfs control nodes which do not exist on disk. */
+	if (zfsctl_is_node(ip))
+		return;
+
 	sa_lookup(zp->z_sa_hdl, SA_ZPL_ATIME(zsb), &atime, 16);
 	sa_lookup(zp->z_sa_hdl, SA_ZPL_MTIME(zsb), &mtime, 16);
 	sa_lookup(zp->z_sa_hdl, SA_ZPL_CTIME(zsb), &ctime, 16);
diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c
new file mode 100644
index 000000000..6c742c9e8
--- /dev/null
+++ b/module/zfs/zpl_ctldir.c
@@ -0,0 +1,519 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <[email protected]>
+ *   Brian Behlendorf <[email protected]>
+ */
+
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+/*
+ * Shared open handler for the .zfs control directories.  Any attempt to
+ * open for writing is refused with -EACCES; everything else is passed
+ * through to generic_file_open().
+ */
+/* ARGSUSED */
+static int
+zpl_common_open(struct inode *ip, struct file *filp)
+{
+	if (!(filp->f_mode & FMODE_WRITE))
+		return generic_file_open(ip, filp);
+
+	return (-EACCES);
+}
+
+/*
+ * Minimal readdir which emits only the "." and ".." entries, advancing
+ * filp->f_pos past each entry that filldir() accepted.  Positions of 2
+ * or more produce no entries.  Returns 0, or the first non-zero value
+ * returned by filldir().
+ */
+static int
+zpl_common_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *ip = dentry->d_inode;
+	int error = 0;
+
+	switch (filp->f_pos) {
+	case 0:
+		error = filldir(dirent, ".", 1, 0, ip->i_ino, DT_DIR);
+		if (error)
+			break;
+
+		filp->f_pos++;
+		/* fall-thru */
+	case 1:
+		error = filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR);
+		if (error)
+			break;
+
+		filp->f_pos++;
+		/* fall-thru */
+	default:
+		break;
+	}
+
+	return (error);
+}
+
+/*
+ * Get root directory contents.
+ *
+ * The '.zfs' directory has a fixed layout indexed by filp->f_pos:
+ * 0 is ".", 1 is "..", 2 is ZFS_SNAPDIR_NAME and 3 is ZFS_SHAREDIR_NAME.
+ * Each case falls through to the next so a single call emits every entry
+ * from the current position onward, stopping early when filldir() returns
+ * non-zero.  Positions past 3 emit nothing.
+ */
+static int
+zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *ip = dentry->d_inode;
+	zfs_sb_t *zsb = ITOZSB(ip);
+	int error = 0;
+
+	ZFS_ENTER(zsb);
+
+	switch (filp->f_pos) {
+	case 0:
+		error = filldir(dirent, ".", 1, 0, ip->i_ino, DT_DIR);
+		if (error)
+			goto out;
+
+		filp->f_pos++;
+		/* fall-thru */
+	case 1:
+		error = filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR);
+		if (error)
+			goto out;
+
+		filp->f_pos++;
+		/* fall-thru */
+	case 2:
+		error = filldir(dirent, ZFS_SNAPDIR_NAME,
+		    strlen(ZFS_SNAPDIR_NAME), 2, ZFSCTL_INO_SNAPDIR, DT_DIR);
+		if (error)
+			goto out;
+
+		filp->f_pos++;
+		/* fall-thru */
+	case 3:
+		error = filldir(dirent, ZFS_SHAREDIR_NAME,
+		    strlen(ZFS_SHAREDIR_NAME), 3, ZFSCTL_INO_SHARES, DT_DIR);
+		if (error)
+			goto out;
+
+		filp->f_pos++;
+		/* fall-thru */
+	}
+out:
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+/*
+ * Report attributes for the '.zfs' directory: whatever simple_getattr()
+ * fills in, with the access time then overridden by the current time.
+ * The simple_getattr() result is returned unchanged.
+ */
+/* ARGSUSED */
+static int
+zpl_root_getattr(struct vfsmount *mnt, struct dentry *dentry,
+    struct kstat *stat)
+{
+	int rc = simple_getattr(mnt, dentry, stat);
+
+	stat->atime = CURRENT_TIME;
+	return (rc);
+}
+
+/*
+ * Look up an entry of the '.zfs' directory by name using the caller's
+ * credentials.  ENOENT results in d_splice_alias(NULL, dentry), any other
+ * failure is returned as an ERR_PTR(), and on success the inode found is
+ * spliced onto the dentry.
+ */
+static struct dentry *
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd)
+{
+	cred_t *cred;
+	struct inode *ip;
+	int error;
+
+	cred = CRED();
+	crhold(cred);
+	error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cred,
+	    NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	crfree(cred);
+
+	if (error == -ENOENT)
+		return d_splice_alias(NULL, dentry);
+
+	if (error != 0)
+		return ERR_PTR(error);
+
+	return d_splice_alias(ip, dentry);
+}
+
+/*
+ * The '.zfs' control directory file and inode operations.  Opens go
+ * through zpl_common_open(), which refuses write access, and the
+ * directory listing is produced by zpl_root_readdir().
+ */
+const struct file_operations zpl_fops_root = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= zpl_root_readdir,
+};
+
+const struct inode_operations zpl_ops_root = {
+	.lookup		= zpl_root_lookup,
+	.getattr	= zpl_root_getattr,
+};
+
+/*
+ * Look up a snapshot name in the '.zfs/snapshot' directory.  ENOENT is
+ * turned into d_splice_alias(NULL, dentry) and other errors are returned
+ * as an ERR_PTR().  When HAVE_AUTOMOUNT is defined the dentry is given the
+ * zpl_dops_snapdirs operations before the inode is spliced onto it.
+ */
+static struct dentry *
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+    struct nameidata *nd)
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	int error;
+
+	crhold(cr);
+	error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip,
+	    0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	if (error) {
+		if (error == -ENOENT)
+			return d_splice_alias(NULL, dentry);
+		else
+			return ERR_PTR(error);
+	}
+
+	/*
+	 * Auto mounting of snapshots is only supported for 2.6.37 and
+	 * newer kernels.  Prior to this kernel the ops->follow_link()
+	 * callback was used as a hack to trigger the mount.  The
+	 * resulting vfsmount was then explicitly grafted in to the
+	 * name space.  While it might be possible to add compatibility
+	 * code to accomplish this it would require considerable care.
+	 */
+#ifdef HAVE_AUTOMOUNT
+	dentry->d_op = &zpl_dops_snapdirs;
+#endif /* HAVE_AUTOMOUNT */
+
+	return d_splice_alias(ip, dentry);
+}
+
+/*
+ * List the '.zfs/snapshot' directory: "." and ".." followed by one DT_DIR
+ * entry per name returned by dmu_snapshot_list_next().  After each snapshot
+ * entry is accepted, filp->f_pos is set to the cookie updated by that call.
+ * A final error of -ENOENT (presumably the end-of-list indication from
+ * dmu_snapshot_list_next() -- confirm) is reported as success.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *dip = dentry->d_inode;
+	zfs_sb_t *zsb = ITOZSB(dip);
+	char snapname[MAXNAMELEN];
+	uint64_t id, cookie;
+	boolean_t case_conflict;
+	int error = 0;
+
+	ZFS_ENTER(zsb);
+
+	cookie = filp->f_pos;
+	switch (filp->f_pos) {
+	case 0:
+		error = filldir(dirent, ".", 1, 0, dip->i_ino, DT_DIR);
+		if (error)
+			goto out;
+
+		filp->f_pos++;
+		/* fall-thru */
+	case 1:
+		error = filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR);
+		if (error)
+			goto out;
+
+		filp->f_pos++;
+		/* fall-thru */
+	default:
+		while (error == 0) {
+			error = -dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN,
+			    snapname, &id, &cookie, &case_conflict);
+			if (error)
+				goto out;
+
+			error = filldir(dirent, snapname, strlen(snapname),
+			    filp->f_pos, ZFSCTL_INO_SHARES - id, DT_DIR);
+			if (error)
+				goto out;
+
+			filp->f_pos = cookie;
+		}
+	}
+out:
+	ZFS_EXIT(zsb);
+
+	if (error == -ENOENT)
+		return (0);
+
+	return (error);
+}
+
+/*
+ * Handle a rename inside the '.zfs/snapshot' directory by handing the
+ * source and target names, together with the caller's credentials, to
+ * zfsctl_snapdir_rename().  The result is returned as zero or a negative
+ * errno.
+ */
+int
+zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry)
+{
+	cred_t *cred;
+	int error;
+
+	cred = CRED();
+	crhold(cred);
+	error = -zfsctl_snapdir_rename(sdip, dname(sdentry), tdip,
+	    dname(tdentry), cred, 0);
+	ASSERT3S(error, <=, 0);
+	crfree(cred);
+
+	return (error);
+}
+
+/*
+ * Handle rmdir inside the '.zfs/snapshot' directory by passing the entry
+ * name and the caller's credentials to zfsctl_snapdir_remove().  The
+ * result is returned as zero or a negative errno.
+ */
+static int
+zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
+{
+	cred_t *cred;
+	int error;
+
+	cred = CRED();
+	crhold(cred);
+	error = -zfsctl_snapdir_remove(dip, dname(dentry), cred, 0);
+	ASSERT3S(error, <=, 0);
+	crfree(cred);
+
+	return (error);
+}
+
+/*
+ * Handle mkdir inside the '.zfs/snapshot' directory by calling
+ * zfsctl_snapdir_mkdir() with the new entry's name.  On success the
+ * returned inode is instantiated on the dentry and, with HAVE_AUTOMOUNT,
+ * the dentry is first given the zpl_dops_snapdirs operations.  The
+ * temporary vattr_t is freed on both the success and failure paths.
+ */
+static int
+zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, int mode)
+{
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	struct inode *ip;
+	int error;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dip, dentry, mode | S_IFDIR, cr);
+
+	error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
+	if (error == 0) {
+#ifdef HAVE_AUTOMOUNT
+		dentry->d_op = &zpl_dops_snapdirs;
+#endif /* HAVE_AUTOMOUNT */
+		d_instantiate(dentry, ip);
+	}
+
+	kmem_free(vap, sizeof(vattr_t));
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	return (error);
+}
+
+#ifdef HAVE_AUTOMOUNT
+/*
+ * The d_automount callback for entries of '.zfs/snapshot'.  Calls
+ * zfsctl_mount_snapshot() for the path and returns an ERR_PTR() on
+ * failure or NULL on success; the new vfsmount itself is never returned.
+ */
+static struct vfsmount *
+zpl_snapdir_automount(struct path *path)
+{
+	struct dentry *dentry = path->dentry;
+	int error;
+
+	/*
+	 * We must briefly disable automounts for this dentry because the
+	 * user space mount utility will trigger another lookup on this
+	 * directory.  That will result in zpl_snapdir_automount() being
+	 * called repeatedly.  The DCACHE_NEED_AUTOMOUNT flag can be
+	 * safely reset once the mount completes.
+	 */
+	dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+	error = -zfsctl_mount_snapshot(path, 0);
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+	if (error)
+		return ERR_PTR(error);
+
+	/*
+	 * Rather than returning the new vfsmount for the snapshot we must
+	 * return NULL to indicate a mount collision.  This is done because
+	 * the user space mount calls do_add_mount() which adds the vfsmount
+	 * to the name space.  If we returned the new mount here it would be
+	 * added again to the vfsmount list resulting in list corruption.
+	 */
+	return (NULL);
+}
+#endif /* HAVE_AUTOMOUNT */
+
+/*
+ * Get snapshot directory attributes.
+ *
+ * Starts from simple_getattr() and then overrides the link count and size
+ * with the number of nodes in zsb->z_ctldir_snaps plus two, the change and
+ * modify times with dmu_objset_snap_cmtime(), and the access time with the
+ * current time.  The simple_getattr() result is returned unchanged.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+    struct kstat *stat)
+{
+	zfs_sb_t *zsb = ITOZSB(dentry->d_inode);
+	int error;
+
+	ZFS_ENTER(zsb);
+	error = simple_getattr(mnt, dentry, stat);
+	stat->nlink = stat->size = avl_numnodes(&zsb->z_ctldir_snaps) + 2;
+	stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zsb->z_os);
+	stat->atime = CURRENT_TIME;
+	ZFS_EXIT(zsb);
+
+	return (error);
+}
+
+/*
+ * The '.zfs/snapshot' directory file operations.  These mainly control
+ * generating the list of available snapshots when doing an 'ls' in the
+ * directory.  See zpl_snapdir_readdir().  Opens go through
+ * zpl_common_open(), which refuses write access.
+ */
+const struct file_operations zpl_fops_snapdir = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= zpl_snapdir_readdir,
+};
+
+/*
+ * The '.zfs/snapshot' directory inode operations.  These mainly control
+ * creating an inode for a snapshot directory and initializing the needed
+ * infrastructure to automount the snapshot.  See zpl_snapdir_lookup().
+ * The rename, rmdir and mkdir callbacks forward to zfsctl_snapdir_rename(),
+ * zfsctl_snapdir_remove() and zfsctl_snapdir_mkdir() respectively.
+ */
+const struct inode_operations zpl_ops_snapdir = {
+	.lookup		= zpl_snapdir_lookup,
+	.getattr	= zpl_snapdir_getattr,
+	.rename		= zpl_snapdir_rename,
+	.rmdir		= zpl_snapdir_rmdir,
+	.mkdir		= zpl_snapdir_mkdir,
+};
+
+#ifdef HAVE_AUTOMOUNT
+/*
+ * Dentry operations installed on individual '.zfs/snapshot' entries by
+ * zpl_snapdir_lookup() and zpl_snapdir_mkdir().
+ */
+const struct dentry_operations zpl_dops_snapdirs = {
+	.d_automount	= zpl_snapdir_automount,
+};
+#endif /* HAVE_AUTOMOUNT */
+
+/*
+ * Look up an entry of the '.zfs/shares' directory by name using the
+ * caller's credentials.  ENOENT results in d_splice_alias(NULL, dentry),
+ * any other failure is returned as an ERR_PTR(), and on success the inode
+ * found is spliced onto the dentry.
+ */
+static struct dentry *
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+    struct nameidata *nd)
+{
+	struct inode *ip = NULL;
+	cred_t *cred;
+	int error;
+
+	cred = CRED();
+	crhold(cred);
+	error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, 0, cred,
+	    NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	crfree(cred);
+
+	if (error == -ENOENT)
+		return d_splice_alias(NULL, dentry);
+
+	if (error != 0)
+		return ERR_PTR(error);
+
+	return d_splice_alias(ip, dentry);
+}
+
+/*
+ * List the '.zfs/shares' directory.  When zsb->z_shares_dir is 0 only "."
+ * and ".." are emitted via zpl_common_readdir(); otherwise the listing is
+ * produced by zfs_readdir() on the z_shares_dir znode, whose reference is
+ * dropped again before returning.
+ */
+/* ARGSUSED */
+static int
+zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	cred_t *cr = CRED();
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *ip = dentry->d_inode;
+	zfs_sb_t *zsb = ITOZSB(ip);
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zsb);
+
+	if (zsb->z_shares_dir == 0) {
+		error = zpl_common_readdir(filp, dirent, filldir);
+		ZFS_EXIT(zsb);
+		return (error);
+	}
+
+	error = -zfs_zget(zsb, zsb->z_shares_dir, &dzp);
+	if (error) {
+		ZFS_EXIT(zsb);
+		return (error);
+	}
+
+	crhold(cr);
+	error = -zfs_readdir(ZTOI(dzp), dirent, filldir, &filp->f_pos, cr);
+	crfree(cr);
+
+	iput(ZTOI(dzp));
+	ZFS_EXIT(zsb);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Get '.zfs/shares' directory attributes.
+ *
+ * When zsb->z_shares_dir is 0 the attributes are synthesized from
+ * simple_getattr() with a link count and size of two and the current
+ * time as access time.  Otherwise they are read from the z_shares_dir
+ * znode, matching what zpl_shares_readdir() lists.  Returns zero or a
+ * negative errno.
+ */
+/* ARGSUSED */
+static int
+zpl_shares_getattr(struct vfsmount *mnt, struct dentry *dentry,
+    struct kstat *stat)
+{
+	struct inode *ip = dentry->d_inode;
+	zfs_sb_t *zsb = ITOZSB(ip);
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zsb);
+
+	if (zsb->z_shares_dir == 0) {
+		error = simple_getattr(mnt, dentry, stat);
+		stat->nlink = stat->size = 2;
+		stat->atime = CURRENT_TIME;
+		ZFS_EXIT(zsb);
+		return (error);
+	}
+
+	/*
+	 * dzp is only valid, and only holds a reference that must be
+	 * dropped, when zfs_zget() succeeded.  Never touch it on failure.
+	 */
+	error = -zfs_zget(zsb, zsb->z_shares_dir, &dzp);
+	if (error == 0) {
+		error = -zfs_getattr_fast(ZTOI(dzp), stat);
+		iput(ZTOI(dzp));
+	}
+
+	ZFS_EXIT(zsb);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * The '.zfs/shares' directory file operations.  Opens go through
+ * zpl_common_open(), which refuses write access, and the listing is
+ * produced by zpl_shares_readdir().
+ */
+const struct file_operations zpl_fops_shares = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= zpl_shares_readdir,
+};
+
+/*
+ * The '.zfs/shares' directory inode operations.  Only lookup and getattr
+ * are provided.
+ */
+const struct inode_operations zpl_ops_shares = {
+	.lookup		= zpl_shares_lookup,
+	.getattr	= zpl_shares_getattr,
+};
diff --git a/module/zfs/zpl_export.c b/module/zfs/zpl_export.c
index 4fe998437..f82ee3088 100644
--- a/module/zfs/zpl_export.c
+++ b/module/zfs/zpl_export.c
@@ -25,6 +25,7 @@
 
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
 #include <sys/zpl.h>
 
 
@@ -42,7 +43,10 @@ zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
 
 	fid->fid_len = len_bytes - offsetof(fid_t, fid_data);
 
-	rc = zfs_fid(ip, fid);
+	if (zfsctl_is_node(ip))
+		rc = zfsctl_fid(ip, fid);
+	else
+		rc = zfs_fid(ip, fid);
 
 	len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
 	*max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
index 9b5533755..d9b918b43 100644
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@@ -25,6 +25,7 @@
 
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
 #include <sys/vfs.h>
 #include <sys/zpl.h>
 
@@ -51,7 +52,7 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	return d_splice_alias(ip, dentry);
 }
 
-static void
+void
 zpl_vap_init(vattr_t *vap, struct inode *dir, struct dentry *dentry,
     mode_t mode, cred_t *cr)
 {
@@ -171,8 +172,20 @@ zpl_rmdir(struct inode * dir, struct dentry *dentry)
 static int
 zpl_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
+	boolean_t issnap = ITOZSB(dentry->d_inode)->z_issnap;
 	int error;
 
+	/*
+	 * Ensure MNT_SHRINKABLE is set on snapshots to ensure they are
+	 * unmounted automatically with the parent file system.  This
+	 * is done on the first getattr because it's not easy to get the
+	 * vfsmount structure at mount time.  This call path is explicitly
+	 * marked unlikely to avoid any performance impact.  FWIW, ext4
+	 * resorts to a similar trick for sysadmin convenience.
+	 */
+	if (unlikely(issnap && !(mnt->mnt_flags & MNT_SHRINKABLE)))
+		mnt->mnt_flags |= MNT_SHRINKABLE;
+
 	error = -zfs_getattr_fast(dentry->d_inode, stat);
 	ASSERT3S(error, <=, 0);
 
diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
index 0e6e9360f..98d0a0312 100644
--- a/module/zfs/zpl_super.c
+++ b/module/zfs/zpl_super.c
@@ -26,6 +26,7 @@
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
 #include <sys/zpl.h>
 
 
@@ -139,6 +140,20 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data)
 	return (error);
 }
 
+/*
+ * Installed as the .umount_begin super operation.  When the super block
+ * has a .zfs control directory it asks zfsctl_unmount_snapshots() to
+ * unmount the snapshots with MNT_FORCE.  Both the return value and the
+ * value stored in 'count' are deliberately ignored.
+ */
+static void
+zpl_umount_begin(struct super_block *sb)
+{
+	zfs_sb_t *zsb = sb->s_fs_info;
+	int count;
+
+	/*
+	 * Best effort to unmount snapshots in .zfs/snapshot/.  Normally this
+	 * isn't required because snapshots have the MNT_SHRINKABLE flag set.
+	 */
+	if (zsb->z_ctldir)
+		(void) zfsctl_unmount_snapshots(zsb, MNT_FORCE, &count);
+}
+
 /*
  * The Linux VFS automatically handles the following flags:
  * MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, MNT_NOATIME, MNT_READONLY
@@ -199,13 +214,7 @@ zpl_get_sb(struct file_system_type *fs_type, int flags,
 static void
 zpl_kill_sb(struct super_block *sb)
 {
-#ifdef HAVE_SNAPSHOT
-	zfs_sb_t *zsb = sb->s_fs_info;
-
-	if (zsb && dmu_objset_is_snapshot(zsb->z_os))
-		zfs_snap_destroy(zsb);
-#endif /* HAVE_SNAPSHOT */
-
+	/* Release the .zfs control directory before the generic teardown. */
+	zfs_preumount(sb);
 	kill_anon_super(sb);
 }
 
@@ -306,6 +315,7 @@ const struct super_operations zpl_super_operations = {
 	.sync_fs		= zpl_sync_fs,
 	.statfs			= zpl_statfs,
 	.remount_fs		= zpl_remount_fs,
+	.umount_begin		= zpl_umount_begin,
 	.show_options		= zpl_show_options,
 	.show_stats		= NULL,
 #ifdef HAVE_NR_CACHED_OBJECTS
author	Brian Behlendorf <[email protected]>	2011-11-11 12:45:53 +0530
committer	Brian Behlendorf <[email protected]>	2012-03-22 13:03:47 -0700
commit	ebe7e575eae1e03b1faa545a424f008faeac589d (patch)
tree	8699359f0f50019b3c2f49b46f0ff06a874c34e4 /module/zfs
parent	49be0ccf1fdc2ce852271d4d2f8b7a9c2c4be6db (diff)