9 files changed, 144 insertions, 42 deletions
diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h
index c4e1771ae..c9fa76ece 100644
--- a/include/linux/vfs_compat.h
+++ b/include/linux/vfs_compat.h
@@ -95,6 +95,14 @@ bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap)
 #endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */
 
 /*
+ * 2.6.38 API change,
+ * LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases.
+ */
+#ifndef LOOKUP_RCU
+#define LOOKUP_RCU      0x0
+#endif /* LOOKUP_RCU */
+
+/*
  * 3.2-rc1 API change,
  * Add set_nlink() if it is not exported by the Linux kernel.
  *
diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h
index 4dd46710f..f685c1296 100644
--- a/include/sys/zfs_vfsops.h
+++ b/include/sys/zfs_vfsops.h
@@ -69,6 +69,7 @@ typedef struct zfs_sb {
 	krwlock_t	z_teardown_inactive_lock;
 	list_t		z_all_znodes;	/* all znodes in the fs */
 	uint64_t	z_nr_znodes;	/* number of znodes in the fs */
+	unsigned long	z_rollback_time;/* last online rollback time */
 	kmutex_t	z_znodes_lock;	/* lock for z_all_znodes */
 	struct inode	*z_ctldir;	/* .zfs directory inode */
 	avl_tree_t	z_ctldir_snaps;	/* .zfs/snapshot entries */
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index 0b75d5295..41233547b 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -216,6 +216,7 @@ typedef struct znode {
 	boolean_t	z_is_zvol;	/* are we used by the zvol */
 	boolean_t	z_is_mapped;	/* are we mmap'ed */
 	boolean_t	z_is_ctldir;	/* are we .zfs entry */
+	boolean_t	z_is_stale;	/* are we stale due to rollback? */
 	struct inode	z_inode;	/* generic vfs inode */
 } znode_t;
 
diff --git a/include/sys/zpl.h b/include/sys/zpl.h
index e34b323bd..61a57ef29 100644
--- a/include/sys/zpl.h
+++ b/include/sys/zpl.h
@@ -28,18 +28,20 @@
 #include <sys/vfs.h>
 #include <linux/vfs_compat.h>
 #include <linux/xattr_compat.h>
+#include <linux/dcache_compat.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
 
 /* zpl_inode.c */
 extern void zpl_vap_init(vattr_t *vap, struct inode *dir,
-    struct dentry *dentry, zpl_umode_t mode, cred_t *cr);
+    zpl_umode_t mode, cred_t *cr);
 
 extern const struct inode_operations zpl_inode_operations;
 extern const struct inode_operations zpl_dir_inode_operations;
 extern const struct inode_operations zpl_symlink_inode_operations;
 extern const struct inode_operations zpl_special_inode_operations;
+extern dentry_operations_t zpl_dentry_operations;
 
 /* zpl_file.c */
 extern ssize_t zpl_read_common(struct inode *ip, const char *buf,
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 622d865df..b3801d494 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -197,6 +197,7 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_TRUE;
 	zp->z_is_sa = B_FALSE;
+	zp->z_is_stale = B_FALSE;
 	ip->i_ino = id;
 	ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
 	ip->i_uid = 0;
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index fc5c2ba39..ac5c317ce 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1032,7 +1032,7 @@ EXPORT_SYMBOL(zfs_sb_prune);
 #endif /* HAVE_SHRINK */
 
 /*
- * Teardown the zfs_sb_t::z_os.
+ * Teardown the zfs_sb_t.
  *
  * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
  * and 'z_teardown_inactive_lock' held.
@@ -1053,7 +1053,6 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 		 * for non-snapshots.
 		 */
 		shrink_dcache_sb(zsb->z_parent->z_sb);
-		(void) spl_invalidate_inodes(zsb->z_parent->z_sb, 0);
 	}
 
 	/*
@@ -1079,25 +1078,26 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 	}
 
 	/*
-	 * At this point there are no vops active, and any new vops will
-	 * fail with EIO since we have z_teardown_lock for writer (only
-	 * relavent for forced unmount).
+	 * At this point there are no VFS ops active, and any new VFS ops
+	 * will fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
 	 *
 	 * Release all holds on dbufs.
 	 */
 	mutex_enter(&zsb->z_znodes_lock);
 	for (zp = list_head(&zsb->z_all_znodes); zp != NULL;
-	    zp = list_next(&zsb->z_all_znodes, zp))
+	    zp = list_next(&zsb->z_all_znodes, zp)) {
 		if (zp->z_sa_hdl) {
 			ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0);
 			zfs_znode_dmu_fini(zp);
 		}
+	}
 	mutex_exit(&zsb->z_znodes_lock);
 
 	/*
-	 * If we are unmounting, set the unmounted flag and let new vops
+	 * If we are unmounting, set the unmounted flag and let new VFS ops
 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
-	 * other vops will fail with EIO.
+	 * other VFS ops will fail with EIO.
 	 */
 	if (unmounting) {
 		zsb->z_unmounted = B_TRUE;
@@ -1392,7 +1392,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
 EXPORT_SYMBOL(zfs_vget);
 
 /*
- * Block out VOPs and close zfs_sb_t::z_os
+ * Block out VFS ops and close zfs_sb_t
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
  * 'z_teardown_inactive_lock' write held.
@@ -1404,6 +1404,7 @@ zfs_suspend_fs(zfs_sb_t *zsb)
 
 	if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0)
 		return (error);
+
 	dmu_objset_disown(zsb->z_os, zsb);
 
 	return (0);
@@ -1411,7 +1412,7 @@ zfs_suspend_fs(zfs_sb_t *zsb)
 EXPORT_SYMBOL(zfs_suspend_fs);
 
 /*
- * Reopen zfs_sb_t::z_os and release VOPs.
+ * Reopen zfs_sb_t and release VFS ops.
  */
 int
 zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
@@ -1440,30 +1441,37 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
 			goto bail;
 
 		VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
+		zsb->z_rollback_time = jiffies;
 
 		/*
-		 * Attempt to re-establish all the active znodes with
-		 * their dbufs.  If a zfs_rezget() fails, then we'll let
-		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
-		 * when they try to use their znode.
+		 * Attempt to re-establish all the active inodes with their
+		 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
+		 * and mark it stale.  This prevents a collision if a new
+		 * inode/object is created which must use the same inode
+		 * number.  The stale inode will be released when the
+		 * VFS prunes the dentry holding the remaining references
+		 * on the stale inode.
 		 */
 		mutex_enter(&zsb->z_znodes_lock);
 		for (zp = list_head(&zsb->z_all_znodes); zp;
 		    zp = list_next(&zsb->z_all_znodes, zp)) {
-			(void) zfs_rezget(zp);
+			err2 = zfs_rezget(zp);
+			if (err2) {
+				remove_inode_hash(ZTOI(zp));
+				zp->z_is_stale = B_TRUE;
+			}
 		}
 		mutex_exit(&zsb->z_znodes_lock);
-
 	}
 
 bail:
-	/* release the VOPs */
+	/* release the VFS ops */
 	rw_exit(&zsb->z_teardown_inactive_lock);
 	rrw_exit(&zsb->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
-		 * Since we couldn't reopen zfs_sb_t::z_os, force
+		 * Since we couldn't reopen zfs_sb_t, force
 		 * unmount this file system.
 		 */
 		(void) zfs_umount(zsb->z_sb);
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 8074f1d00..9bf26a734 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -274,8 +274,10 @@ zfs_inode_destroy(struct inode *ip)
 		zfsctl_inode_destroy(ip);
 
 	mutex_enter(&zsb->z_znodes_lock);
-	list_remove(&zsb->z_all_znodes, zp);
-	zsb->z_nr_znodes--;
+	if (list_link_active(&zp->z_link_node)) {
+		list_remove(&zsb->z_all_znodes, zp);
+		zsb->z_nr_znodes--;
+	}
 	mutex_exit(&zsb->z_znodes_lock);
 
 	if (zp->z_acl_cached) {
@@ -348,7 +350,7 @@ zfs_inode_set_ops(zfs_sb_t *zsb, struct inode *ip)
 static znode_t *
 zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
     dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl,
-    struct dentry *dentry, struct inode *dip)
+    struct inode *dip)
 {
 	znode_t	*zp;
 	struct inode *ip;
@@ -379,6 +381,7 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
 	zp->z_is_zvol = B_FALSE;
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
+	zp->z_is_stale = B_FALSE;
 
 	zfs_znode_sa_init(zsb, zp, db, obj_type, hdl);
 
@@ -414,11 +417,15 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
 	zfs_inode_update(zp);
 	zfs_inode_set_ops(zsb, ip);
 
-	if (insert_inode_locked(ip))
-		goto error;
-
-	if (dentry)
-		d_instantiate(dentry, ip);
+	/*
+	 * The only way insert_inode_locked() can fail is if the ip->i_ino
+	 * number is already hashed for this super block.  This can never
+	 * happen because the inode numbers map 1:1 with the object numbers.
+	 *
+	 * The one exception is rolling back a mounted file system, but in
+	 * this case all the active inodes are unhashed during the rollback.
+	 */
+	VERIFY3S(insert_inode_locked(ip), ==, 0);
 
 	mutex_enter(&zsb->z_znodes_lock);
 	list_insert_tail(&zsb->z_all_znodes, zp);
@@ -720,9 +727,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 
 	if (!(flag & IS_ROOT_NODE)) {
 		*zpp = zfs_znode_alloc(zsb, db, 0, obj_type, obj, sa_hdl,
-		    vap->va_dentry, ZTOI(dzp));
-		ASSERT(*zpp != NULL);
-		ASSERT(dzp != NULL);
+		    ZTOI(dzp));
+		VERIFY(*zpp != NULL);
+		VERIFY(dzp != NULL);
 	} else {
 		/*
 		 * If we are creating the root node, the "parent" we
@@ -931,7 +938,7 @@ again:
 	 * bonus buffer.
 	 */
 	zp = zfs_znode_alloc(zsb, db, doi.doi_data_block_size,
-	    doi.doi_bonus_type, obj_num, NULL, NULL, NULL);
+	    doi.doi_bonus_type, obj_num, NULL, NULL);
 	if (zp == NULL) {
 		err = ENOENT;
 	} else {
@@ -961,8 +968,20 @@ zfs_rezget(znode_t *zp)
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
-
 	mutex_exit(&zp->z_acl_lock);
+
+	rw_enter(&zp->z_xattr_lock, RW_WRITER);
+	if (zp->z_xattr_cached) {
+		nvlist_free(zp->z_xattr_cached);
+		zp->z_xattr_cached = NULL;
+	}
+
+	if (zp->z_xattr_parent) {
+		iput(ZTOI(zp->z_xattr_parent));
+		zp->z_xattr_parent = NULL;
+	}
+	rw_exit(&zp->z_xattr_lock);
+
 	ASSERT(zp->z_sa_hdl == NULL);
 	err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db);
 	if (err) {
@@ -1016,6 +1035,7 @@ zfs_rezget(znode_t *zp)
 
 	zp->z_unlinked = (zp->z_links == 0);
 	zp->z_blksz = doi.doi_data_block_size;
+	zfs_inode_update(zp);
 
 	ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
 
diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c
index 2e5209f8c..54bdbe409 100644
--- a/module/zfs/zpl_ctldir.c
+++ b/module/zfs/zpl_ctldir.c
@@ -366,7 +366,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode)
 
 	crhold(cr);
 	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-	zpl_vap_init(vap, dip, dentry, mode | S_IFDIR, cr);
+	zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
 
 	error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
 	if (error == 0) {
diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
index 6175c2e93..15ee0f610 100644
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@@ -46,6 +46,10 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 	ASSERT3S(error, <=, 0);
 	crfree(cr);
 
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	spin_unlock(&dentry->d_lock);
+
 	if (error) {
 		if (error == -ENOENT)
 			return d_splice_alias(NULL, dentry);
@@ -57,12 +61,10 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 }
 
 void
-zpl_vap_init(vattr_t *vap, struct inode *dir, struct dentry *dentry,
-    zpl_umode_t mode, cred_t *cr)
+zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr)
 {
 	vap->va_mask = ATTR_MODE;
 	vap->va_mode = mode;
-	vap->va_dentry = dentry;
 	vap->va_uid = crgetfsuid(cr);
 
 	if (dir && dir->i_mode & S_ISGID) {
@@ -90,12 +92,14 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
 
 	crhold(cr);
 	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-	zpl_vap_init(vap, dir, dentry, mode, cr);
+	zpl_vap_init(vap, dir, mode, cr);
 
 	error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
 	if (error == 0) {
 		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
 		VERIFY3S(error, ==, 0);
+		d_instantiate(dentry, ip);
+		d_set_d_op(dentry, &zpl_dentry_operations);
 	}
 
 	kmem_free(vap, sizeof(vattr_t));
@@ -123,11 +127,15 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
 
 	crhold(cr);
 	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-	zpl_vap_init(vap, dir, dentry, mode, cr);
+	zpl_vap_init(vap, dir, mode, cr);
 	vap->va_rdev = rdev;
 
-	error = -zfs_create(dir, (char *)dentry->d_name.name,
-	    vap, 0, mode, &ip, cr, 0, NULL);
+	error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
+	if (error == 0) {
+		d_instantiate(dentry, ip);
+		d_set_d_op(dentry, &zpl_dentry_operations);
+	}
+
 	kmem_free(vap, sizeof(vattr_t));
 	crfree(cr);
 	ASSERT3S(error, <=, 0);
@@ -159,9 +167,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
 
 	crhold(cr);
 	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-	zpl_vap_init(vap, dir, dentry, mode | S_IFDIR, cr);
+	zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
 
 	error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL);
+	if (error == 0) {
+		d_instantiate(dentry, ip);
+		d_set_d_op(dentry, &zpl_dentry_operations);
+	}
+
 	kmem_free(vap, sizeof(vattr_t));
 	crfree(cr);
 	ASSERT3S(error, <=, 0);
@@ -262,9 +275,14 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
 
 	crhold(cr);
 	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-	zpl_vap_init(vap, dir, dentry, S_IFLNK | S_IRWXUGO, cr);
+	zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
 
 	error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0);
+	if (error == 0) {
+		d_instantiate(dentry, ip);
+		d_set_d_op(dentry, &zpl_dentry_operations);
+	}
+
 	kmem_free(vap, sizeof(vattr_t));
 	crfree(cr);
 	ASSERT3S(error, <=, 0);
@@ -334,6 +352,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 	}
 
 	d_instantiate(dentry, ip);
+	d_set_d_op(dentry, &zpl_dentry_operations);
 out:
 	crfree(cr);
 	ASSERT3S(error, <=, 0);
@@ -378,6 +397,44 @@ zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len)
 }
 #endif /* HAVE_INODE_FALLOCATE */
 
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned int flags = nd->flags;
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+	zfs_sb_t *zsb = dentry->d_sb->s_fs_info;
+	int error;
+
+	if (flags & LOOKUP_RCU)
+		return (-ECHILD);
+
+	/*
+	 * After a rollback, negative dentries created before the rollback
+	 * time must be invalidated.  Otherwise they can obscure files which
+	 * are only present in the rolled back dataset.
+	 */
+	if (dentry->d_inode == NULL) {
+		spin_lock(&dentry->d_lock);
+		error = time_before(dentry->d_time, zsb->z_rollback_time);
+		spin_unlock(&dentry->d_lock);
+
+		if (error)
+			return (0);
+	}
+
+	/*
+	 * The dentry may reference a stale inode if a mounted file system
+	 * was rolled back to a point in time where the object didn't exist.
+	 */
+	if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+		return (0);
+
+	return (1);
+}
 
 const struct inode_operations zpl_inode_operations = {
 	.create		= zpl_create,
@@ -440,3 +497,7 @@ const struct inode_operations zpl_special_inode_operations = {
 	.removexattr	= generic_removexattr,
 	.listxattr	= zpl_xattr_list,
 };
+
+dentry_operations_t zpl_dentry_operations = {
+	.d_revalidate	= zpl_revalidate,
+};