2 files changed, 244 insertions, 104 deletions
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index a8019ba5c..30b30891b 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -163,32 +163,7 @@
  *	return (error);			// done, report error
  */
 
-#if defined(_KERNEL) && defined(HAVE_MMAP)
-/*
- * Utility functions to map and unmap a single physical page.  These
- * are used to manage the mappable copies of ZFS file data, and therefore
- * do not update ref/mod bits.
- */
-caddr_t
-zfs_map_page(page_t *pp, enum seg_rw rw)
-{
-	if (kpm_enable)
-		return (hat_kpm_mapin(pp, 0));
-	ASSERT(rw == S_READ || rw == S_WRITE);
-	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
-	    (caddr_t)-1));
-}
-
-void
-zfs_unmap_page(page_t *pp, caddr_t addr)
-{
-	if (kpm_enable) {
-		hat_kpm_mapout(pp, 0, addr);
-	} else {
-		ppmapout(addr);
-	}
-}
-
+#if defined(_KERNEL)
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
@@ -197,25 +172,39 @@ zfs_unmap_page(page_t *pp, caddr_t addr)
  *		the page and the dmu buffer.
  */
 static void
-update_pages(struct inode *ip, int64_t start, int len, objset_t *os,
-    uint64_t oid)
+update_pages(struct inode *ip, int64_t start, int len,
+    objset_t *os, uint64_t oid)
 {
+	struct address_space *mp = ip->i_mapping;
+	struct page *pp;
+	uint64_t nbytes;
 	int64_t	off;
+	void *pb;
 
-	off = start & PAGEOFFSET;
-	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
-		page_t *pp;
-		uint64_t nbytes = MIN(PAGESIZE - off, len);
+	off = start & (PAGE_CACHE_SIZE-1);
+	for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) {
+		nbytes = MIN(PAGE_CACHE_SIZE - off, len);
 
-		if (pp = page_lookup(ip, start, SE_SHARED)) {
-			caddr_t va;
+		pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT);
+		if (pp) {
+			if (mapping_writably_mapped(mp))
+				flush_dcache_page(pp);
 
-			va = zfs_map_page(pp, S_WRITE);
-			(void) dmu_read(os, oid, start+off, nbytes, va+off,
+			pb = kmap(pp);
+			(void) dmu_read(os, oid, start+off, nbytes, pb+off,
 			    DMU_READ_PREFETCH);
-			zfs_unmap_page(pp, va);
-			page_unlock(pp);
+			kunmap(pp);
+
+			if (mapping_writably_mapped(mp))
+				flush_dcache_page(pp);
+
+			mark_page_accessed(pp);
+			SetPageUptodate(pp);
+			ClearPageError(pp);
+			unlock_page(pp);
+			page_cache_release(pp);
 		}
+
 		len -= nbytes;
 		off = 0;
 	}
@@ -234,28 +223,39 @@ update_pages(struct inode *ip, int64_t start, int len, objset_t *os,
 static int
 mappedread(struct inode *ip, int nbytes, uio_t *uio)
 {
+	struct address_space *mp = ip->i_mapping;
+	struct page *pp;
 	znode_t *zp = ITOZ(ip);
 	objset_t *os = ITOZSB(ip)->z_os;
 	int64_t	start, off;
+	uint64_t bytes;
 	int len = nbytes;
 	int error = 0;
+	void *pb;
 
 	start = uio->uio_loffset;
-	off = start & PAGEOFFSET;
-	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
-		page_t *pp;
-		uint64_t bytes = MIN(PAGESIZE - off, len);
-
-		if (pp = page_lookup(ip, start, SE_SHARED)) {
-			caddr_t va;
-
-			va = zfs_map_page(pp, S_READ);
-			error = uiomove(va + off, bytes, UIO_READ, uio);
-			zfs_unmap_page(pp, va);
-			page_unlock(pp);
+	off = start & (PAGE_CACHE_SIZE-1);
+	for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) {
+		bytes = MIN(PAGE_CACHE_SIZE - off, len);
+
+		pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT);
+		if (pp) {
+			ASSERT(PageUptodate(pp));
+
+			pb = kmap(pp);
+			error = uiomove(pb + off, bytes, UIO_READ, uio);
+			kunmap(pp);
+
+			if (mapping_writably_mapped(mp))
+				flush_dcache_page(pp);
+
+			mark_page_accessed(pp);
+			unlock_page(pp);
+			page_cache_release(pp);
 		} else {
 			error = dmu_read_uio(os, zp->z_id, uio, bytes);
 		}
+
 		len -= bytes;
 		off = 0;
 		if (error)
@@ -263,7 +263,7 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
 	}
 	return (error);
 }
-#endif /* _KERNEL && HAVE_MMAP */
+#endif /* _KERNEL */
 
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
@@ -273,7 +273,8 @@ offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
  *	IN:	ip	- inode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
- *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
+ *		ioflag	- FSYNC flags; used to provide FRSYNC semantics.
+ *			  O_DIRECT flag; used to bypass page cache.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
@@ -394,15 +395,11 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
-/* XXX: Drop this, ARC update handled by zpl layer */
-#ifdef HAVE_MMAP
-		if (vn_has_cached_data(ip))
+		if (zp->z_is_mapped && !(ioflag & O_DIRECT))
 			error = mappedread(ip, nbytes, uio);
 		else
 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
-#else
-		error = dmu_read_uio(os, zp->z_id, uio, nbytes);
-#endif /* HAVE_MMAP */
+
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
@@ -429,6 +426,7 @@ EXPORT_SYMBOL(zfs_read);
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- FAPPEND flag set if in append mode.
+ *			  O_DIRECT flag; used to bypass page cache.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range.
@@ -700,13 +698,9 @@ again:
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
-/* XXX: Drop this, ARC update handled by zpl layer */
-#ifdef HAVE_MMAP
-		if (tx_bytes && vn_has_cached_data(ip)) {
-			update_pages(ip, woff,
-			    tx_bytes, zsb->z_os, zp->z_id);
-		}
-#endif /* HAVE_MMAP */
+
+		if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT))
+			update_pages(ip, woff, tx_bytes, zsb->z_os, zp->z_id);
 
 		/*
 		 * If we made no progress, we're done.  If we made even
@@ -3392,6 +3386,7 @@ top:
 }
 EXPORT_SYMBOL(zfs_link);
 
+#ifdef HAVE_MMAP
 /*
  * zfs_null_putapage() is used when the file system has been force
  * unmounted. It just drops the pages.
@@ -3627,48 +3622,30 @@ out:
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
+#endif /* HAVE_MMAP */
 
 /*ARGSUSED*/
 void
-zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+zfs_inactive(struct inode *ip)
 {
-	znode_t	*zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t	*zp = ITOZ(ip);
+	zfs_sb_t *zsb = ITOZSB(ip);
 	int error;
 
-	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-	if (zp->z_sa_hdl == NULL) {
-		/*
-		 * The fs has been unmounted, or we did a
-		 * suspend/resume and this file no longer exists.
-		 */
-		if (vn_has_cached_data(vp)) {
-			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
-			    B_INVAL, cr);
-		}
+	truncate_inode_pages(&ip->i_data, 0);
 
-		mutex_enter(&zp->z_lock);
-		mutex_enter(&vp->v_lock);
-		ASSERT(vp->v_count == 1);
-		vp->v_count = 0;
-		mutex_exit(&vp->v_lock);
-		mutex_exit(&zp->z_lock);
-		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		zfs_znode_free(zp);
-		return;
-	}
+#ifdef HAVE_SNAPSHOT
+	/* Early return for snapshot inode? */
+#endif /* HAVE_SNAPSHOT */
 
-	/*
-	 * Attempt to push any data in the page cache.  If this fails
-	 * we will get kicked out later in zfs_zinactive().
-	 */
-	if (vn_has_cached_data(vp)) {
-		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
-		    cr);
+	rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL) {
+		rw_exit(&zsb->z_teardown_inactive_lock);
+		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
-		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_t *tx = dmu_tx_create(zsb->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
@@ -3712,6 +3689,7 @@ zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp,
 }
 EXPORT_SYMBOL(zfs_seek);
 
+#ifdef HAVE_MMAP
 /*
  * Pre-filter the generic locking function to trap attempts to place
  * a mandatory lock on a memory mapped file.
@@ -4056,6 +4034,7 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 
 	return (0);
 }
+#endif /* HAVE_MMAP */
 
 /*
  * convoff - converts the given data (start, whence) to the
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index 41901bb6c..585f971e8 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -145,24 +145,185 @@ zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 	return (wrote);
 }
 
+/*
+ * It's worth taking a moment to describe how mmap is implemented
+ * for zfs because it differs considerably from other Linux filesystems.
+ * However, this issue is handled the same way under OpenSolaris.
+ *
+ * The issue is that by design zfs bypasses the Linux page cache and
+ * leaves all caching up to the ARC.  This has been shown to work
+ * well for the common read(2)/write(2) case.  However, mmap(2)
+ * is problem because it relies on being tightly integrated with the
+ * page cache.  To handle this we cache mmap'ed files twice, once in
+ * the ARC and a second time in the page cache.  The code is careful
+ * to keep both copies synchronized.
+ *
+ * When a file with an mmap'ed region is written to using write(2)
+ * both the data in the ARC and existing pages in the page cache
+ * are updated.  For a read(2) data will be read first from the page
+ * cache then the ARC if needed.  Neither a write(2) nor a read(2) will
+ * ever result in new pages being added to the page cache.
+ *
+ * New pages are added to the page cache only via .readpage() which
+ * is called when the vfs needs to read a page off disk to back the
+ * virtual memory region.  These pages may be modified without
+ * notifying the ARC and will be written out periodically via
+ * .writepage().  This will occur due to either a sync or the usual
+ * page aging behavior.  Note because a read(2) of a mmap'ed file
+ * will always check the page cache first even when the ARC is out
+ * of date correct data will still be returned.
+ *
+ * While this implementation ensures correct behavior it does have
+ * some drawbacks.  The most obvious of which is that it
+ * increases the required memory footprint when accessing mmap'ed
+ * files.  It also adds additional complexity to the code keeping
+ * both caches synchronized.
+ *
+ * Longer term it may be possible to cleanly resolve this wart by
+ * mapping page cache pages directly on to the ARC buffers.  The
+ * Linux address space operations are flexible enough to allow
+ * selection of which pages back a particular index.  The trick
+ * would be working out the details of which subsystem is in
+ * charge, the ARC, the page cache, or both.  It may also prove
+ * helpful to move the ARC buffers to scatter-gather lists
+ * rather than a vmalloc'ed region.
+ */
+static int
+zpl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	znode_t *zp = ITOZ(filp->f_mapping->host);
+	int error;
+
+	error = generic_file_mmap(filp, vma);
+	if (error)
+		return (error);
+
+	mutex_enter(&zp->z_lock);
+	zp->z_is_mapped = 1;
+	mutex_exit(&zp->z_lock);
+
+	return (error);
+}
+
+/*
+ * Populate a page with data for the Linux page cache.  This function is
+ * only used to support mmap(2).  There will be an identical copy of the
+ * data in the ARC which is kept up to date via .write() and .writepage().
+ *
+ * Currently this function relies on zpl_read_common() and the O_DIRECT
+ * flag to read in a page.  This works but the more correct way is to
+ * update zfs_fillpage() to be Linux friendly and use that interface.
+ */
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+	struct inode *ip;
+	loff_t off, i_size;
+	size_t len, wrote;
+	cred_t *cr;
+	void *pb;
+	int error = 0;
+
+	ASSERT(PageLocked(pp));
+	ip = pp->mapping->host;
+	off = page_offset(pp);
+	i_size = i_size_read(ip);
+	ASSERT3S(off, <, i_size);
+
+	cr = (cred_t *)get_current_cred();
+	len = MIN(PAGE_CACHE_SIZE, i_size - off);
+
+	pb = kmap(pp);
+
+	/* O_DIRECT is passed to bypass the page cache and avoid deadlock. */
+	wrote = zpl_read_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr);
+	if (wrote != len)
+		error = -EIO;
+
+	if (!error && (len < PAGE_CACHE_SIZE))
+		memset(pb + len, 0, PAGE_CACHE_SIZE - len);
+
+	kunmap(pp);
+	put_cred(cr);
+
+	if (error) {
+		SetPageError(pp);
+		ClearPageUptodate(pp);
+	} else {
+		ClearPageError(pp);
+		SetPageUptodate(pp);
+		flush_dcache_page(pp);
+	}
+
+	unlock_page(pp);
+
+	return (error);
+}
+
+/*
+ * Write out dirty pages to the ARC.  This function is only required to
+ * support mmap(2).  Mapped pages may be dirtied by memory operations
+ * which never call .write().  These dirty pages are kept in sync with
+ * the ARC buffers via this hook.
+ *
+ * Currently this function relies on zpl_write_common() and the O_DIRECT
+ * flag to push out the page.  This works but the more correct way is
+ * to update zfs_putapage() to be Linux friendly and use that interface.
+ */
+static int
+zpl_writepage(struct page *pp, struct writeback_control *wbc)
+{
+	struct inode *ip;
+	loff_t off, i_size;
+	size_t len, read;
+	cred_t *cr;
+	void *pb;
+	int error = 0;
+
+	ASSERT(PageLocked(pp));
+	ip = pp->mapping->host;
+	off = page_offset(pp);
+	i_size = i_size_read(ip);
+
+	cr = (cred_t *)get_current_cred();
+	len = MIN(PAGE_CACHE_SIZE, i_size - off);
+
+	pb = kmap(pp);
+
+	/* O_DIRECT is passed to bypass the page cache and avoid deadlock. */
+	read = zpl_write_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr);
+	if (read != len)
+		error = -EIO;
+
+	kunmap(pp);
+	put_cred(cr);
+
+	if (error) {
+		SetPageError(pp);
+		ClearPageUptodate(pp);
+	} else {
+		ClearPageError(pp);
+		SetPageUptodate(pp);
+	}
+
+	unlock_page(pp);
+
+	return (error);
+}
+
 const struct address_space_operations zpl_address_space_operations = {
-#if 0
 	.readpage	= zpl_readpage,
 	.writepage	= zpl_writepage,
-	.direct_IO	= zpl_direct_IO,
-#endif
 };
 
 const struct file_operations zpl_file_operations = {
 	.open		= generic_file_open,
 	.llseek		= generic_file_llseek,
-	.read		= zpl_read,	/* do_sync_read */
-	.write		= zpl_write,	/* do_sync_write */
+	.read		= zpl_read,
+	.write		= zpl_write,
 	.readdir	= zpl_readdir,
-	.mmap		= generic_file_mmap,
+	.mmap		= zpl_mmap,
 	.fsync		= zpl_fsync,
-	.aio_read	= NULL,		/* generic_file_aio_read */
-	.aio_write	= NULL,		/* generic_file_aio_write */
 };
 
 const struct file_operations zpl_dir_file_operations = {