From 21a96fb635fb14a59efe44d7ea6767c5f9a586bc Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Thu, 5 Mar 2015 11:52:26 -0800
Subject: Fix "BUG: Bad page state" caused by writeback flag

Commit d958324 fixed the deadlock between page lock and range lock by
unlocking the page lock before acquiring the range lock. However,
this created a new issue #3075.

The problem is that if we can't set the write back bit before releasing
the page lock.  Then other processes will be unaware that the page is
under active write back.  They may therefore truncate the page,
invalidate the page, or not honor the sync semantics.

To workaround this problem we re-dirty the page before dropping the
page lock.  While this doesn't prevent the page from being truncated
it does ensure it won't be invalidated.  Then the range lock and the
page lock are reacquired in the correct deadlock-free order.

Once both locks are safely held the page state can be rechecked.  If
all is well and the page is in the expect state the dirty bit can be
removed, the write back bit set, and the page removed from the skip
count.  If not the page will be handled as appropriate.

Signed-off-by: Chunwei Chen <tuxoko@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3075
---
 module/zfs/zfs_vnops.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'module')

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index d69b34f18..1d23d6db3 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -3873,6 +3873,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	uint64_t	mtime[2], ctime[2];
 	sa_bulk_attr_t	bulk[3];
 	int		cnt = 0;
+	struct address_space *mapping;
 
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(zp);
@@ -3919,10 +3920,59 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	 * 2) Before setting or clearing write back on a page the range lock
 	 *    must be held in order to prevent a lock inversion with the
 	 *    zfs_free_range() function.
+	 *
+	 * This presents a problem because upon entering this function the
+	 * page lock is already held.  To safely acquire the range lock the
+	 * page lock must be dropped.  This creates a window where another
+	 * process could truncate, invalidate, dirty, or write out the page.
+	 *
+	 * Therefore, after successfully reacquiring the range and page locks
+	 * the current page state is checked.  In the common case everything
+	 * will be as is expected and it can be written out.  However, if
+	 * the page state has changed it must be handled accordingly.
 	 */
+	mapping = pp->mapping;
+	redirty_page_for_writepage(wbc, pp);
 	unlock_page(pp);
+
 	rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
+	lock_page(pp);
+
+	/* Page mapping changed or it was no longer dirty, we're done */
+	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+		unlock_page(pp);
+		zfs_range_unlock(rl);
+		ZFS_EXIT(zsb);
+		return (0);
+	}
+
+	/* Another process started write block if required */
+	if (PageWriteback(pp)) {
+		unlock_page(pp);
+		zfs_range_unlock(rl);
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(pp);
+
+		ZFS_EXIT(zsb);
+		return (0);
+	}
+
+	/* Clear the dirty flag the required locks are held */
+	if (!clear_page_dirty_for_io(pp)) {
+		unlock_page(pp);
+		zfs_range_unlock(rl);
+		ZFS_EXIT(zsb);
+		return (0);
+	}
+
+	/*
+	 * Counterpart for redirty_page_for_writepage() above.  This page
+	 * was in fact not skipped and should not be counted as if it were.
+	 */
+	wbc->pages_skipped--;
 	set_page_writeback(pp);
+	unlock_page(pp);
 
 	tx = dmu_tx_create(zsb->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
-- 
cgit v1.2.3