OpenZFS 9689 - zfs range lock code should not be zpl-specific

The ZFS range locking code in zfs_rlock.c/h depends on ZPL-specific data structures, specifically znode_t. However, it's also used by the ZVOL code, which uses a "dummy" znode_t to pass to the range locking code. We should clean this up so that the range locking code is generic and can be used equally by ZPL and ZVOL, and also can be used by future consumers that may need to run in userland (libzpool) as well as the kernel. Porting notes: * Added missing sys/avl.h include to sys/zfs_rlock.h. * Removed 'dbuf is within the locked range' ASSERTs from dmu_sync(). This was needed because ztest does not yet use a locked_range_t. * Removed "Approved by:" tag requirement from OpenZFS commit check to prevent needless warnings when integrating changes which have not been merged to illumos. * Reverted free_list range lock changes which were originally needed to defer the cv_destroy() which was called immediately after cv_broadcast(). With d2733258 this should be safe but if not we may need to reintroduce this logic. * Reverts: The following two commits were reverted and squashed into this change in order to make it easier to apply OpenZFS 9689. - d88895a0, which removed the dummy znode from zvol_state - e3a07cd0, which updated ztest to use range locks * Preserved optimized rangelock comparison function. Preserved the rangelock free list. The cv_destroy() function will block waiting for all processes in cv_wait() to be scheduled and drop their reference. This is done to ensure it's safe to free the condition variable. However, blocking while holding the rl->rl_lock mutex can result in a deadlock on Linux. A free list is introduced to defer the cv_destroy() and kmem_free() until after the mutex is released. 
Authored by: Matthew Ahrens <[email protected]> Reviewed by: Brian Behlendorf <[email protected]> Reviewed by: Serapheim Dimitropoulos <[email protected]> Reviewed by: George Wilson <[email protected]> Reviewed by: Brad Lewis <[email protected]> Ported-by: Brian Behlendorf <[email protected]> OpenZFS-issue: https://illumos.org/issues/9689 OpenZFS-commit: https://github.com/openzfs/openzfs/pull/680 External-issue: DLPX-58662 Closes #7980
author: Matt Ahrens <[email protected]> 2018-10-01 15:13:12 -0700
committer: Brian Behlendorf <[email protected]> 2018-10-11 10:19:33 -0700
commit: 5d43cc9a59d61aea37a1236e9d28628856030947 (patch)
tree: 366ef984481334ae6f99287048ced780eb097712 /module
parent: 50a343d85c04698d51c154375a00994dea81e6db (diff)
5 files changed, 396 insertions, 368 deletions
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 2ff484b63..180c1f12f 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1924,11 +1924,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
-	/* dbuf is within the locked range */
-	ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
-	ASSERT3U(db->db.db_offset + db->db.db_size, <=,
-	    zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
-
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c
index 7ecc353d2..d514a4fc7 100644
--- a/module/zfs/zfs_rlock.c
+++ b/module/zfs/zfs_rlock.c
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /*
@@ -34,9 +34,9 @@
  * Interface
  * ---------
  * Defined in zfs_rlock.h but essentially:
- *	rl = zfs_range_lock(zp, off, len, lock_type);
- *	zfs_range_unlock(rl);
- *	zfs_range_reduce(rl, off, len);
+ *	lr = rangelock_enter(rl, off, len, lock_type);
+ *	rangelock_reduce(lr, off, len); // optional
+ *	rangelock_exit(lr);
  *
  * AVL tree
  * --------
@@ -46,9 +46,10 @@
  *
  * Common case
  * -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree
- * searched that finds no overlap, and *this* rl_t is placed in the tree.
+ * The (hopefully) usual case is of no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
+ * searched, no overlap is found, and *this* locked_range_t is placed in the
+ * tree.
  *
  * Overlaps/Reference counting/Proxy locks
  * ---------------------------------------
@@ -87,68 +88,85 @@
  *
  * Grow block handling
  * -------------------
- * ZFS supports multiple block sizes currently up to 128K. The smallest
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
  * block size is used for the file which is grown as needed. During this
  * growth all other writers and readers must be excluded.
  * So if the block size needs to be grown then the whole file is
  * exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using zfs_reduce_range.
+ * range to just the range to be written using rangelock_reduce().
  */
 
+#include <sys/zfs_context.h>
 #include <sys/zfs_rlock.h>
-#include <sys/sysmacros.h>
+
+/*
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
+ */
+static int
+rangelock_compare(const void *arg1, const void *arg2)
+{
+	const locked_range_t *rl1 = (const locked_range_t *)arg1;
+	const locked_range_t *rl2 = (const locked_range_t *)arg2;
+
+	return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
+}
+
+/*
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
+{
+	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&rl->rl_tree, rangelock_compare,
+	    sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
+	rl->rl_cb = cb;
+	rl->rl_arg = arg;
+}
+
+void
+rangelock_fini(rangelock_t *rl)
+{
+	mutex_destroy(&rl->rl_lock);
+	avl_destroy(&rl->rl_tree);
+}
 
 /*
  * Check if a write lock can be grabbed, or wait and recheck until available.
  */
 static void
-zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
 {
-	avl_tree_t *tree = &zrl->zr_avl;
-	rl_t *rl;
+	avl_tree_t *tree = &rl->rl_tree;
+	locked_range_t *lr;
 	avl_index_t where;
-	uint64_t end_size;
-	uint64_t off = new->r_off;
-	uint64_t len = new->r_len;
+	uint64_t orig_off = new->lr_offset;
+	uint64_t orig_len = new->lr_length;
+	rangelock_type_t orig_type = new->lr_type;
 
 	for (;;) {
 		/*
-		 * Range locking is also used by zvol. However, for zvol, we
-		 * don't need to append or grow blocksize, so skip that
-		 * processing.
-		 *
-		 * Yes, this is ugly, and would be solved by not handling
-		 * grow or append in range lock code. If that was done then
-		 * we could make the range locking code generically available
-		 * to other non-zfs consumers.
+		 * Call callback which can modify new->lr_{offset,length,type}.
+		 * Note, the callback is used by the ZPL to handle appending
+		 * and changing blocksizes.  It isn't needed for zvols.
 		 */
-		if (zrl->zr_size) { /* caller is ZPL */
-			/*
-			 * If in append mode pick up the current end of file.
-			 * This is done under z_range_lock to avoid races.
-			 */
-			if (new->r_type == RL_APPEND)
-				new->r_off = *zrl->zr_size;
-
-			/*
-			 * If we need to grow the block size then grab the whole
-			 * file range. This is also done under z_range_lock to
-			 * avoid races.
-			 */
-			end_size = MAX(*zrl->zr_size, new->r_off + len);
-			if (end_size > *zrl->zr_blksz &&
-			    (!ISP2(*zrl->zr_blksz) ||
-			    *zrl->zr_blksz < *zrl->zr_max_blksz)) {
-				new->r_off = 0;
-				new->r_len = UINT64_MAX;
-			}
+		if (rl->rl_cb != NULL) {
+			rl->rl_cb(new, rl->rl_arg);
 		}
 
 		/*
+		 * If the type was APPEND, the callback must convert it to
+		 * WRITER.
+		 */
+		ASSERT3U(new->lr_type, ==, RL_WRITER);
+
+		/*
 		 * First check for the usual case of no locks
 		 */
 		if (avl_numnodes(tree) == 0) {
-			new->r_type = RL_WRITER; /* convert to writer */
 			avl_add(tree, new);
 			return;
 		}
@@ -156,31 +174,33 @@ zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
 		/*
 		 * Look for any locks in the range.
 		 */
-		rl = avl_find(tree, new, &where);
-		if (rl)
+		lr = avl_find(tree, new, &where);
+		if (lr != NULL)
 			goto wait; /* already locked at same offset */
 
-		rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-		if (rl && (rl->r_off < new->r_off + new->r_len))
+		lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+		if (lr != NULL &&
+		    lr->lr_offset < new->lr_offset + new->lr_length)
 			goto wait;
 
-		rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
-		if (rl && rl->r_off + rl->r_len > new->r_off)
+		lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
+		if (lr != NULL &&
+		    lr->lr_offset + lr->lr_length > new->lr_offset)
 			goto wait;
 
-		new->r_type = RL_WRITER; /* convert possible RL_APPEND */
 		avl_insert(tree, new, where);
 		return;
 wait:
-		if (!rl->r_write_wanted) {
-			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
-			rl->r_write_wanted = B_TRUE;
+		if (!lr->lr_write_wanted) {
+			cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+			lr->lr_write_wanted = B_TRUE;
 		}
-		cv_wait(&rl->r_wr_cv, &zrl->zr_mutex);
+		cv_wait(&lr->lr_write_cv, &rl->rl_lock);
 
 		/* reset to original */
-		new->r_off = off;
-		new->r_len = len;
+		new->lr_offset = orig_off;
+		new->lr_length = orig_len;
+		new->lr_type = orig_type;
 	}
 }
 
@@ -188,29 +208,29 @@ wait:
  * If this is an original (non-proxy) lock then replace it by
  * a proxy and return the proxy.
  */
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+static locked_range_t *
+rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
 {
-	rl_t *proxy;
+	locked_range_t *proxy;
 
-	if (rl->r_proxy)
-		return (rl); /* already a proxy */
+	if (lr->lr_proxy)
+		return (lr); /* already a proxy */
 
-	ASSERT3U(rl->r_cnt, ==, 1);
-	ASSERT(rl->r_write_wanted == B_FALSE);
-	ASSERT(rl->r_read_wanted == B_FALSE);
-	avl_remove(tree, rl);
-	rl->r_cnt = 0;
+	ASSERT3U(lr->lr_count, ==, 1);
+	ASSERT(lr->lr_write_wanted == B_FALSE);
+	ASSERT(lr->lr_read_wanted == B_FALSE);
+	avl_remove(tree, lr);
+	lr->lr_count = 0;
 
 	/* create a proxy range lock */
-	proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	proxy->r_off = rl->r_off;
-	proxy->r_len = rl->r_len;
-	proxy->r_cnt = 1;
-	proxy->r_type = RL_READER;
-	proxy->r_proxy = B_TRUE;
-	proxy->r_write_wanted = B_FALSE;
-	proxy->r_read_wanted = B_FALSE;
+	proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+	proxy->lr_offset = lr->lr_offset;
+	proxy->lr_length = lr->lr_length;
+	proxy->lr_count = 1;
+	proxy->lr_type = RL_READER;
+	proxy->lr_proxy = B_TRUE;
+	proxy->lr_write_wanted = B_FALSE;
+	proxy->lr_read_wanted = B_FALSE;
 	avl_add(tree, proxy);
 
 	return (proxy);
@@ -220,29 +240,27 @@ zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
  * Split the range lock at the supplied offset
  * returning the *front* proxy.
  */
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+static locked_range_t *
+rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
 {
-	rl_t *front, *rear;
-
-	ASSERT3U(rl->r_len, >, 1);
-	ASSERT3U(off, >, rl->r_off);
-	ASSERT3U(off, <, rl->r_off + rl->r_len);
-	ASSERT(rl->r_write_wanted == B_FALSE);
-	ASSERT(rl->r_read_wanted == B_FALSE);
+	ASSERT3U(lr->lr_length, >, 1);
+	ASSERT3U(off, >, lr->lr_offset);
+	ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+	ASSERT(lr->lr_write_wanted == B_FALSE);
+	ASSERT(lr->lr_read_wanted == B_FALSE);
 
 	/* create the rear proxy range lock */
-	rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	rear->r_off = off;
-	rear->r_len = rl->r_off + rl->r_len - off;
-	rear->r_cnt = rl->r_cnt;
-	rear->r_type = RL_READER;
-	rear->r_proxy = B_TRUE;
-	rear->r_write_wanted = B_FALSE;
-	rear->r_read_wanted = B_FALSE;
-
-	front = zfs_range_proxify(tree, rl);
-	front->r_len = off - rl->r_off;
+	locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+	rear->lr_offset = off;
+	rear->lr_length = lr->lr_offset + lr->lr_length - off;
+	rear->lr_count = lr->lr_count;
+	rear->lr_type = RL_READER;
+	rear->lr_proxy = B_TRUE;
+	rear->lr_write_wanted = B_FALSE;
+	rear->lr_read_wanted = B_FALSE;
+
+	locked_range_t *front = rangelock_proxify(tree, lr);
+	front->lr_length = off - lr->lr_offset;
 
 	avl_insert_here(tree, rear, front, AVL_AFTER);
 	return (front);
@@ -252,28 +270,27 @@ zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
  * Create and add a new proxy range lock for the supplied range.
  */
 static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
 {
-	rl_t *rl;
-
-	ASSERT(len);
-	rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	rl->r_off = off;
-	rl->r_len = len;
-	rl->r_cnt = 1;
-	rl->r_type = RL_READER;
-	rl->r_proxy = B_TRUE;
-	rl->r_write_wanted = B_FALSE;
-	rl->r_read_wanted = B_FALSE;
-	avl_add(tree, rl);
+	ASSERT(len != 0);
+	locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+	lr->lr_offset = off;
+	lr->lr_length = len;
+	lr->lr_count = 1;
+	lr->lr_type = RL_READER;
+	lr->lr_proxy = B_TRUE;
+	lr->lr_write_wanted = B_FALSE;
+	lr->lr_read_wanted = B_FALSE;
+	avl_add(tree, lr);
 }
 
 static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
+    locked_range_t *prev, avl_index_t where)
 {
-	rl_t *next;
-	uint64_t off = new->r_off;
-	uint64_t len = new->r_len;
+	locked_range_t *next;
+	uint64_t off = new->lr_offset;
+	uint64_t len = new->lr_length;
 
 	/*
 	 * prev arrives either:
@@ -282,37 +299,37 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
 	 *   range may overlap with the new range
 	 * - null, if there were no ranges starting before the new one
 	 */
-	if (prev) {
-		if (prev->r_off + prev->r_len <= off) {
+	if (prev != NULL) {
+		if (prev->lr_offset + prev->lr_length <= off) {
 			prev = NULL;
-		} else if (prev->r_off != off) {
+		} else if (prev->lr_offset != off) {
 			/*
 			 * convert to proxy if needed then
 			 * split this entry and bump ref count
 			 */
-			prev = zfs_range_split(tree, prev, off);
+			prev = rangelock_split(tree, prev, off);
 			prev = AVL_NEXT(tree, prev); /* move to rear range */
 		}
 	}
-	ASSERT((prev == NULL) || (prev->r_off == off));
+	ASSERT((prev == NULL) || (prev->lr_offset == off));
 
-	if (prev)
+	if (prev != NULL)
 		next = prev;
 	else
-		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+		next = avl_nearest(tree, where, AVL_AFTER);
 
-	if (next == NULL || off + len <= next->r_off) {
+	if (next == NULL || off + len <= next->lr_offset) {
 		/* no overlaps, use the original new rl_t in the tree */
 		avl_insert(tree, new, where);
 		return;
 	}
 
-	if (off < next->r_off) {
+	if (off < next->lr_offset) {
 		/* Add a proxy for initial range before the overlap */
-		zfs_range_new_proxy(tree, off, next->r_off - off);
+		rangelock_new_proxy(tree, off, next->lr_offset - off);
 	}
 
-	new->r_cnt = 0; /* will use proxies in tree */
+	new->lr_count = 0; /* will use proxies in tree */
 	/*
 	 * We now search forward through the ranges, until we go past the end
 	 * of the new range. For each entry we make it a proxy if it
@@ -320,47 +337,51 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
 	 * gaps between the ranges then we create a new proxy range.
 	 */
 	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
-		if (off + len <= next->r_off)
+		if (off + len <= next->lr_offset)
 			break;
-		if (prev && prev->r_off + prev->r_len < next->r_off) {
+		if (prev != NULL && prev->lr_offset + prev->lr_length <
+		    next->lr_offset) {
 			/* there's a gap */
-			ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
-			zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
-			    next->r_off - (prev->r_off + prev->r_len));
+			ASSERT3U(next->lr_offset, >,
+			    prev->lr_offset + prev->lr_length);
+			rangelock_new_proxy(tree,
+			    prev->lr_offset + prev->lr_length,
+			    next->lr_offset -
+			    (prev->lr_offset + prev->lr_length));
 		}
-		if (off + len == next->r_off + next->r_len) {
+		if (off + len == next->lr_offset + next->lr_length) {
 			/* exact overlap with end */
-			next = zfs_range_proxify(tree, next);
-			next->r_cnt++;
+			next = rangelock_proxify(tree, next);
+			next->lr_count++;
 			return;
 		}
-		if (off + len < next->r_off + next->r_len) {
+		if (off + len < next->lr_offset + next->lr_length) {
 			/* new range ends in the middle of this block */
-			next = zfs_range_split(tree, next, off + len);
-			next->r_cnt++;
+			next = rangelock_split(tree, next, off + len);
+			next->lr_count++;
 			return;
 		}
-		ASSERT3U(off + len, >, next->r_off + next->r_len);
-		next = zfs_range_proxify(tree, next);
-		next->r_cnt++;
+		ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+		next = rangelock_proxify(tree, next);
+		next->lr_count++;
 	}
 
 	/* Add the remaining end range. */
-	zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
-	    (off + len) - (prev->r_off + prev->r_len));
+	rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+	    (off + len) - (prev->lr_offset + prev->lr_length));
 }
 
 /*
  * Check if a reader lock can be grabbed, or wait and recheck until available.
  */
 static void
-zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
 {
-	avl_tree_t *tree = &zrl->zr_avl;
-	rl_t *prev, *next;
+	avl_tree_t *tree = &rl->rl_tree;
+	locked_range_t *prev, *next;
 	avl_index_t where;
-	uint64_t off = new->r_off;
-	uint64_t len = new->r_len;
+	uint64_t off = new->lr_offset;
+	uint64_t len = new->lr_length;
 
 	/*
 	 * Look for any writer locks in the range.
@@ -368,21 +389,22 @@ zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
 retry:
 	prev = avl_find(tree, new, &where);
 	if (prev == NULL)
-		prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+		prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
 
 	/*
 	 * Check the previous range for a writer lock overlap.
 	 */
-	if (prev && (off < prev->r_off + prev->r_len)) {
-		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
-			if (!prev->r_read_wanted) {
-				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
-				prev->r_read_wanted = B_TRUE;
+	if (prev && (off < prev->lr_offset + prev->lr_length)) {
+		if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+			if (!prev->lr_read_wanted) {
+				cv_init(&prev->lr_read_cv,
+				    NULL, CV_DEFAULT, NULL);
+				prev->lr_read_wanted = B_TRUE;
 			}
-			cv_wait(&prev->r_rd_cv, &zrl->zr_mutex);
+			cv_wait(&prev->lr_read_cv, &rl->rl_lock);
 			goto retry;
 		}
-		if (off + len < prev->r_off + prev->r_len)
+		if (off + len < prev->lr_offset + prev->lr_length)
 			goto got_lock;
 	}
 
@@ -390,95 +412,97 @@ retry:
 	 * Search through the following ranges to see if there's
 	 * write lock any overlap.
 	 */
-	if (prev)
+	if (prev != NULL)
 		next = AVL_NEXT(tree, prev);
 	else
-		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-	for (; next; next = AVL_NEXT(tree, next)) {
-		if (off + len <= next->r_off)
+		next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+	for (; next != NULL; next = AVL_NEXT(tree, next)) {
+		if (off + len <= next->lr_offset)
 			goto got_lock;
-		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
-			if (!next->r_read_wanted) {
-				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
-				next->r_read_wanted = B_TRUE;
+		if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+			if (!next->lr_read_wanted) {
+				cv_init(&next->lr_read_cv,
+				    NULL, CV_DEFAULT, NULL);
+				next->lr_read_wanted = B_TRUE;
 			}
-			cv_wait(&next->r_rd_cv, &zrl->zr_mutex);
+			cv_wait(&next->lr_read_cv, &rl->rl_lock);
 			goto retry;
 		}
-		if (off + len <= next->r_off + next->r_len)
+		if (off + len <= next->lr_offset + next->lr_length)
 			goto got_lock;
 	}
 
 got_lock:
 	/*
 	 * Add the read lock, which may involve splitting existing
-	 * locks and bumping ref counts (r_cnt).
+	 * locks and bumping ref counts (lr_count).
 	 */
-	zfs_range_add_reader(tree, new, prev, where);
+	rangelock_add_reader(tree, new, prev, where);
 }
 
 /*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER). Returns the range lock structure
- * for later unlocking or reduce range (if entire file
- * previously locked as RL_WRITER).
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND).  If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file).  Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER).
  */
-rl_t *
-zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len, rl_type_t type)
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type)
 {
-	rl_t *new;
-
 	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
 
-	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	new->r_zrl = zrl;
-	new->r_off = off;
+	locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+	new->lr_rangelock = rl;
+	new->lr_offset = off;
 	if (len + off < off)	/* overflow */
 		len = UINT64_MAX - off;
-	new->r_len = len;
-	new->r_cnt = 1; /* assume it's going to be in the tree */
-	new->r_type = type;
-	new->r_proxy = B_FALSE;
-	new->r_write_wanted = B_FALSE;
-	new->r_read_wanted = B_FALSE;
-
-	mutex_enter(&zrl->zr_mutex);
+	new->lr_length = len;
+	new->lr_count = 1; /* assume it's going to be in the tree */
+	new->lr_type = type;
+	new->lr_proxy = B_FALSE;
+	new->lr_write_wanted = B_FALSE;
+	new->lr_read_wanted = B_FALSE;
+
+	mutex_enter(&rl->rl_lock);
 	if (type == RL_READER) {
 		/*
 		 * First check for the usual case of no locks
 		 */
-		if (avl_numnodes(&zrl->zr_avl) == 0)
-			avl_add(&zrl->zr_avl, new);
+		if (avl_numnodes(&rl->rl_tree) == 0)
+			avl_add(&rl->rl_tree, new);
 		else
-			zfs_range_lock_reader(zrl, new);
-	} else /* RL_WRITER or RL_APPEND */
-		zfs_range_lock_writer(zrl, new);
-	mutex_exit(&zrl->zr_mutex);
+			rangelock_enter_reader(rl, new);
+	} else
+		rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
+	mutex_exit(&rl->rl_lock);
 	return (new);
 }
 
+/*
+ * Safely free the locked_range_t.
+ */
 static void
-zfs_range_free(void *arg)
+rangelock_free(locked_range_t *lr)
 {
-	rl_t *rl = arg;
+	if (lr->lr_write_wanted)
+		cv_destroy(&lr->lr_write_cv);
 
-	if (rl->r_write_wanted)
-		cv_destroy(&rl->r_wr_cv);
+	if (lr->lr_read_wanted)
+		cv_destroy(&lr->lr_read_cv);
 
-	if (rl->r_read_wanted)
-		cv_destroy(&rl->r_rd_cv);
-
-	kmem_free(rl, sizeof (rl_t));
+	kmem_free(lr, sizeof (locked_range_t));
 }
 
 /*
  * Unlock a reader lock
  */
 static void
-zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
+rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove,
+    list_t *free_list)
 {
-	avl_tree_t *tree = &zrl->zr_avl;
-	rl_t *rl, *next = NULL;
+	avl_tree_t *tree = &rl->rl_tree;
 	uint64_t len;
 
 	/*
@@ -488,53 +512,48 @@ zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
 	 * removed from the tree and replaced by proxies (one or
 	 * more ranges mapping to the entire range).
 	 */
-	if (remove->r_cnt == 1) {
+	if (remove->lr_count == 1) {
 		avl_remove(tree, remove);
-
-		if (remove->r_write_wanted)
-			cv_broadcast(&remove->r_wr_cv);
-
-		if (remove->r_read_wanted)
-			cv_broadcast(&remove->r_rd_cv);
-
+		if (remove->lr_write_wanted)
+			cv_broadcast(&remove->lr_write_cv);
+		if (remove->lr_read_wanted)
+			cv_broadcast(&remove->lr_read_cv);
 		list_insert_tail(free_list, remove);
 	} else {
-		ASSERT0(remove->r_cnt);
-		ASSERT0(remove->r_write_wanted);
-		ASSERT0(remove->r_read_wanted);
+		ASSERT0(remove->lr_count);
+		ASSERT0(remove->lr_write_wanted);
+		ASSERT0(remove->lr_read_wanted);
 		/*
 		 * Find start proxy representing this reader lock,
 		 * then decrement ref count on all proxies
 		 * that make up this range, freeing them as needed.
 		 */
-		rl = avl_find(tree, remove, NULL);
-		ASSERT(rl);
-		ASSERT(rl->r_cnt);
-		ASSERT(rl->r_type == RL_READER);
-		for (len = remove->r_len; len != 0; rl = next) {
-			len -= rl->r_len;
-			if (len) {
-				next = AVL_NEXT(tree, rl);
-				ASSERT(next);
-				ASSERT(rl->r_off + rl->r_len == next->r_off);
-				ASSERT(next->r_cnt);
-				ASSERT(next->r_type == RL_READER);
+		locked_range_t *lr = avl_find(tree, remove, NULL);
+		ASSERT3P(lr, !=, NULL);
+		ASSERT3U(lr->lr_count, !=, 0);
+		ASSERT3U(lr->lr_type, ==, RL_READER);
+		locked_range_t *next = NULL;
+		for (len = remove->lr_length; len != 0; lr = next) {
+			len -= lr->lr_length;
+			if (len != 0) {
+				next = AVL_NEXT(tree, lr);
+				ASSERT3P(next, !=, NULL);
+				ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+				    next->lr_offset);
+				ASSERT3U(next->lr_count, !=, 0);
+				ASSERT3U(next->lr_type, ==, RL_READER);
 			}
-			rl->r_cnt--;
-			if (rl->r_cnt == 0) {
-				avl_remove(tree, rl);
-
-				if (rl->r_write_wanted)
-					cv_broadcast(&rl->r_wr_cv);
-
-				if (rl->r_read_wanted)
-					cv_broadcast(&rl->r_rd_cv);
-
-				list_insert_tail(free_list, rl);
+			lr->lr_count--;
+			if (lr->lr_count == 0) {
+				avl_remove(tree, lr);
+				if (lr->lr_write_wanted)
+					cv_broadcast(&lr->lr_write_cv);
+				if (lr->lr_read_wanted)
+					cv_broadcast(&lr->lr_read_cv);
+				list_insert_tail(free_list, lr);
 			}
 		}
-
-		kmem_free(remove, sizeof (rl_t));
+		kmem_free(remove, sizeof (locked_range_t));
 	}
 }
 
@@ -542,91 +561,79 @@ zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
  * Unlock range and destroy range lock structure.
  */
 void
-zfs_range_unlock(rl_t *rl)
+rangelock_exit(locked_range_t *lr)
 {
-	zfs_rlock_t *zrl = rl->r_zrl;
+	rangelock_t *rl = lr->lr_rangelock;
 	list_t free_list;
-	rl_t *free_rl;
-
-	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
-	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
-	ASSERT(!rl->r_proxy);
-	list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));
+	locked_range_t *free_lr;
 
-	mutex_enter(&zrl->zr_mutex);
-	if (rl->r_type == RL_WRITER) {
-		/* writer locks can't be shared or split */
-		avl_remove(&zrl->zr_avl, rl);
-		if (rl->r_write_wanted)
-			cv_broadcast(&rl->r_wr_cv);
+	ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+	ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+	ASSERT(!lr->lr_proxy);
 
-		if (rl->r_read_wanted)
-			cv_broadcast(&rl->r_rd_cv);
+	/*
+	 * The free list is used to defer the cv_destroy() and
+	 * subsequent kmem_free until after the mutex is dropped.
+	 */
+	list_create(&free_list, sizeof (locked_range_t),
+	    offsetof(locked_range_t, lr_node));
 
-		list_insert_tail(&free_list, rl);
+	mutex_enter(&rl->rl_lock);
+	if (lr->lr_type == RL_WRITER) {
+		/* writer locks can't be shared or split */
+		avl_remove(&rl->rl_tree, lr);
+		if (lr->lr_write_wanted)
+			cv_broadcast(&lr->lr_write_cv);
+		if (lr->lr_read_wanted)
+			cv_broadcast(&lr->lr_read_cv);
+		list_insert_tail(&free_list, lr);
 	} else {
 		/*
-		 * lock may be shared, let zfs_range_unlock_reader()
-		 * release the zp->z_range_lock lock and free the rl_t
+		 * lock may be shared, let rangelock_exit_reader()
+		 * release the lock and free the locked_range_t.
 		 */
-		zfs_range_unlock_reader(zrl, rl, &free_list);
+		rangelock_exit_reader(rl, lr, &free_list);
 	}
-	mutex_exit(&zrl->zr_mutex);
+	mutex_exit(&rl->rl_lock);
 
-	while ((free_rl = list_head(&free_list)) != NULL) {
-		list_remove(&free_list, free_rl);
-		zfs_range_free(free_rl);
-	}
+	while ((free_lr = list_remove_head(&free_list)) != NULL)
+		rangelock_free(free_lr);
 
 	list_destroy(&free_list);
 }
 
 /*
  * Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusivly locked and so there's only one
+ * Asserts the whole file is exclusively locked and so there's only one
  * entry in the tree.
  */
 void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
 {
-	zfs_rlock_t *zrl = rl->r_zrl;
+	rangelock_t *rl = lr->lr_rangelock;
 
 	/* Ensure there are no other locks */
-	ASSERT(avl_numnodes(&zrl->zr_avl) == 1);
-	ASSERT(rl->r_off == 0);
-	ASSERT(rl->r_type == RL_WRITER);
-	ASSERT(!rl->r_proxy);
-	ASSERT3U(rl->r_len, ==, UINT64_MAX);
-	ASSERT3U(rl->r_cnt, ==, 1);
-
-	mutex_enter(&zrl->zr_mutex);
-	rl->r_off = off;
-	rl->r_len = len;
-
-	if (rl->r_write_wanted)
-		cv_broadcast(&rl->r_wr_cv);
-	if (rl->r_read_wanted)
-		cv_broadcast(&rl->r_rd_cv);
-
-	mutex_exit(&zrl->zr_mutex);
-}
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
-	const rl_t *rl1 = (const rl_t *)arg1;
-	const rl_t *rl2 = (const rl_t *)arg2;
-
-	return (AVL_CMP(rl1->r_off, rl2->r_off));
+	ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+	ASSERT3U(lr->lr_offset, ==, 0);
+	ASSERT3U(lr->lr_type, ==, RL_WRITER);
+	ASSERT(!lr->lr_proxy);
+	ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+	ASSERT3U(lr->lr_count, ==, 1);
+
+	mutex_enter(&rl->rl_lock);
+	lr->lr_offset = off;
+	lr->lr_length = len;
+	mutex_exit(&rl->rl_lock);
+	if (lr->lr_write_wanted)
+		cv_broadcast(&lr->lr_write_cv);
+	if (lr->lr_read_wanted)
+		cv_broadcast(&lr->lr_read_cv);
 }
 
-#ifdef _KERNEL
-EXPORT_SYMBOL(zfs_range_lock);
-EXPORT_SYMBOL(zfs_range_unlock);
-EXPORT_SYMBOL(zfs_range_reduce);
-EXPORT_SYMBOL(zfs_range_compare);
+#if defined(_KERNEL)
+EXPORT_SYMBOL(rangelock_init);
+EXPORT_SYMBOL(rangelock_fini);
+EXPORT_SYMBOL(rangelock_enter);
+EXPORT_SYMBOL(rangelock_exit);
+EXPORT_SYMBOL(rangelock_reduce);
 #endif
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 4e163e2e3..36f47e77a 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -477,7 +477,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	/*
 	 * Lock the range against changes.
 	 */
-	rl_t *rl = zfs_range_lock(&zp->z_range_lock,
+	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
 	    uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
@@ -550,7 +550,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
 	task_io_account_read(nread);
 out:
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -652,19 +652,18 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 #endif
 		uio_prefaultpages(MIN(n, max_blksz), uio);
 
-	rl_t	 *rl;
-
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
+	locked_range_t *lr;
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
-		rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND);
-		woff = rl->r_off;
-		if (rl->r_len == UINT64_MAX) {
+		lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+		woff = lr->lr_offset;
+		if (lr->lr_length == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
@@ -679,11 +678,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
-		rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER);
+		lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
 	}
 
 	if (woff >= limit) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
@@ -776,12 +775,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		}
 
 		/*
-		 * If zfs_range_lock() over-locked we grow the blocksize
+		 * If rangelock_enter() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
-		 * on the first iteration since zfs_range_reduce() will
-		 * shrink down r_len to the appropriate size.
+		 * on the first iteration since rangelock_reduce() will
+		 * shrink down lr_length to the appropriate size.
 		 */
-		if (rl->r_len == UINT64_MAX) {
+		if (lr->lr_length == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
@@ -797,7 +796,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
-			zfs_range_reduce(rl, woff, n);
+			rangelock_reduce(lr, woff, n);
 		}
 
 		/*
@@ -915,7 +914,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	}
 
 	zfs_inode_update(zp);
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
@@ -967,7 +966,7 @@ zfs_get_done(zgd_t *zgd, int error)
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
-	zfs_range_unlock(zgd->zgd_rl);
+	rangelock_exit(zgd->zgd_lr);
 
 	/*
 	 * Release the vnode asynchronously as we currently have the
@@ -1031,8 +1030,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
-		zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size,
-		    RL_READER);
+		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+		    offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = SET_ERROR(ENOENT);
@@ -1053,12 +1052,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 			size = zp->z_blksz;
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
-			zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset,
-			    size, RL_READER);
+			zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			offset += blkoff;
-			zfs_range_unlock(zgd->zgd_rl);
+			rangelock_exit(zgd->zgd_lr);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
@@ -4432,7 +4431,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	loff_t		offset;
 	loff_t		pgoff;
 	unsigned int	pglen;
-	rl_t		*rl;
 	dmu_tx_t	*tx;
 	caddr_t		va;
 	int		err = 0;
@@ -4506,13 +4504,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	redirty_page_for_writepage(wbc, pp);
 	unlock_page(pp);
 
-	rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER);
+	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+	    pgoff, pglen, RL_WRITER);
 	lock_page(pp);
 
 	/* Page mapping changed or it was no longer dirty, we're done */
 	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
 		unlock_page(pp);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
@@ -4520,7 +4519,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	/* Another process started write block if required */
 	if (PageWriteback(pp)) {
 		unlock_page(pp);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 
 		if (wbc->sync_mode != WB_SYNC_NONE)
 			wait_on_page_writeback(pp);
@@ -4532,7 +4531,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	/* Clear the dirty flag the required locks are held */
 	if (!clear_page_dirty_for_io(pp)) {
 		unlock_page(pp);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
@@ -4559,7 +4558,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		__set_page_dirty_nobuffers(pp);
 		ClearPageError(pp);
 		end_page_writeback(pp);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
@@ -4586,7 +4585,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	    zfs_putpage_commit_cb, pp);
 	dmu_tx_commit(tx);
 
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	if (wbc->sync_mode != WB_SYNC_NONE) {
 		/*
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 677320435..8925b6700 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -91,6 +91,37 @@ static kmem_cache_t *znode_cache = NULL;
 static kmem_cache_t *znode_hold_cache = NULL;
 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
 
+/*
+ * Callback for z_rangelock ('arg' is the znode), invoked when acquiring an
+ * RL_WRITER or RL_APPEND lock.  It rewrites 'new' in place: RL_APPEND becomes
+ * RL_WRITER at the current z_size, and the range is widened to the whole file
+ * if the block size must grow.  Called with rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(locked_range_t *new, void *arg)
+{
+	znode_t *zp = arg;
+
+	/*
+	 * An append request becomes a writer lock that starts at the
+	 * current end of file (z_size).
+	 */
+	if (new->lr_type == RL_APPEND) {
+		new->lr_offset = zp->z_size;
+		new->lr_type = RL_WRITER;
+	}
+
+	/*
+	 * If we need to grow the block size then lock the whole file range.
+	 */
+	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+		new->lr_offset = 0;
+		new->lr_length = UINT64_MAX;
+	}
+}
+
 /*ARGSUSED*/
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
@@ -106,7 +137,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
 
-	zfs_rlock_init(&zp->z_range_lock);
+	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
 
 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
@@ -128,7 +159,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	rw_destroy(&zp->z_xattr_lock);
-	zfs_rlock_destroy(&zp->z_range_lock);
+	rangelock_fini(&zp->z_rangelock);
 
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
@@ -577,9 +608,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
 	zp->z_is_stale = B_FALSE;
-	zp->z_range_lock.zr_size = &zp->z_size;
-	zp->z_range_lock.zr_blksz = &zp->z_blksz;
-	zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
@@ -1475,20 +1503,20 @@ zfs_extend(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	dmu_tx_t *tx;
-	rl_t *rl;
+	locked_range_t *lr;
 	uint64_t newblksz;
 	int error;
 
 	/*
 	 * We will change zp_size, lock the whole file.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
+	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end <= zp->z_size) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (0);
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
@@ -1518,7 +1546,7 @@ zfs_extend(znode_t *zp, uint64_t end)
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (error);
 	}
 
@@ -1530,7 +1558,7 @@ zfs_extend(znode_t *zp, uint64_t end)
 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
 	    &zp->z_size, sizeof (zp->z_size), tx));
 
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	dmu_tx_commit(tx);
 
@@ -1593,19 +1621,19 @@ static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
-	rl_t *rl;
+	locked_range_t *lr;
 	int error;
 
 	/*
 	 * Lock the range being freed.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
+	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (off >= zp->z_size) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (0);
 	}
 
@@ -1655,7 +1683,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 				    page_len);
 		}
 	}
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	return (error);
 }
@@ -1673,7 +1701,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	dmu_tx_t *tx;
-	rl_t *rl;
+	locked_range_t *lr;
 	int error;
 	sa_bulk_attr_t bulk[2];
 	int count = 0;
@@ -1681,20 +1709,20 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	/*
 	 * We will change zp_size, lock the whole file.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
+	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end >= zp->z_size) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (0);
 	}
 
 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
 	    DMU_OBJECT_END);
 	if (error) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (error);
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
@@ -1704,7 +1732,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (error);
 	}
 
@@ -1720,8 +1748,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
 
 	dmu_tx_commit(tx);
-
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	return (0);
 }
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index f7706f143..e6f8451b2 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -86,7 +86,6 @@
 #include <sys/dmu_tx.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
-#include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 
@@ -123,7 +122,7 @@ struct zvol_state {
 	uint32_t		zv_open_count;	/* open counts */
 	uint32_t		zv_changed;	/* disk changed */
 	zilog_t			*zv_zilog;	/* ZIL handle */
-	zfs_rlock_t		zv_range_lock;	/* range lock */
+	rangelock_t		zv_rangelock;	/* for range locking */
 	dnode_t			*zv_dn;		/* dnode hold */
 	dev_t			zv_dev;		/* device id */
 	struct gendisk		*zv_disk;	/* generic disk */
@@ -716,7 +715,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 typedef struct zv_request {
 	zvol_state_t	*zv;
 	struct bio	*bio;
-	rl_t		*rl;
+	locked_range_t	*lr;
 } zv_request_t;
 
 static void
@@ -778,7 +777,7 @@ zvol_write(void *arg)
 		if (error)
 			break;
 	}
-	zfs_range_unlock(zvr->rl);
+	rangelock_exit(zvr->lr);
 
 	int64_t nwritten = start_resid - uio.uio_resid;
 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
@@ -872,7 +871,8 @@ zvol_discard(void *arg)
 		    ZVOL_OBJ, start, size);
 	}
 unlock:
-	zfs_range_unlock(zvr->rl);
+	rangelock_exit(zvr->lr);
+
 	if (error == 0 && sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
@@ -917,7 +917,7 @@ zvol_read(void *arg)
 			break;
 		}
 	}
-	zfs_range_unlock(zvr->rl);
+	rangelock_exit(zvr->lr);
 
 	int64_t nread = start_resid - uio.uio_resid;
 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
@@ -985,7 +985,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
 		 * are asynchronous, we take it here synchronously to make
 		 * sure overlapped I/Os are properly ordered.
 		 */
-		zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+		zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
 		    RL_WRITER);
 		/*
 		 * Sync writes and discards execute zil_commit() which may need
@@ -1014,7 +1014,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
 
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
-		zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+		zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
 		    RL_READER);
 		if (zvol_request_sync || taskq_dispatch(zvol_taskq,
 		    zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
@@ -1036,7 +1036,7 @@ zvol_get_done(zgd_t *zgd, int error)
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
-	zfs_range_unlock(zgd->zgd_rl);
+	rangelock_exit(zgd->zgd_lr);
 
 	if (error == 0 && zgd->zgd_bp)
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
@@ -1072,7 +1072,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
-		zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
 		    RL_READER);
 		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
 		    DMU_READ_NO_PREFETCH);
@@ -1085,7 +1085,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 		 */
 		size = zv->zv_volblocksize;
 		offset = P2ALIGN_TYPED(offset, size, uint64_t);
-		zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
 		    RL_READER);
 		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
@@ -1687,7 +1687,7 @@ zvol_alloc(dev_t dev, const char *name)
 	zv->zv_open_count = 0;
 	strlcpy(zv->zv_name, name, MAXNAMELEN);
 
-	zfs_rlock_init(&zv->zv_range_lock);
+	rangelock_init(&zv->zv_rangelock, NULL, NULL);
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 
 	zv->zv_disk->major = zvol_major;
@@ -1745,7 +1745,7 @@ zvol_free(void *arg)
 	ASSERT(zv->zv_disk->private_data == NULL);
 
 	rw_destroy(&zv->zv_suspend_lock);
-	zfs_rlock_destroy(&zv->zv_range_lock);
+	rangelock_fini(&zv->zv_rangelock);
 
 	del_gendisk(zv->zv_disk);
 	blk_cleanup_queue(zv->zv_queue);
author	Matt Ahrens <[email protected]>	2018-10-01 15:13:12 -0700
committer	Brian Behlendorf <[email protected]>	2018-10-11 10:19:33 -0700
commit	5d43cc9a59d61aea37a1236e9d28628856030947 (patch)
tree	366ef984481334ae6f99287048ced780eb097712 /module
parent	50a343d85c04698d51c154375a00994dea81e6db (diff)