Diffstat (limited to 'module/os/linux/zfs')
-rw-r--r--  module/os/linux/zfs/abd_os.c        211
-rw-r--r--  module/os/linux/zfs/zfs_racct.c      29
-rw-r--r--  module/os/linux/zfs/zfs_uio.c       295
-rw-r--r--  module/os/linux/zfs/zfs_vfsops.c      1
-rw-r--r--  module/os/linux/zfs/zfs_vnops_os.c   44
-rw-r--r--  module/os/linux/zfs/zpl_file.c       71
6 files changed, 592 insertions, 59 deletions
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 60287ccdd..dae4107e0 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -186,6 +186,7 @@ static int zfs_abd_scatter_min_size = 512 * 3;
abd_t *abd_zero_scatter = NULL;
struct page;
+
/*
* abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
* point to ZERO_PAGE if it is available or it will be an allocated zero'd
@@ -453,14 +454,21 @@ abd_free_chunks(abd_t *abd)
if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
- abd_for_each_sg(abd, sg, nr_pages, i) {
- page = sg_page(sg);
- abd_unmark_zfs_page(page);
- order = compound_order(page);
- __free_pages(page, order);
- ASSERT3U(sg->length, <=, PAGE_SIZE << order);
- ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ /*
+ * Scatter ABDs may be constructed by abd_alloc_from_pages() from
+	 * an array of pages, in which case the underlying pages are not
+	 * owned by the ABD and should not be freed here.
+ */
+ if (!abd_is_from_pages(abd)) {
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ page = sg_page(sg);
+ abd_unmark_zfs_page(page);
+ order = compound_order(page);
+ __free_pages(page, order);
+ ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
}
+
abd_free_sg_table(abd);
}
@@ -551,17 +559,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
void
abd_verify_scatter(abd_t *abd)
{
- size_t n;
- int i = 0;
- struct scatterlist *sg = NULL;
-
ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
ABD_SCATTER(abd).abd_sgl->length);
- n = ABD_SCATTER(abd).abd_nents;
+
+#ifdef ZFS_DEBUG
+ struct scatterlist *sg = NULL;
+ size_t n = ABD_SCATTER(abd).abd_nents;
+ int i = 0;
+
abd_for_each_sg(abd, sg, n, i) {
ASSERT3P(sg_page(sg), !=, NULL);
}
+#endif
}
static void
@@ -687,14 +697,77 @@ abd_free_linear_page(abd_t *abd)
{
/* Transform it back into a scatter ABD for freeing */
struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+
+ /* When backed by user page unmap it */
+ if (abd_is_from_pages(abd))
+ zfs_kunmap(sg_page(sg));
+
abd->abd_flags &= ~ABD_FLAG_LINEAR;
abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
ABD_SCATTER(abd).abd_nents = 1;
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_sgl = sg;
abd_free_chunks(abd);
+}
+
+/*
+ * Allocate a scatter ABD structure from user pages. The pages must be
+ * pinned with get_user_pages, or similar, but need not be mapped via
+ * the kmap interfaces.
+ */
+abd_t *
+abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
+{
+ uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct sg_table table;
+
+ VERIFY3U(size, <=, DMU_MAX_ACCESS);
+ ASSERT3U(offset, <, PAGE_SIZE);
+ ASSERT3P(pages, !=, NULL);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd_t *abd = abd_alloc_struct(0);
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
+ abd->abd_size = size;
+
+ while (sg_alloc_table_from_pages(&table, pages, npages, offset,
+ size, __GFP_NOWARN | GFP_NOIO) != 0) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ if ((offset + size) <= PAGE_SIZE) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's constructed
+ * from a user page can be represented this way as long as the
+ * page is mapped to a virtual address. This allows us to
+	 * page is mapped to a virtual address. This allows us to
+	 * apply an offset into the mapped page.
+	 *
+	 * Note that kmap() must be used, not kmap_atomic(), because
+	 * the mapping needs to be set up on all CPUs. Using kmap()
+	 * also enables the use of highmem pages when required.
+ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ zfs_kmap(sg_page(table.sgl));
+ ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
+ } else {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ ABD_SCATTER(abd).abd_offset = offset;
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+ }
- abd_update_scatter_stats(abd, ABDSTAT_DECR);
+ return (abd);
}
/*
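As a usage sketch (hypothetical caller, not part of this change), a
page-aligned user buffer could be pinned and handed to
abd_alloc_from_pages() like so; the pinning flags and error handling
are simplified:

/*
 * Hypothetical helper: pin a page-aligned user buffer and wrap it in
 * a scatter ABD. A real caller must unpin the pages on a partial
 * failure instead of just returning NULL.
 */
static abd_t *
dio_wrap_user_buffer(unsigned long uaddr, uint64_t size)
{
	long npages = DIV_ROUND_UP(size, PAGE_SIZE);
	struct page **pages;

	/* Direct I/O requires page alignment, so the offset is 0. */
	ASSERT0(uaddr & (PAGE_SIZE - 1));

	pages = vmem_alloc(npages * sizeof (struct page *), KM_SLEEP);
	if (get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages) != npages)
		return (NULL);

	/* The pages are pinned but never kmap()'d by the caller. */
	return (abd_alloc_from_pages(pages, 0, size));
}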
@@ -746,6 +819,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
ABD_SCATTER(abd).abd_offset = new_offset;
ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+ if (abd_is_from_pages(sabd))
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES;
+
return (abd);
}
@@ -874,6 +950,115 @@ abd_cache_reap_now(void)
}
/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, 0);
+ /*
+	 * In the event the ABD is composed of a single user page from Direct
+	 * I/O we can not directly return the raw buffer. This is a consequence
+	 * of not being able to write protect the page: its contents could be
+	 * changed by the user at any time.
+ */
+ if (abd_is_from_pages(abd)) {
+ buf = zio_buf_alloc(n);
+ } else if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+#endif
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+
+ /*
+ * In the event the ABD is composed of a single user page from Direct
+ * I/O we must copy the data into the newly allocated buffer. This is
+ * a consequence of the fact that we can not write protect the user
+ * page, so there is a risk the contents of the page could be changed
+ * by the user at any moment.
+ */
+ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD. If you want any changes you made to
+ * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
+ * ABD is not constructed from user pages for Direct I/O, an ASSERT checks
+ * that the contents of the buffer have not changed since it was borrowed.
+ * We can not make that assertion for an ABD composed of user pages, because
+ * the pages can not be placed under write protection and the user could
+ * have changed their contents at any time.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
+ if (abd_is_from_pages(abd)) {
+ zio_buf_free(buf, n);
+ } else if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else if (abd_is_gang(abd)) {
+#ifdef ZFS_DEBUG
+ /*
+	 * With a gang ABD we must be careful not to ASSERT0 on any
+	 * child ABD that contains user pages from Direct I/O. To
+	 * handle this, iterate through the gang ABD and only verify
+	 * the children that are not built from user pages.
+ */
+ void *cmp_buf = buf;
+
+ for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ if (!abd_is_from_pages(cabd)) {
+ ASSERT0(abd_cmp_buf(cabd, cmp_buf,
+ cabd->abd_size));
+ }
+ cmp_buf = (char *)cmp_buf + cabd->abd_size;
+ }
+#endif
+ zio_buf_free(buf, n);
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
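A minimal sketch of the intended borrow/return round trip (the
memset() stands in for arbitrary work on the borrowed buffer):

static void
zero_abd_via_borrow(abd_t *abd, size_t size)
{
	/*
	 * Borrow a contiguous view of the ABD. For an ABD backed by
	 * user pages this is always a private zio_buf copy, so the
	 * user can not race with the modification below.
	 */
	void *buf = abd_borrow_buf_copy(abd, size);

	memset(buf, 0, size);

	/* Copy the modification back and release the borrowed buffer. */
	abd_return_buf_copy(abd, buf, size);
}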
+/*
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
* It yields the next page struct and data offset and size within it, without
* mapping it into the address space.
diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c
index ce623ef9d..ce197caa4 100644
--- a/module/os/linux/zfs/zfs_racct.c
+++ b/module/os/linux/zfs/zfs_racct.c
@@ -25,14 +25,35 @@
#include <sys/zfs_racct.h>
+#ifdef _KERNEL
+#include <linux/task_io_accounting_ops.h>
+
+void
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ task_io_account_read(size);
+ spa_iostats_read_add(spa, size, iops, flags);
+}
+
void
-zfs_racct_read(uint64_t size, uint64_t iops)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
- (void) size, (void) iops;
+ task_io_account_write(size);
+ spa_iostats_write_add(spa, size, iops, flags);
}
+#else
+
void
-zfs_racct_write(uint64_t size, uint64_t iops)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
- (void) size, (void) iops;
+ (void) spa, (void) size, (void) iops, (void) flags;
}
+
+void
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ (void) spa, (void) size, (void) iops, (void) flags;
+}
+
+#endif /* _KERNEL */
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
index a99a1ba88..637f968f8 100644
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -41,12 +41,19 @@
#ifdef _KERNEL
+#include <sys/errno.h>
+#include <sys/vmem.h>
+#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/sysmacros.h>
#include <sys/string.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove);
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
- if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
- /* There's never a need to fault in kernel pages */
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
+ (uio->uio_extflg & UIO_DIRECT)) {
+ /*
+ * There's never a need to fault in kernel pages or Direct I/O
+	 * write pages. Direct I/O write pages have already been pinned,
+	 * so a fault can never occur for them.
+ */
return (0);
#if defined(HAVE_VFS_IOV_ITER)
} else if (uio->uio_segflg == UIO_ITER) {
@@ -437,9 +449,288 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
uio->uio_iovcnt--;
}
}
+
uio->uio_loffset += n;
uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);
+/*
+ * Check if the uio is page-aligned in memory.
+ */
+boolean_t
+zfs_uio_page_aligned(zfs_uio_t *uio)
+{
+ boolean_t aligned = B_TRUE;
+
+ if (uio->uio_segflg == UIO_USERSPACE ||
+ uio->uio_segflg == UIO_SYSSPACE) {
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+
+ for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
+ uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
+ size_t size = iov->iov_len - skip;
+ if ((addr & (PAGE_SIZE - 1)) ||
+ (size & (PAGE_SIZE - 1))) {
+ aligned = B_FALSE;
+ break;
+ }
+ skip = 0;
+ }
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ unsigned long alignment =
+ iov_iter_alignment(uio->uio_iter);
+ aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
+#endif
+ } else {
+ /* Currently not supported */
+ aligned = B_FALSE;
+ }
+
+ return (aligned);
+}
+
+
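For example, with 4 KiB pages the test above reduces to masking with
PAGE_SIZE - 1; an equivalent single-iovec form of the same check
(illustrative only, not part of this change):

static boolean_t
iovec_is_page_aligned(const struct iovec *iov)
{
	uintptr_t addr = (uintptr_t)iov->iov_base;

	/*
	 * With PAGE_SIZE == 4096 the mask is 0xfff: a base of 0x10000
	 * with length 0x1000 passes, while a base of 0x10200 or a
	 * 512-byte length fails.
	 */
	return (((addr | iov->iov_len) & (PAGE_SIZE - 1)) == 0);
}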
+#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
+#define ZFS_MARKED_PAGE 0x0
+#define IS_ZFS_MARKED_PAGE(_p) 0
+#define zfs_mark_page(_p)
+#define zfs_unmark_page(_p)
+#define IS_ZERO_PAGE(_p) 0
+
+#else
+/*
+ * Mark pages to know if they were allocated to replace ZERO_PAGE() for
+ * Direct I/O writes.
+ */
+#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */
+#define IS_ZFS_MARKED_PAGE(_p) \
+ (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
+#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))
+
+static inline void
+zfs_mark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ZFS_MARKED_PAGE);
+}
+
+static inline void
+zfs_unmark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
+
+static void
+zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
+{
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+ lock_page(p);
+
+ if (IS_ZERO_PAGE(p)) {
+ /*
+			 * If the user page points to the kernel's ZERO_PAGE(),
+			 * a new zero-filled page is allocated in its place so
+			 * the contents of the page can not be changed by the
+			 * user while a Direct I/O write is taking place.
+ */
+ gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
+ __GFP_ZERO | GFP_KERNEL;
+
+ ASSERT0(IS_ZFS_MARKED_PAGE(p));
+ unlock_page(p);
+ put_page(p);
+
+			uio->uio_dio.pages[i] =
+			    __page_cache_alloc(gfp_zero_page);
+			zfs_mark_page(uio->uio_dio.pages[i]);
+ } else {
+ unlock_page(p);
+ }
+ }
+}
+
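A hypothetical userspace program that exercises this path: every page
of an untouched anonymous mapping resolves to the kernel's shared
ZERO_PAGE(), so an O_DIRECT write from such a buffer pins the zero
page itself (the file path is illustrative):

#define _GNU_SOURCE	/* O_DIRECT */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 1 << 20;
	/* Never written, so read faults map every page to ZERO_PAGE(). */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("/tank/fs/file", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	ssize_t n = write(fd, buf, len);

	close(fd);
	munmap(buf, len);
	return (n == (ssize_t)len ? 0 : 1);
}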
+void
+zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+
+ if (IS_ZFS_MARKED_PAGE(p)) {
+ zfs_unmark_page(p);
+ __free_page(p);
+ continue;
+ }
+
+ put_page(p);
+ }
+
+ vmem_free(uio->uio_dio.pages,
+ uio->uio_dio.npages * sizeof (struct page *));
+}
+
+/*
+ * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's
+ * iov_iter_get_pages().
+ */
+static int
+zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ long *numpages)
+{
+ unsigned long addr = (unsigned long)(v.iov_base);
+ size_t len = v.iov_len;
+ unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ long res = zfs_get_user_pages(
+ P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ,
+ &uio->uio_dio.pages[uio->uio_dio.npages]);
+ if (res < 0) {
+ return (SET_ERROR(-res));
+ } else if (len != (res * PAGE_SIZE)) {
+ return (SET_ERROR(EFAULT));
+ }
+
+ ASSERT3S(len, ==, res * PAGE_SIZE);
+ *numpages = res;
+ return (0);
+}
+
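A worked example of the arithmetic above (values are illustrative and
assume 4 KiB pages):

static void
iov_step_arith_example(void)
{
	unsigned long addr = 0x7f0000002000UL;	/* page-aligned base */
	size_t len = 8192;			/* two pages */
	unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);

	/* A page-aligned 8 KiB iovec pins exactly two whole pages. */
	ASSERT3U(P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), ==, addr);
	ASSERT3U(n, ==, 2);

	/*
	 * If zfs_get_user_pages() pins only one page (res == 1), then
	 * len != res * PAGE_SIZE and the step fails with EFAULT,
	 * leaving the caller to release the page that was pinned.
	 */
}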
+static int
+zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ const struct iovec *iovp = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ size_t len = uio->uio_resid - skip;
+
+ ASSERT(uio->uio_segflg != UIO_SYSSPACE);
+
+ for (int i = 0; i < uio->uio_iovcnt; i++) {
+ struct iovec iov;
+ long numpages = 0;
+
+ if (iovp->iov_len == 0) {
+ iovp++;
+ skip = 0;
+ continue;
+ }
+ iov.iov_len = MIN(len, iovp->iov_len - skip);
+ iov.iov_base = iovp->iov_base + skip;
+ int error = zfs_uio_iov_step(iov, rw, uio, &numpages);
+
+ if (error)
+ return (error);
+
+ uio->uio_dio.npages += numpages;
+ len -= iov.iov_len;
+ skip = 0;
+ iovp++;
+ }
+
+ ASSERT0(len);
+
+ return (0);
+}
+
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ size_t skip = uio->uio_skip;
+ size_t wanted = uio->uio_resid - uio->uio_skip;
+ ssize_t rollback = 0;
+ ssize_t cnt;
+ unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
+
+ while (wanted) {
+#if defined(HAVE_IOV_ITER_GET_PAGES2)
+ cnt = iov_iter_get_pages2(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &skip);
+#else
+ cnt = iov_iter_get_pages(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &skip);
+#endif
+ if (cnt < 0) {
+ iov_iter_revert(uio->uio_iter, rollback);
+ return (SET_ERROR(-cnt));
+ }
+ uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
+ rollback += cnt;
+ wanted -= cnt;
+ skip = 0;
+#if !defined(HAVE_IOV_ITER_GET_PAGES2)
+ /*
+		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does
+		 * not advance the iov_iter on success, so advance it here.
+ */
+ iov_iter_advance(uio->uio_iter, cnt);
+#endif
+
+ }
+ ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
+ iov_iter_revert(uio->uio_iter, rollback);
+
+ return (0);
+}
+#endif /* HAVE_VFS_IOV_ITER */
+
+/*
+ * This function pins user pages. If the pages can not all be pinned
+ * successfully, an error value is returned.
+ *
+ * On success, 0 is returned.
+ */
+int
+zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ int error = 0;
+ long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
+ size_t size = npages * sizeof (struct page *);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+ error = zfs_uio_get_dio_pages_iov(uio, rw);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+ error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
+#endif
+ } else {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ ASSERT3S(uio->uio_dio.npages, >=, 0);
+
+ if (error) {
+ for (long i = 0; i < uio->uio_dio.npages; i++)
+ put_page(uio->uio_dio.pages[i]);
+ vmem_free(uio->uio_dio.pages, size);
+ return (error);
+ } else {
+ ASSERT3S(uio->uio_dio.npages, ==, npages);
+ }
+
+ if (rw == UIO_WRITE) {
+ zfs_uio_dio_check_for_zero_page(uio);
+ }
+
+ uio->uio_extflg |= UIO_DIRECT;
+
+ return (0);
+}
+
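Putting the pieces together, a Direct I/O request would drive the
pin/unpin lifecycle roughly as follows (a sketch only; the actual
dispatch into the DMU read/write paths is elided):

static int
dio_issue(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	/* Pin the user pages and flag the uio as Direct I/O. */
	int error = zfs_uio_get_dio_pages_alloc(uio, rw);
	if (error)
		return (error);

	/* ... wrap uio->uio_dio.pages in an ABD and issue the I/O ... */

	/* Unpin the pages (and free any substituted zero pages). */
	zfs_uio_free_dio_pages(uio, rw);
	return (0);
}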
#endif /* _KERNEL */
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index a52f08868..22a4ad1ef 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -59,6 +59,7 @@
#include <sys/objlist.h>
#include <sys/zpl.h>
#include <linux/vfs_compat.h>
+#include <linux/fs.h>
#include "zfs_comutil.h"
enum {
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 9803c7fec..77e59a3ba 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -296,6 +296,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
+
/*
* If filemap_fault() retries there exists a window
* where the page will be unlocked and not up to date.
@@ -3866,7 +3867,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
}
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- for_sync ? zfs_putpage_sync_commit_cb :
+ B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
zfs_putpage_async_commit_cb, pp);
dmu_tx_commit(tx);
@@ -4009,6 +4010,7 @@ zfs_inactive(struct inode *ip)
static int
zfs_fillpage(struct inode *ip, struct page *pp)
{
+ znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
@@ -4020,7 +4022,7 @@ zfs_fillpage(struct inode *ip, struct page *pp)
io_len = i_size - io_off;
void *va = kmap(pp);
- int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+ int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
io_len, va, DMU_READ_PREFETCH);
if (io_len != PAGE_SIZE)
memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
@@ -4058,11 +4060,49 @@ zfs_getpage(struct inode *ip, struct page *pp)
zfsvfs_t *zfsvfs = ITOZSB(ip);
znode_t *zp = ITOZ(ip);
int error;
+ loff_t i_size = i_size_read(ip);
+ u_offset_t io_off = page_offset(pp);
+ size_t io_len = PAGE_SIZE;
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
+ ASSERT3U(io_off, <, i_size);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
+
+ /*
+	 * It is important to hold the rangelock here because a Direct I/O
+	 * write or block clone may be taking place at the same time that a
+	 * page is being faulted in through filemap_fault(). With Direct I/O
+	 * writes and block cloning, db->db_data will be set to NULL with
+	 * dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the rangelock
+	 * is not held, there is a race between faulting in a page and writing
+	 * out a Direct I/O write or block clone, and a NULL pointer
+	 * dereference can occur in dmu_read_impl() on db->db_data during the
+	 * memcpy operation when zfs_fillpage() calls dmu_read().
+ */
+ zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+ io_off, io_len, RL_READER);
+ if (lr == NULL) {
+ /*
+ * It is important to drop the page lock before grabbing the
+		 * rangelock to avoid a deadlock between here and
+ * zfs_write() -> update_pages(). update_pages() holds both the
+ * rangelock and the page lock.
+ */
+ get_page(pp);
+ unlock_page(pp);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
+ io_len, RL_READER);
+ lock_page(pp);
+ put_page(pp);
+ }
error = zfs_fillpage(ip, pp);
+ zfs_rangelock_exit(lr);
+
if (error == 0)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
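The same back-off pattern in miniature, using pthreads for
illustration (not code from this change): one path holds the page
lock and needs the rangelock, while update_pages() takes the
rangelock first and then the page lock.

#include <pthread.h>

static pthread_mutex_t page_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t range_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Called with page_mtx held; returns with both locks held. */
static void
take_range_with_page_held(void)
{
	if (pthread_mutex_trylock(&range_mtx) != 0) {
		/* Back off so the opposite lock order can make progress. */
		pthread_mutex_unlock(&page_mtx);
		pthread_mutex_lock(&range_mtx);	/* safe to block now */
		pthread_mutex_lock(&page_mtx);
	}
}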
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 9dec52215..6b16faa2b 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -322,14 +322,14 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t read = count - uio.uio_resid;
kiocb->ki_pos += read;
@@ -384,14 +384,14 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_write(ITOZ(ip), &uio,
+ ret = -zfs_write(ITOZ(ip), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;
@@ -422,14 +422,14 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
- filp->f_flags | zfs_io_flags(kiocb), cr);
+ ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+	    filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t read = count - uio.uio_resid;
kiocb->ki_pos += read;
@@ -467,53 +467,57 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_write(ITOZ(ip), &uio,
+ ret = -zfs_write(ITOZ(ip), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;
return (wrote);
}
+
#endif /* HAVE_VFS_RW_ITERATE */
-#if defined(HAVE_VFS_RW_ITERATE)
static ssize_t
-zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+zpl_direct_IO_impl(void)
{
- if (rw == WRITE)
- return (zpl_iter_write(kiocb, iter));
- else
- return (zpl_iter_read(kiocb, iter));
+ /*
+ * All O_DIRECT requests should be handled by
+	 * zpl_{iter/aio}_{write/read}(). The generic kernel code should never
+	 * call the direct_IO address_space_operations function, so this code
+	 * path is made fatal if it is ever executed.
+ */
+ PANIC(0);
+ return (0);
}
+
+#if defined(HAVE_VFS_RW_ITERATE)
#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(rw, kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#else
-#error "Unknown direct IO interface"
+#error "Unknown Direct I/O interface"
#endif
#else /* HAVE_VFS_RW_ITERATE */
@@ -523,26 +527,16 @@ static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
loff_t pos, unsigned long nr_segs)
{
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iov, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iov, nr_segs, pos));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- const struct iovec *iovp = iov_iter_iovec(iter);
- unsigned long nr_segs = iter->nr_segs;
-
- ASSERT3S(pos, ==, kiocb->ki_pos);
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+ return (zpl_direct_IO_impl());
}
#else
-#error "Unknown direct IO interface"
+#error "Unknown Direct I/O interface"
#endif
#endif /* HAVE_VFS_RW_ITERATE */
@@ -627,6 +621,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
(size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
spl_fstrans_unmark(cookie);
+
if (error)
return (error);