Diffstat (limited to 'module')
-rw-r--r--  module/Kbuild.in                         1
-rw-r--r--  module/Makefile.bsd                      1
-rw-r--r--  module/os/freebsd/spl/spl_uio.c        201
-rw-r--r--  module/os/freebsd/zfs/abd_os.c         169
-rw-r--r--  module/os/freebsd/zfs/zfs_racct.c        8
-rw-r--r--  module/os/freebsd/zfs/zfs_vnops_os.c    35
-rw-r--r--  module/os/freebsd/zfs/zvol_os.c          1
-rw-r--r--  module/os/linux/zfs/abd_os.c           211
-rw-r--r--  module/os/linux/zfs/zfs_racct.c         29
-rw-r--r--  module/os/linux/zfs/zfs_uio.c          295
-rw-r--r--  module/os/linux/zfs/zfs_vfsops.c         1
-rw-r--r--  module/os/linux/zfs/zfs_vnops_os.c      44
-rw-r--r--  module/os/linux/zfs/zpl_file.c          71
-rw-r--r--  module/zcommon/zfs_prop.c               11
-rw-r--r--  module/zcommon/zfs_valstr.c              2
-rw-r--r--  module/zfs/abd.c                        94
-rw-r--r--  module/zfs/arc.c                         2
-rw-r--r--  module/zfs/dataset_kstats.c              6
-rw-r--r--  module/zfs/dbuf.c                      317
-rw-r--r--  module/zfs/dmu.c                       154
-rw-r--r--  module/zfs/dmu_direct.c                395
-rw-r--r--  module/zfs/dmu_objset.c                 19
-rw-r--r--  module/zfs/spa_stats.c                  46
-rw-r--r--  module/zfs/vdev.c                       27
-rw-r--r--  module/zfs/vdev_label.c                  4
-rw-r--r--  module/zfs/zfs_fm.c                      2
-rw-r--r--  module/zfs/zfs_ioctl.c                   1
-rw-r--r--  module/zfs/zfs_log.c                     4
-rw-r--r--  module/zfs/zfs_vnops.c                 294
-rw-r--r--  module/zfs/zio.c                       113
30 files changed, 2256 insertions, 302 deletions
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 0472a9348..d96347bad 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -327,6 +327,7 @@ ZFS_OBJS := \
ddt_stats.o \
ddt_zap.o \
dmu.o \
+ dmu_direct.o \
dmu_diff.o \
dmu_object.o \
dmu_objset.o \
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 9161204c9..188f5ad2d 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -257,6 +257,7 @@ SRCS+= abd.c \
ddt_stats.c \
ddt_zap.c \
dmu.c \
+ dmu_direct.c \
dmu_diff.c \
dmu_object.c \
dmu_objset.c \
diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c
index 17886cbeb..74cbe36bb 100644
--- a/module/os/freebsd/spl/spl_uio.c
+++ b/module/os/freebsd/spl/spl_uio.c
@@ -44,6 +44,10 @@
#include <sys/uio_impl.h>
#include <sys/vnode.h>
#include <sys/zfs_znode.h>
+#include <sys/byteorder.h>
+#include <sys/lock.h>
+#include <sys/vm.h>
+#include <vm/vm_map.h>
static void
zfs_freeuio(struct uio *uio)
@@ -115,3 +119,200 @@ zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio)
ASSERT3U(zfs_uio_rw(uio), ==, dir);
return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio)));
}
+
+/*
+ * Check if the uio is page-aligned in memory.
+ */
+boolean_t
+zfs_uio_page_aligned(zfs_uio_t *uio)
+{
+ const struct iovec *iov = GET_UIO_STRUCT(uio)->uio_iov;
+
+ for (int i = zfs_uio_iovcnt(uio); i > 0; iov++, i--) {
+ uintptr_t addr = (uintptr_t)iov->iov_base;
+ size_t size = iov->iov_len;
+ if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void
+zfs_uio_set_pages_to_stable(zfs_uio_t *uio)
+{
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+ ASSERT3S(uio->uio_dio.npages, >, 0);
+
+ for (int i = 0; i < uio->uio_dio.npages; i++) {
+ vm_page_t page = uio->uio_dio.pages[i];
+ ASSERT3P(page, !=, NULL);
+
+ MPASS(page == PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(page)));
+ vm_page_busy_acquire(page, VM_ALLOC_SBUSY);
+ pmap_remove_write(page);
+ }
+}
+
+static void
+zfs_uio_release_stable_pages(zfs_uio_t *uio)
+{
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+ for (int i = 0; i < uio->uio_dio.npages; i++) {
+ vm_page_t page = uio->uio_dio.pages[i];
+
+ ASSERT3P(page, !=, NULL);
+ vm_page_sunbusy(page);
+ }
+}
+
+/*
+ * If the operation is marked as a read, then the pages will be written
+ * to and must be given write access.
+ */
+static int
+zfs_uio_hold_pages(unsigned long start, size_t len, int nr_pages,
+ zfs_uio_rw_t rw, vm_page_t *pages)
+{
+ vm_map_t map;
+ vm_prot_t prot;
+ int count;
+
+ map = &curthread->td_proc->p_vmspace->vm_map;
+ ASSERT3S(len, >, 0);
+
+ prot = rw == UIO_READ ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
+ count = vm_fault_quick_hold_pages(map, start, len, prot, pages,
+ nr_pages);
+
+ return (count);
+}
+
+void
+zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+ ASSERT(zfs_uio_rw(uio) == rw);
+
+ if (rw == UIO_WRITE)
+ zfs_uio_release_stable_pages(uio);
+
+ vm_page_unhold_pages(&uio->uio_dio.pages[0],
+ uio->uio_dio.npages);
+
+ kmem_free(uio->uio_dio.pages,
+ uio->uio_dio.npages * sizeof (vm_page_t));
+}
+
+static int
+zfs_uio_get_user_pages(unsigned long start, int nr_pages,
+ size_t len, zfs_uio_rw_t rw, vm_page_t *pages)
+{
+ int count;
+
+ count = zfs_uio_hold_pages(start, len, nr_pages, rw, pages);
+
+ if (count != nr_pages) {
+ if (count > 0)
+ vm_page_unhold_pages(pages, count);
+ return (0);
+ }
+
+ ASSERT3S(count, ==, nr_pages);
+
+ return (count);
+}
+
+static int
+zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages)
+{
+ unsigned long addr = (unsigned long)(v.iov_base);
+ size_t len = v.iov_len;
+ int n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ int res = zfs_uio_get_user_pages(
+ P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, len,
+ zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]);
+
+ if (res != n)
+ return (SET_ERROR(EFAULT));
+
+ ASSERT3U(len, ==, res * PAGE_SIZE);
+ *numpages = res;
+ return (0);
+}
+
+static int
+zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
+{
+ const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov;
+ size_t len = zfs_uio_resid(uio);
+
+ for (int i = 0; i < zfs_uio_iovcnt(uio); i++) {
+ struct iovec iov;
+ int numpages = 0;
+
+ if (iovp->iov_len == 0) {
+ iovp++;
+ continue;
+ }
+ iov.iov_len = MIN(len, iovp->iov_len);
+ iov.iov_base = iovp->iov_base;
+ int error = zfs_uio_iov_step(iov, uio, &numpages);
+
+ if (error)
+ return (error);
+
+ uio->uio_dio.npages += numpages;
+ len -= iov.iov_len;
+ iovp++;
+ }
+
+ ASSERT0(len);
+
+ return (0);
+}
+
+/*
+ * This function holds the user pages in kernel memory. If the user pages
+ * cannot be held, an error value is returned.
+ *
+ * On success, 0 is returned.
+ */
+int
+zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ int error = 0;
+ int npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE);
+ size_t size = npages * sizeof (vm_page_t);
+
+ ASSERT(zfs_uio_rw(uio) == rw);
+
+ uio->uio_dio.pages = kmem_alloc(size, KM_SLEEP);
+
+ error = zfs_uio_get_dio_pages_impl(uio);
+
+ if (error) {
+ vm_page_unhold_pages(&uio->uio_dio.pages[0],
+ uio->uio_dio.npages);
+ kmem_free(uio->uio_dio.pages, size);
+ return (error);
+ }
+
+ ASSERT3S(uio->uio_dio.npages, >, 0);
+
+ /*
+ * Since we will be writing the user pages we must make sure that
+ * they are stable. That way the contents of the pages can not change
+ * while we are doing: compression, checksumming, encryption, parity
+ * calculations or deduplication.
+ */
+ if (zfs_uio_rw(uio) == UIO_WRITE)
+ zfs_uio_set_pages_to_stable(uio);
+
+ uio->uio_extflg |= UIO_DIRECT;
+
+ return (0);
+}
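For illustration, here is a minimal sketch (not part of this patch) of how the new FreeBSD helpers above are intended to fit together: check alignment, pin the pages (write pages are additionally made stable), issue the I/O, then release the pages. dio_issue_write() is a hypothetical placeholder for the real DMU write path.

/*
 * Illustrative sketch only: the expected calling sequence for the
 * Direct I/O uio helpers added above. dio_issue_write() is a
 * hypothetical stand-in for the actual DMU write path.
 */
static int
dio_write_sketch(zfs_uio_t *uio)
{
	int error;

	/* Direct I/O requires page-aligned iovec bases and lengths. */
	if (!zfs_uio_page_aligned(uio))
		return (SET_ERROR(EINVAL));

	/*
	 * Pin the user pages; for UIO_WRITE they are also marked stable
	 * (shared-busied and write protected) so their contents cannot
	 * change while checksums or parity are computed.
	 */
	error = zfs_uio_get_dio_pages_alloc(uio, UIO_WRITE);
	if (error != 0)
		return (error);

	ASSERT(uio->uio_extflg & UIO_DIRECT);

	error = dio_issue_write(uio);		/* hypothetical */

	/* Drop the stable markings and unhold/free the page array. */
	zfs_uio_free_dio_pages(uio, UIO_WRITE);

	return (error);
}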
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index f24ea3dc7..f20dc5d8c 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -32,6 +32,7 @@
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
+#include <sys/vm.h>
typedef struct abd_stats {
kstat_named_t abdstat_struct_size;
@@ -135,7 +136,9 @@ abd_size_alloc_linear(size_t size)
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
- uint_t n = abd_scatter_chunkcnt(abd);
+ uint_t n;
+
+ n = abd_scatter_chunkcnt(abd);
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
int waste = (n << PAGE_SHIFT) - abd->abd_size;
if (op == ABDSTAT_INCR) {
@@ -198,10 +201,16 @@ abd_free_chunks(abd_t *abd)
{
uint_t i, n;
- n = abd_scatter_chunkcnt(abd);
- for (i = 0; i < n; i++) {
- kmem_cache_free(abd_chunk_cache,
- ABD_SCATTER(abd).abd_chunks[i]);
+ /*
+ * Scatter ABDs may be constructed by abd_alloc_from_pages() from
+ * an array of pages, in which case the pages should not be freed.
+ */
+ if (!abd_is_from_pages(abd)) {
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
+ kmem_cache_free(abd_chunk_cache,
+ ABD_SCATTER(abd).abd_chunks[i]);
+ }
}
}
@@ -342,11 +351,8 @@ abd_fini(void)
void
abd_free_linear_page(abd_t *abd)
{
- /*
- * FreeBSD does not have scatter linear pages
- * so there is an error.
- */
- VERIFY(0);
+ ASSERT3P(abd->abd_u.abd_linear.sf, !=, NULL);
+ zfs_unmap_page(abd->abd_u.abd_linear.sf);
}
/*
@@ -365,6 +371,26 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
return (abd_alloc_linear(size, is_metadata));
}
+static abd_t *
+abd_get_offset_from_pages(abd_t *abd, abd_t *sabd, size_t chunkcnt,
+ size_t new_offset)
+{
+ ASSERT(abd_is_from_pages(sabd));
+
+ /*
+ * Set the child chunks to point at the parent chunks, as
+ * the chunks are just pages and we don't want to copy them.
+ */
+ size_t parent_offset = new_offset / PAGE_SIZE;
+ ASSERT3U(parent_offset, <, abd_scatter_chunkcnt(sabd));
+ for (int i = 0; i < chunkcnt; i++)
+ ABD_SCATTER(abd).abd_chunks[i] =
+ ABD_SCATTER(sabd).abd_chunks[parent_offset + i];
+
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES;
+ return (abd);
+}
+
abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
size_t size)
@@ -399,6 +425,11 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;
+ if (abd_is_from_pages(sabd)) {
+ return (abd_get_offset_from_pages(abd, sabd, chunkcnt,
+ new_offset));
+ }
+
/* Copy the scatterlist starting at the correct offset */
(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
&ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
@@ -408,6 +439,44 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
}
/*
+ * Allocate a scatter ABD structure from user pages.
+ */
+abd_t *
+abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size)
+{
+ VERIFY3U(size, <=, DMU_MAX_ACCESS);
+ ASSERT3U(offset, <, PAGE_SIZE);
+ ASSERT3P(pages, !=, NULL);
+
+ abd_t *abd = abd_alloc_struct(size);
+ abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
+ abd->abd_size = size;
+
+ if ((offset + size) <= PAGE_SIZE) {
+ /*
+ * There is only a single page worth of data, so we will just
+ * use a linear ABD. We have to make sure to take into account
+ * the offset though. In all other cases our offset will be 0
+ * as we are always PAGE_SIZE aligned.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
+ ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0],
+ &abd->abd_u.abd_linear.sf) + offset;
+ } else {
+ ABD_SCATTER(abd).abd_offset = offset;
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+ /*
+ * Setting the ABD's abd_chunks to point to the user pages.
+ */
+ for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++)
+ ABD_SCATTER(abd).abd_chunks[i] = pages[i];
+ }
+
+ return (abd);
+}
+
+/*
* Initialize the abd_iter.
*/
void
@@ -468,6 +537,16 @@ abd_iter_map(struct abd_iter *aiter)
if (abd_is_linear(abd)) {
aiter->iter_mapsize = abd->abd_size - offset;
paddr = ABD_LINEAR_BUF(abd);
+ } else if (abd_is_from_pages(abd)) {
+ aiter->sf = NULL;
+ offset += ABD_SCATTER(abd).abd_offset;
+ size_t index = offset / PAGE_SIZE;
+ offset &= PAGE_MASK;
+ aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
+ abd->abd_size - aiter->iter_pos);
+ paddr = zfs_map_page(
+ ABD_SCATTER(aiter->iter_abd).abd_chunks[index],
+ &aiter->sf);
} else {
offset += ABD_SCATTER(abd).abd_offset;
paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
@@ -490,6 +569,12 @@ abd_iter_unmap(struct abd_iter *aiter)
ASSERT3U(aiter->iter_mapsize, >, 0);
}
+ if (abd_is_from_pages(aiter->iter_abd) &&
+ !abd_is_linear_page(aiter->iter_abd)) {
+ ASSERT3P(aiter->sf, !=, NULL);
+ zfs_unmap_page(aiter->sf);
+ }
+
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
@@ -499,3 +584,67 @@ abd_cache_reap_now(void)
{
kmem_cache_reap_soon(abd_chunk_cache);
}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, 0);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+#endif
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
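A brief sketch (again illustrative, not taken from the patch) of how pinned user pages end up wrapped in an ABD: a request that fits in one page becomes a mapped linear ABD that honors the sub-page offset, while anything larger becomes a scatter ABD whose chunks alias the user pages directly.

/*
 * Illustrative sketch only: wrap pages previously held by
 * zfs_uio_get_dio_pages_alloc() in an ABD. Direct I/O requests are
 * page aligned, so the offset passed here is 0.
 */
static abd_t *
dio_pages_to_abd(zfs_uio_t *uio, uint64_t size)
{
	abd_t *abd = abd_alloc_from_pages(uio->uio_dio.pages, 0, size);

	/*
	 * Consumers that need a stable kernel copy of this data use
	 * abd_borrow_buf_copy()/abd_return_buf(); for a scatter ABD that
	 * bounces through a zio buffer instead of touching the pages.
	 */
	return (abd);
}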
diff --git a/module/os/freebsd/zfs/zfs_racct.c b/module/os/freebsd/zfs/zfs_racct.c
index 883255bc1..2989a9af9 100644
--- a/module/os/freebsd/zfs/zfs_racct.c
+++ b/module/os/freebsd/zfs/zfs_racct.c
@@ -27,7 +27,7 @@
#include <sys/racct.h>
void
-zfs_racct_read(uint64_t size, uint64_t iops)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
curthread->td_ru.ru_inblock += iops;
#ifdef RACCT
@@ -40,10 +40,12 @@ zfs_racct_read(uint64_t size, uint64_t iops)
#else
(void) size;
#endif /* RACCT */
+
+ spa_iostats_read_add(spa, size, iops, flags);
}
void
-zfs_racct_write(uint64_t size, uint64_t iops)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
curthread->td_ru.ru_oublock += iops;
#ifdef RACCT
@@ -56,4 +58,6 @@ zfs_racct_write(uint64_t size, uint64_t iops)
#else
(void) size;
#endif /* RACCT */
+
+ spa_iostats_write_add(spa, size, iops, flags);
}
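The zfs_racct_*() interface now takes the spa and a zio flags word so that, in addition to per-thread accounting, each I/O is folded into the pool-wide iostat counters (where Direct I/O can be classified). A hedged example of the new call shape, mirroring the arc.c hunk later in this diff where a plain ARC read passes 0 for the flags:

/*
 * Illustrative only: account a single 128K read and write against both
 * the calling thread and the pool-wide iostat counters. A flags value
 * of 0 means the I/O is not classified as Direct I/O; real callers pass
 * through whatever the zio layer gave them.
 */
static void
racct_example(spa_t *spa)
{
	zfs_racct_read(spa, 131072, 1, 0);
	zfs_racct_write(spa, 131072, 1, 0);
}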
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 01b964f98..5dbca10a3 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -4131,7 +4131,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
* but that would make the locking messier
*/
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
- len, commit, NULL, NULL);
+ len, commit, B_FALSE, NULL, NULL);
zfs_vmobject_wlock(object);
for (i = 0; i < ncount; i++) {
@@ -4266,6 +4266,8 @@ ioflags(int ioflags)
flags |= O_APPEND;
if (ioflags & IO_NDELAY)
flags |= O_NONBLOCK;
+ if (ioflags & IO_DIRECT)
+ flags |= O_DIRECT;
if (ioflags & IO_SYNC)
flags |= O_SYNC;
@@ -4285,9 +4287,36 @@ static int
zfs_freebsd_read(struct vop_read_args *ap)
{
zfs_uio_t uio;
+ int error = 0;
zfs_uio_init(&uio, ap->a_uio);
- return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
- ap->a_cred));
+ error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
+ ap->a_cred);
+ /*
+ * XXX We occasionally get an EFAULT for Direct I/O reads on
+ * FreeBSD 13. This still needs to be resolved. The EFAULT comes
+ * from:
+ * zfs_uio_get_dio_pages_alloc() ->
+ * zfs_uio_get_dio_pages_impl() ->
+ * zfs_uio_iov_step() ->
+ * zfs_uio_get_user_pages().
+ * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O
+ * read fails to map in the user pages (returning EFAULT) the
+ * Direct I/O request is broken up into two separate IO requests
+ * and issued separately using Direct I/O.
+ */
+#ifdef ZFS_DEBUG
+ if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) {
+#if 0
+ printf("%s(%d): Direct I/O read returning EFAULT "
+ "uio = %p, zfs_uio_offset(uio) = %lu "
+ "zfs_uio_resid(uio) = %lu\n",
+ __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio));
+#endif
+ }
+
+#endif
+ return (error);
}
#ifndef _SYS_SYSPROTO_H_
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index ddb20b031..c3be4730d 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -922,6 +922,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
if (commit)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
+
return (error);
}
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 60287ccdd..dae4107e0 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -186,6 +186,7 @@ static int zfs_abd_scatter_min_size = 512 * 3;
abd_t *abd_zero_scatter = NULL;
struct page;
+
/*
* abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
* point to ZERO_PAGE if it is available or it will be an allocated zero'd
@@ -453,14 +454,21 @@ abd_free_chunks(abd_t *abd)
if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
- abd_for_each_sg(abd, sg, nr_pages, i) {
- page = sg_page(sg);
- abd_unmark_zfs_page(page);
- order = compound_order(page);
- __free_pages(page, order);
- ASSERT3U(sg->length, <=, PAGE_SIZE << order);
- ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ /*
+ * Scatter ABDs may be constructed by abd_alloc_from_pages() from
+ * an array of pages, in which case the pages should not be freed.
+ */
+ if (!abd_is_from_pages(abd)) {
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ page = sg_page(sg);
+ abd_unmark_zfs_page(page);
+ order = compound_order(page);
+ __free_pages(page, order);
+ ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
}
+
abd_free_sg_table(abd);
}
@@ -551,17 +559,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
void
abd_verify_scatter(abd_t *abd)
{
- size_t n;
- int i = 0;
- struct scatterlist *sg = NULL;
-
ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
ABD_SCATTER(abd).abd_sgl->length);
- n = ABD_SCATTER(abd).abd_nents;
+
+#ifdef ZFS_DEBUG
+ struct scatterlist *sg = NULL;
+ size_t n = ABD_SCATTER(abd).abd_nents;
+ int i = 0;
+
abd_for_each_sg(abd, sg, n, i) {
ASSERT3P(sg_page(sg), !=, NULL);
}
+#endif
}
static void
@@ -687,14 +697,77 @@ abd_free_linear_page(abd_t *abd)
{
/* Transform it back into a scatter ABD for freeing */
struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+
+ /* When backed by user page unmap it */
+ if (abd_is_from_pages(abd))
+ zfs_kunmap(sg_page(sg));
+
abd->abd_flags &= ~ABD_FLAG_LINEAR;
abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
ABD_SCATTER(abd).abd_nents = 1;
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_sgl = sg;
abd_free_chunks(abd);
+}
+
+/*
+ * Allocate a scatter ABD structure from user pages. The pages must be
+ * pinned with get_user_pages, or similar, but need not be mapped via
+ * the kmap interfaces.
+ */
+abd_t *
+abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
+{
+ uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct sg_table table;
+
+ VERIFY3U(size, <=, DMU_MAX_ACCESS);
+ ASSERT3U(offset, <, PAGE_SIZE);
+ ASSERT3P(pages, !=, NULL);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd_t *abd = abd_alloc_struct(0);
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
+ abd->abd_size = size;
+
+ while (sg_alloc_table_from_pages(&table, pages, npages, offset,
+ size, __GFP_NOWARN | GFP_NOIO) != 0) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ if ((offset + size) <= PAGE_SIZE) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's constructed
+ * from a user page can be represented this way as long as the
+ * page is mapped to a virtual address. This allows us to
+ * apply an offset in to the mapped page.
+ *
+ * Note that kmap() must be used, not kmap_atomic(), because
+ * the mapping needs to be set up on all CPUs. Using kmap()
+ * also enables the use of highmem pages when required.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ zfs_kmap(sg_page(table.sgl));
+ ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
+ } else {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ ABD_SCATTER(abd).abd_offset = offset;
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+ }
- abd_update_scatter_stats(abd, ABDSTAT_DECR);
+ return (abd);
}
/*
@@ -746,6 +819,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
ABD_SCATTER(abd).abd_offset = new_offset;
ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+ if (abd_is_from_pages(sabd))
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES;
+
return (abd);
}
@@ -874,6 +950,115 @@ abd_cache_reap_now(void)
}
/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, 0);
+ /*
+ * In the event the ABD is composed of a single user page from Direct
+ * I/O we can not directly return the raw buffer. This is a consequence
+ * of not being able to write protect the page; its contents can be
+ * changed at any time by the user.
+ */
+ if (abd_is_from_pages(abd)) {
+ buf = zio_buf_alloc(n);
+ } else if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+#endif
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+
+ /*
+ * In the event the ABD is composed of a single user page from Direct
+ * I/O we must make sure to copy the data over into the newly allocated
+ * buffer. This is a consequence of the fact that we can not write
+ * protect the user page and there is a risk the contents of the page
+ * could be changed by the user at any moment.
+ */
+ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD. If you want any changes you made to
+ * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
+ * ABD is not constructed from user pages for Direct I/O then an ASSERT
+ * checks to make sure the contents of buffer have not changed since it was
+ * borrowed. We can not ASSERT that the contents of the buffer have not changed
+ * if it is composed of user pages because the pages can not be placed under
+ * write protection and the user could have possibly changed the contents in
+ * the pages at any time.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
+ if (abd_is_from_pages(abd)) {
+ zio_buf_free(buf, n);
+ } else if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else if (abd_is_gang(abd)) {
+#ifdef ZFS_DEBUG
+ /*
+ * We have to be careful with gang ABD's that we do not ASSERT0
+ * for any ABD's that contain user pages from Direct I/O. In
+ * order to handle this, we just iterate through the gang ABD
+ * and only verify ABDs that are not from user pages.
+ */
+ void *cmp_buf = buf;
+
+ for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ if (!abd_is_from_pages(cabd)) {
+ ASSERT0(abd_cmp_buf(cabd, cmp_buf,
+ cabd->abd_size));
+ }
+ cmp_buf = (char *)cmp_buf + cabd->abd_size;
+ }
+#endif
+ zio_buf_free(buf, n);
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+/*
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
* It yields the next page struct and data offset and size within it, without
* mapping it into the address space.
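Because user pages backing a Direct I/O request cannot be write protected on Linux, abd_borrow_buf() above never hands back the user mapping for an ABD_FLAG_FROM_PAGES ABD; it always bounces through a freshly allocated zio buffer, and abd_return_buf() skips the "buffer unchanged" assertion for such ABDs. Below is a short sketch of the resulting pattern a consumer would follow when it needs a stable snapshot of the data; compute_checksum() is a hypothetical placeholder.

/*
 * Illustrative sketch only: borrow a stable copy of an ABD that may be
 * backed by user pages, use it, and return it without copying back.
 */
static void
checksum_abd_sketch(abd_t *abd, zio_cksum_t *zcp)
{
	size_t size = abd->abd_size;

	/*
	 * For ABD_FLAG_FROM_PAGES ABDs this allocates a zio buffer and
	 * copies the current page contents into it, so later user-space
	 * stores cannot race with the checksum computation.
	 */
	void *buf = abd_borrow_buf_copy(abd, size);

	compute_checksum(buf, size, zcp);	/* hypothetical */

	/*
	 * No copy-back is wanted; abd_return_buf() frees the bounce
	 * buffer (for ABDs not built from user pages it also asserts
	 * the contents were left unmodified).
	 */
	abd_return_buf(abd, buf, size);
}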
diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c
index ce623ef9d..ce197caa4 100644
--- a/module/os/linux/zfs/zfs_racct.c
+++ b/module/os/linux/zfs/zfs_racct.c
@@ -25,14 +25,35 @@
#include <sys/zfs_racct.h>
+#ifdef _KERNEL
+#include <linux/task_io_accounting_ops.h>
+
+void
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ task_io_account_read(size);
+ spa_iostats_read_add(spa, size, iops, flags);
+}
+
void
-zfs_racct_read(uint64_t size, uint64_t iops)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
- (void) size, (void) iops;
+ task_io_account_write(size);
+ spa_iostats_write_add(spa, size, iops, flags);
}
+#else
+
void
-zfs_racct_write(uint64_t size, uint64_t iops)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
- (void) size, (void) iops;
+ (void) spa, (void) size, (void) iops, (void) flags;
}
+
+void
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ (void) spa, (void) size, (void) iops, (void) flags;
+}
+
+#endif /* _KERNEL */
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
index a99a1ba88..637f968f8 100644
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -41,12 +41,19 @@
#ifdef _KERNEL
+#include <sys/errno.h>
+#include <sys/vmem.h>
+#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/sysmacros.h>
#include <sys/string.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove);
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
- if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
- /* There's never a need to fault in kernel pages */
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
+ (uio->uio_extflg & UIO_DIRECT)) {
+ /*
+ * There's never a need to fault in kernel pages or Direct I/O
+ * write pages. Direct I/O write pages have already been pinned,
+ * so a fault can never occur for them.
+ */
return (0);
#if defined(HAVE_VFS_IOV_ITER)
} else if (uio->uio_segflg == UIO_ITER) {
@@ -437,9 +449,288 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
uio->uio_iovcnt--;
}
}
+
uio->uio_loffset += n;
uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);
+/*
+ * Check if the uio is page-aligned in memory.
+ */
+boolean_t
+zfs_uio_page_aligned(zfs_uio_t *uio)
+{
+ boolean_t aligned = B_TRUE;
+
+ if (uio->uio_segflg == UIO_USERSPACE ||
+ uio->uio_segflg == UIO_SYSSPACE) {
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+
+ for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
+ uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
+ size_t size = iov->iov_len - skip;
+ if ((addr & (PAGE_SIZE - 1)) ||
+ (size & (PAGE_SIZE - 1))) {
+ aligned = B_FALSE;
+ break;
+ }
+ skip = 0;
+ }
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ unsigned long alignment =
+ iov_iter_alignment(uio->uio_iter);
+ aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
+#endif
+ } else {
+ /* Currently not supported */
+ aligned = B_FALSE;
+ }
+
+ return (aligned);
+}
+
+
+#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
+#define ZFS_MARKED_PAGE 0x0
+#define IS_ZFS_MARKED_PAGE(_p) 0
+#define zfs_mark_page(_p)
+#define zfs_unmark_page(_p)
+#define IS_ZERO_PAGE(_p) 0
+
+#else
+/*
+ * Mark pages to know if they were allocated to replace ZERO_PAGE() for
+ * Direct I/O writes.
+ */
+#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */
+#define IS_ZFS_MARKED_PAGE(_p) \
+ (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
+#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))
+
+static inline void
+zfs_mark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ZFS_MARKED_PAGE);
+}
+
+static inline void
+zfs_unmark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
+
+static void
+zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
+{
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+ lock_page(p);
+
+ if (IS_ZERO_PAGE(p)) {
+ /*
+ * If the user page points to the kernel's ZERO_PAGE(), a
+ * new zero-filled page will just be allocated so the
+ * contents of the page can not be changed by the user
+ * while a Direct I/O write is taking place.
+ */
+ gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
+ __GFP_ZERO | GFP_KERNEL;
+
+ ASSERT0(IS_ZFS_MARKED_PAGE(p));
+ unlock_page(p);
+ put_page(p);
+
+ p = __page_cache_alloc(gfp_zero_page);
+ zfs_mark_page(p);
+ } else {
+ unlock_page(p);
+ }
+ }
+}
+
+void
+zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+
+ if (IS_ZFS_MARKED_PAGE(p)) {
+ zfs_unmark_page(p);
+ __free_page(p);
+ continue;
+ }
+
+ put_page(p);
+ }
+
+ vmem_free(uio->uio_dio.pages,
+ uio->uio_dio.npages * sizeof (struct page *));
+}
+
+/*
+ * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's
+ * iov_iter_get_pages().
+ */
+static int
+zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ long *numpages)
+{
+ unsigned long addr = (unsigned long)(v.iov_base);
+ size_t len = v.iov_len;
+ unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ long res = zfs_get_user_pages(
+ P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ,
+ &uio->uio_dio.pages[uio->uio_dio.npages]);
+ if (res < 0) {
+ return (SET_ERROR(-res));
+ } else if (len != (res * PAGE_SIZE)) {
+ return (SET_ERROR(EFAULT));
+ }
+
+ ASSERT3S(len, ==, res * PAGE_SIZE);
+ *numpages = res;
+ return (0);
+}
+
+static int
+zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ const struct iovec *iovp = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ size_t len = uio->uio_resid - skip;
+
+ ASSERT(uio->uio_segflg != UIO_SYSSPACE);
+
+ for (int i = 0; i < uio->uio_iovcnt; i++) {
+ struct iovec iov;
+ long numpages = 0;
+
+ if (iovp->iov_len == 0) {
+ iovp++;
+ skip = 0;
+ continue;
+ }
+ iov.iov_len = MIN(len, iovp->iov_len - skip);
+ iov.iov_base = iovp->iov_base + skip;
+ int error = zfs_uio_iov_step(iov, rw, uio, &numpages);
+
+ if (error)
+ return (error);
+
+ uio->uio_dio.npages += numpages;
+ len -= iov.iov_len;
+ skip = 0;
+ iovp++;
+ }
+
+ ASSERT0(len);
+
+ return (0);
+}
+
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ size_t skip = uio->uio_skip;
+ size_t wanted = uio->uio_resid - uio->uio_skip;
+ ssize_t rollback = 0;
+ ssize_t cnt;
+ unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
+
+ while (wanted) {
+#if defined(HAVE_IOV_ITER_GET_PAGES2)
+ cnt = iov_iter_get_pages2(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &skip);
+#else
+ cnt = iov_iter_get_pages(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &skip);
+#endif
+ if (cnt < 0) {
+ iov_iter_revert(uio->uio_iter, rollback);
+ return (SET_ERROR(-cnt));
+ }
+ uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
+ rollback += cnt;
+ wanted -= cnt;
+ skip = 0;
+#if !defined(HAVE_IOV_ITER_GET_PAGES2)
+ /*
+ * iov_iter_get_pages() does not advance the iov_iter; do it manually here.
+ */
+ iov_iter_advance(uio->uio_iter, cnt);
+#endif
+
+ }
+ ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
+ iov_iter_revert(uio->uio_iter, rollback);
+
+ return (0);
+}
+#endif /* HAVE_VFS_IOV_ITER */
+
+/*
+ * This function pins user pages. If the user pages cannot be successfully
+ * pinned, an error value is returned.
+ *
+ * On success, 0 is returned.
+ */
+int
+zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ int error = 0;
+ long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
+ size_t size = npages * sizeof (struct page *);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+ error = zfs_uio_get_dio_pages_iov(uio, rw);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+ error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
+#endif
+ } else {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ ASSERT3S(uio->uio_dio.npages, >=, 0);
+
+ if (error) {
+ for (long i = 0; i < uio->uio_dio.npages; i++)
+ put_page(uio->uio_dio.pages[i]);
+ vmem_free(uio->uio_dio.pages, size);
+ return (error);
+ } else {
+ ASSERT3S(uio->uio_dio.npages, ==, npages);
+ }
+
+ if (rw == UIO_WRITE) {
+ zfs_uio_dio_check_for_zero_page(uio);
+ }
+
+ uio->uio_extflg |= UIO_DIRECT;
+
+ return (0);
+}
+
#endif /* _KERNEL */
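zfs_uio_page_aligned() above requires every iovec's base address and length to be a multiple of PAGE_SIZE before the pinned-page path can be used, which from user space means O_DIRECT buffers should be allocated and sized in page multiples. The following is a small, self-contained user-space illustration (independent of the kernel code in this patch; the file name and sizes are arbitrary):

/* cc -o dio_write dio_write.c */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	size_t len = 8 * (size_t)pagesize;	/* page-multiple length */
	void *buf;

	/* Both the buffer address and the length are page aligned. */
	if (posix_memalign(&buf, (size_t)pagesize, len) != 0)
		return (1);
	memset(buf, 'A', len);

	int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return (1);
	}
	if (pwrite(fd, buf, len, 0) != (ssize_t)len)
		perror("pwrite");

	close(fd);
	free(buf);
	return (0);
}

Whether such a request actually takes the pinned-page path also depends on the new per-dataset "direct" property introduced later in this diff.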
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index a52f08868..22a4ad1ef 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -59,6 +59,7 @@
#include <sys/objlist.h>
#include <sys/zpl.h>
#include <linux/vfs_compat.h>
+#include <linux/fs.h>
#include "zfs_comutil.h"
enum {
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 9803c7fec..77e59a3ba 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -296,6 +296,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
+
/*
* If filemap_fault() retries there exists a window
* where the page will be unlocked and not up to date.
@@ -3866,7 +3867,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
}
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- for_sync ? zfs_putpage_sync_commit_cb :
+ B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
zfs_putpage_async_commit_cb, pp);
dmu_tx_commit(tx);
@@ -4009,6 +4010,7 @@ zfs_inactive(struct inode *ip)
static int
zfs_fillpage(struct inode *ip, struct page *pp)
{
+ znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
@@ -4020,7 +4022,7 @@ zfs_fillpage(struct inode *ip, struct page *pp)
io_len = i_size - io_off;
void *va = kmap(pp);
- int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+ int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
io_len, va, DMU_READ_PREFETCH);
if (io_len != PAGE_SIZE)
memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
@@ -4058,11 +4060,49 @@ zfs_getpage(struct inode *ip, struct page *pp)
zfsvfs_t *zfsvfs = ITOZSB(ip);
znode_t *zp = ITOZ(ip);
int error;
+ loff_t i_size = i_size_read(ip);
+ u_offset_t io_off = page_offset(pp);
+ size_t io_len = PAGE_SIZE;
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
+ ASSERT3U(io_off, <, i_size);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
+
+ /*
+ * It is important to hold the rangelock here because it is possible
+ * a Direct I/O write or block clone might be taking place at the same
+ * time that a page is being faulted in through filemap_fault(). With
+ * Direct I/O writes and block cloning db->db_data will be set to NULL
+ * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
+ * rangelock is not held, then there is a race between faulting in a
+ * page and writing out a Direct I/O write or block cloning. Without
+ * the rangelock a NULL pointer dereference can occur in
+ * dmu_read_impl() for db->db_data during the memcpy operation when
+ * zfs_fillpage() calls dmu_read().
+ */
+ zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+ io_off, io_len, RL_READER);
+ if (lr == NULL) {
+ /*
+ * It is important to drop the page lock before grabbing the
+ * rangelock to avoid another deadlock between here and
+ * zfs_write() -> update_pages(). update_pages() holds both the
+ * rangelock and the page lock.
+ */
+ get_page(pp);
+ unlock_page(pp);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
+ io_len, RL_READER);
+ lock_page(pp);
+ put_page(pp);
+ }
error = zfs_fillpage(ip, pp);
+ zfs_rangelock_exit(lr);
+
if (error == 0)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 9dec52215..6b16faa2b 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -322,14 +322,14 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t read = count - uio.uio_resid;
kiocb->ki_pos += read;
@@ -384,14 +384,14 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_write(ITOZ(ip), &uio,
+ ret = -zfs_write(ITOZ(ip), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;
@@ -422,14 +422,14 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
- filp->f_flags | zfs_io_flags(kiocb), cr);
+ ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t read = count - uio.uio_resid;
kiocb->ki_pos += read;
@@ -467,53 +467,57 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_write(ITOZ(ip), &uio,
+ ret = -zfs_write(ITOZ(ip), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;
return (wrote);
}
+
#endif /* HAVE_VFS_RW_ITERATE */
-#if defined(HAVE_VFS_RW_ITERATE)
static ssize_t
-zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+zpl_direct_IO_impl(void)
{
- if (rw == WRITE)
- return (zpl_iter_write(kiocb, iter));
- else
- return (zpl_iter_read(kiocb, iter));
+ /*
+ * All O_DIRECT requests should be handled by
+ * zpl_{iter/aio}_{write/read}(). Generic kernel code should never
+ * call the direct_IO address_space_operations function. We set
+ * this code path to be fatal if it is executed.
+ */
+ PANIC(0);
+ return (0);
}
+
+#if defined(HAVE_VFS_RW_ITERATE)
#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(rw, kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#else
-#error "Unknown direct IO interface"
+#error "Unknown Direct I/O interface"
#endif
#else /* HAVE_VFS_RW_ITERATE */
@@ -523,26 +527,16 @@ static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
loff_t pos, unsigned long nr_segs)
{
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iov, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iov, nr_segs, pos));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- const struct iovec *iovp = iov_iter_iovec(iter);
- unsigned long nr_segs = iter->nr_segs;
-
- ASSERT3S(pos, ==, kiocb->ki_pos);
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+ return (zpl_direct_IO_impl());
}
#else
-#error "Unknown direct IO interface"
+#error "Unknown Direct I/O interface"
#endif
#endif /* HAVE_VFS_RW_ITERATE */
@@ -627,6 +621,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
(size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
spl_fstrans_unmark(cookie);
+
if (error)
return (error);
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c
index 764993b45..10ac13a89 100644
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@@ -395,6 +395,13 @@ zfs_prop_init(void)
{ NULL }
};
+ static const zprop_index_t direct_table[] = {
+ { "disabled", ZFS_DIRECT_DISABLED },
+ { "standard", ZFS_DIRECT_STANDARD },
+ { "always", ZFS_DIRECT_ALWAYS },
+ { NULL }
+ };
+
struct zfs_mod_supported_features *sfeatures =
zfs_mod_list_supported(ZFS_SYSFS_DATASET_PROPERTIES);
@@ -479,6 +486,10 @@ zfs_prop_init(void)
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"default | full | geom | dev | none", "VOLMODE", volmode_table,
sfeatures);
+ zprop_register_index(ZFS_PROP_DIRECT, "direct",
+ ZFS_DIRECT_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "disabled | standard | always", "DIRECT", direct_table,
+ sfeatures);
/* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c
index e2d4d1aef..622323bbb 100644
--- a/module/zcommon/zfs_valstr.c
+++ b/module/zcommon/zfs_valstr.c
@@ -218,6 +218,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag,
{ '.', "NP", "NOPWRITE" },
{ '.', "EX", "REEXECUTED" },
{ '.', "DG", "DELEGATED" },
+ { '.', "DC", "DIO_CHKSUM_ERR" },
)
/* END CSTYLED */
@@ -252,6 +253,7 @@ _VALSTR_BITFIELD_IMPL(zio_stage,
{ 'V', "VD", "VDEV_IO_DONE" },
{ 'V', "VA", "VDEV_IO_ASSESS" },
{ 'C', "CV", "CHECKSUM_VERIFY" },
+ { 'C', "DC", "DIO_CHECKSUM_VERIFY" },
{ 'X', "X ", "DONE" },
)
/* END CSTYLED */
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index c8c4d2270..529deeecf 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -89,8 +89,8 @@
* functions.
*
* As an additional feature, linear and scatter ABD's can be stitched together
- * by using the gang ABD type (abd_alloc_gang_abd()). This allows for
- * multiple ABDs to be viewed as a singular ABD.
+ * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs
+ * to be viewed as a singular ABD.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
* B_FALSE.
@@ -109,11 +109,15 @@ void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
- ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ if (abd_is_from_pages(abd)) {
+ ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
+ } else {
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ }
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
- ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
+ ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@@ -136,7 +140,7 @@ abd_verify(abd_t *abd)
#endif
}
-static void
+void
abd_init_struct(abd_t *abd)
{
list_link_init(&abd->abd_gang_link);
@@ -238,6 +242,7 @@ abd_free_linear(abd_t *abd)
abd_free_linear_page(abd);
return;
}
+
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
} else {
@@ -520,6 +525,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
*/
abd->abd_flags |= ABD_FLAG_LINEAR;
+ /*
+ * User pages from Direct I/O requests may be in a single page
+ * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
+ * that here for abd. This is required because we have to be
+ * careful when borrowing the buffer from the ABD because we
+ * can not place user pages under write protection on Linux.
+ * See the comments in abd_os.c for abd_borrow_buf(),
+ * abd_borrow_buf_copy(), abd_return_buf() and
+ * abd_return_buf_copy().
+ */
+ if (abd_is_from_pages(sabd)) {
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES |
+ ABD_FLAG_LINEAR_PAGE;
+ }
+
ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
} else if (abd_is_gang(sabd)) {
size_t left = size;
@@ -648,70 +668,6 @@ abd_to_buf(abd_t *abd)
return (ABD_LINEAR_BUF(abd));
}
-/*
- * Borrow a raw buffer from an ABD without copying the contents of the ABD
- * into the buffer. If the ABD is scattered, this will allocate a raw buffer
- * whose contents are undefined. To copy over the existing data in the ABD, use
- * abd_borrow_buf_copy() instead.
- */
-void *
-abd_borrow_buf(abd_t *abd, size_t n)
-{
- void *buf;
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- buf = abd_to_buf(abd);
- } else {
- buf = zio_buf_alloc(n);
- }
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-#endif
- return (buf);
-}
-
-void *
-abd_borrow_buf_copy(abd_t *abd, size_t n)
-{
- void *buf = abd_borrow_buf(abd, n);
- if (!abd_is_linear(abd)) {
- abd_copy_to_buf(buf, abd, n);
- }
- return (buf);
-}
-
-/*
- * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * not change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
- */
-void
-abd_return_buf(abd_t *abd, void *buf, size_t n)
-{
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-#endif
- if (abd_is_linear(abd)) {
- ASSERT3P(buf, ==, abd_to_buf(abd));
- } else {
- ASSERT0(abd_cmp_buf(abd, buf, n));
- zio_buf_free(buf, n);
- }
-}
-
-void
-abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
-{
- if (!abd_is_linear(abd)) {
- abd_copy_from_buf(abd, buf, n);
- }
- abd_return_buf(abd, buf, n);
-}
-
void
abd_release_ownership_of_buf(abd_t *abd)
{
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 714a30e86..b5bcd367b 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5961,7 +5961,7 @@ top:
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
metadata, misses);
- zfs_racct_read(size, 1);
+ zfs_racct_read(spa, size, 1, 0);
}
/* Check if the spa even has l2 configured */
diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c
index 914260e74..27a04c2af 100644
--- a/module/zfs/dataset_kstats.c
+++ b/module/zfs/dataset_kstats.c
@@ -217,8 +217,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
}
void
-dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
- int64_t nwritten)
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten)
{
ASSERT3S(nwritten, >=, 0);
@@ -230,8 +229,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
}
void
-dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
- int64_t nread)
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread)
{
ASSERT3S(nread, >=, 0);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 099883ba2..df9368fc8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -628,7 +628,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
* L2ARC.
*/
boolean_t
-dbuf_is_l2cacheable(dmu_buf_impl_t *db)
+dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
{
if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
(db->db_objset->os_secondary_cache ==
@@ -636,10 +636,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db)
if (l2arc_exclude_special == 0)
return (B_TRUE);
- blkptr_t *bp = db->db_blkptr;
- if (bp == NULL || BP_IS_HOLE(bp))
+ /*
+ * bp must be checked in the event it was passed from
+ * dbuf_read_impl() as the result of the BP being set from
+ * a Direct I/O write in dbuf_read(). See comments in
+ * dbuf_read().
+ */
+ blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;
+
+ if (db_bp == NULL || BP_IS_HOLE(db_bp))
return (B_FALSE);
- uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+ uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
vdev_t *vd = NULL;
@@ -1380,6 +1387,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
mutex_enter(&db->db_mtx);
ASSERT3U(db->db_state, ==, DB_READ);
+
/*
* All reads are synchronous, so we must have a hold on the dbuf
*/
@@ -1570,12 +1578,11 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
*/
static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
- db_lock_type_t dblt, const void *tag)
+ db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
- blkptr_t bp, *bpp = NULL;
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1589,43 +1596,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
goto early_unlock;
}
- /*
- * If we have a pending block clone, we don't want to read the
- * underlying block, but the content of the block being cloned,
- * pointed by the dirty record, so we have the most recent data.
- * If there is no dirty record, then we hit a race in a sync
- * process when the dirty record is already removed, while the
- * dbuf is not yet destroyed. Such case is equivalent to uncached.
- */
- if (db->db_state == DB_NOFILL) {
- dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
- if (dr != NULL) {
- if (!dr->dt.dl.dr_brtwrite) {
- err = EIO;
- goto early_unlock;
- }
- bp = dr->dt.dl.dr_overridden_by;
- bpp = &bp;
- }
- }
-
- if (bpp == NULL && db->db_blkptr != NULL) {
- bp = *db->db_blkptr;
- bpp = &bp;
- }
-
- err = dbuf_read_hole(db, dn, bpp);
+ err = dbuf_read_hole(db, dn, bp);
if (err == 0)
goto early_unlock;
- ASSERT(bpp != NULL);
+ ASSERT(bp != NULL);
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(bpp)) {
+ if (BP_IS_REDACTED(bp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
@@ -1640,9 +1622,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
spa_log_error(db->db_objset->os_spa, &zb,
- BP_GET_LOGICAL_BIRTH(bpp));
+ BP_GET_LOGICAL_BIRTH(bp));
err = SET_ERROR(EIO);
goto early_unlock;
}
@@ -1653,7 +1635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
if (!DBUF_IS_CACHEABLE(db))
aflags |= ARC_FLAG_UNCACHED;
- else if (dbuf_is_l2cacheable(db))
+ else if (dbuf_is_l2cacheable(db, bp))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
@@ -1661,17 +1643,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
- if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
+
/*
- * The zio layer will copy the provided blkptr later, but we have our
- * own copy so that we can release the parent's rwlock. We have to
- * do that so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we need to
+ * do this now so that we can release the parent's rwlock. We have to
+ * do that now so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
+ blkptr_t copy = *bp;
dmu_buf_unlock_parent(db, dblt, tag);
- return (arc_read(zio, db->db_objset->os_spa, bpp,
+ return (arc_read(zio, db->db_objset->os_spa, &copy,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb));
@@ -1844,13 +1828,30 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
- if (pio == NULL && (db->db_state == DB_NOFILL ||
- (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
- spa_t *spa = dn->dn_objset->os_spa;
- pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- need_wait = B_TRUE;
+ blkptr_t *bp;
+
+ /*
+ * If a block clone or Direct I/O write has occurred we will
+ * get the dirty record's overridden BP so we get the most
+ * recent data.
+ */
+ err = dmu_buf_get_bp_from_dbuf(db, &bp);
+
+ if (!err) {
+ if (pio == NULL && (db->db_state == DB_NOFILL ||
+ (bp != NULL && !BP_IS_HOLE(bp)))) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ pio =
+ zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ need_wait = B_TRUE;
+ }
+
+ err =
+ dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
}
- err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
miss = (db->db_state != DB_CACHED);
}
@@ -1918,6 +1919,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
+
/*
* This assert is valid because dmu_sync() expects to be called by
* a zilog's get_data while holding a range lock. This call only
@@ -1936,16 +1938,20 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
- if (dr->dt.dl.dr_brtwrite) {
+ if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
ASSERT0P(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf;
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_brtwrite = B_FALSE;
+ dr->dt.dl.dr_diowrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;
/*
+ * In the event that Direct I/O was used, we do not
+ * need to release the buffer from the ARC.
+ *
* Release the already-written buffer, so we leave it in
* a consistent dirty state. Note that all callers are
* modifying the buffer, so they will immediately do
@@ -2084,6 +2090,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
*/
dmu_buf_will_dirty(&db->db, tx);
+ VERIFY3P(db->db_buf, !=, NULL);
+
/* create the data buffer for the new block */
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
@@ -2532,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
boolean_t brtwrite;
+ boolean_t diowrite;
ASSERT(txg != 0);
@@ -2557,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_dbuf == db);
brtwrite = dr->dt.dl.dr_brtwrite;
+ diowrite = dr->dt.dl.dr_diowrite;
if (brtwrite) {
+ ASSERT3B(diowrite, ==, B_FALSE);
/*
* We are freeing a block that we cloned in the same
* transaction group.
@@ -2598,10 +2609,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
+ if (dr->dt.dl.dr_data != db->db_buf) {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -2610,7 +2622,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
@@ -2670,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
* want to make sure dbuf_read() will read the pending cloned block and
* not the uderlying block that is being replaced. dbuf_undirty() will
- * do dbuf_unoverride(), so we will end up with cloned block content,
- * without overridden BP.
+ * do brt_pending_remove() before removing the dirty record.
*/
(void) dbuf_read(db, NULL, flags);
if (undirty) {
@@ -2701,23 +2712,126 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
return (dr != NULL);
}
+/*
+ * Normally the db_blkptr points to the most recent on-disk content for the
+ * dbuf (and anything newer will be cached in the dbuf). However, a pending
+ * block clone or not yet synced Direct I/O write will have a dirty record BP
+ * pointing to the most recent data.
+ */
+int
+dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ int error = 0;
+
+ if (db->db_level != 0) {
+ *bp = db->db_blkptr;
+ return (0);
+ }
+
+ *bp = db->db_blkptr;
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+ if (dr && db->db_state == DB_NOFILL) {
+ /* Block clone */
+ if (!dr->dt.dl.dr_brtwrite)
+ error = EIO;
+ else
+ *bp = &dr->dt.dl.dr_overridden_by;
+ } else if (dr && db->db_state == DB_UNCACHED) {
+ /* Direct I/O write */
+ if (dr->dt.dl.dr_diowrite)
+ *bp = &dr->dt.dl.dr_overridden_by;
+ }
+
+ return (error);
+}
+
+/*
+ * Direct I/O reads can read directly from the ARC, but the data has
+ * to be untransformed in order to copy it over into user pages.
+ */
+int
+dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
+{
+ int err = 0;
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+
+ ASSERT3S(db->db_state, ==, DB_CACHED);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ /*
+ * Ensure that this block's dnode has been decrypted if
+ * the caller has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, dn, 0);
+
+ /*
+ * If the arc buf is compressed or encrypted and the caller
+ * requested uncompressed data, we need to untransform it
+ * before returning. We also call arc_untransform() on any
+ * unauthenticated blocks, which will verify their MAC if
+ * the key is now available.
+ */
+ if (err == 0 && db->db_buf != NULL &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_is_unauthenticated(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
+ dbuf_set_data(db, db->db_buf);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_hits);
+
+ return (err);
+}
+
void
-dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
+ /*
+ * Block clones and Direct I/O writes always happen in open-context.
+ */
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT0(db->db_level);
+ ASSERT(!dmu_tx_is_syncing(tx));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
- /*
- * Block cloning: We are going to clone into this block, so undirty
- * modifications done to this block so far in this txg. This includes
- * writes and clones into this block.
- */
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
- VERIFY(!dbuf_undirty(db, tx));
+
+ /*
+ * We are going to clone or issue a Direct I/O write on this block, so
+ * undirty modifications done to this block so far in this txg. This
+ * includes writes and clones into this block.
+ *
+	 * If there is a dirty record associated with this txg from a previous
+	 * Direct I/O write, then space accounting cleanup takes place. It is
+	 * important to go ahead and free up the space accounting through
+	 * dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting
+	 * for determining if a write can occur in zfs_write() happens through
+	 * dmu_tx_assign(). This can cause an issue with Direct I/O writes in
+	 * the case of overwriting the same block, because all DVA allocations
+	 * are being done in open-context. Constantly allowing Direct I/O
+	 * overwrites to the same block can exhaust the pool's available space,
+	 * leading to ENOSPC errors at the DVA allocation part of the ZIO
+	 * pipeline, which will eventually suspend the pool. By cleaning up
+	 * space accounting now, the ENOSPC error can be avoided.
+ *
+ * Since we are undirtying the record in open-context, we must have a
+ * hold on the db, so it should never be evicted after calling
+ * dbuf_undirty().
+ */
+ VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+
if (db->db_buf != NULL) {
/*
* If there is an associated ARC buffer with this dbuf we can
@@ -2728,6 +2842,11 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
arc_buf_destroy(db->db_buf, db);
+ /*
+ * Setting the dbuf's data pointers to NULL will force all
+ * future reads down to the devices to get the most up to date
+ * version of the data after a Direct I/O write has completed.
+ */
db->db_buf = NULL;
dbuf_clear_data(db);
}
@@ -2736,7 +2855,8 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
ASSERT3P(db->db.db_data, ==, NULL);
db->db_state = DB_NOFILL;
- DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+ DTRACE_SET_STATE(db,
+ "allocating NOFILL buffer for clone or direct I/O write");
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
@@ -2773,21 +2893,28 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_tx_private_ok(tx));
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_NOFILL) {
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ if (db->db_state == DB_NOFILL ||
+ (db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
/*
- * Block cloning: We will be completely overwriting a block
- * cloned in this transaction group, so let's undirty the
- * pending clone and mark the block as uncached. This will be
- * as if the clone was never done. But if the fill can fail
- * we should have a way to return back to the cloned data.
+ * If the fill can fail we should have a way to return back to
+ * the cloned or Direct I/O write data.
*/
- if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ if (canfail && dr) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
return;
}
- VERIFY(!dbuf_undirty(db, tx));
- db->db_state = DB_UNCACHED;
+ /*
+ * Block cloning: We will be completely overwriting a block
+ * cloned in this transaction group, so let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done.
+ */
+ if (dr && dr->dt.dl.dr_brtwrite) {
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
+ }
}
mutex_exit(&db->db_mtx);
@@ -4080,7 +4207,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
} else {
mutex_exit(&db->db_mtx);
}
-
}
#pragma weak dmu_buf_refcount = dbuf_refcount
@@ -4540,24 +4666,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
/*
- * To be synced, we must be dirtied. But we
- * might have been freed after the dirty.
+ * To be synced, we must be dirtied. But we might have been freed
+ * after the dirty.
*/
if (db->db_state == DB_UNCACHED) {
/* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else if (db->db_state == DB_READ) {
/*
- * This buffer has a clone we need to write, and an in-flight
- * read on the BP we're about to clone. Its safe to issue the
- * write here because the read has already been issued and the
- * contents won't change.
+ * This buffer was either cloned or had a Direct I/O write
+	 * occur and has an in-flight read on the BP. It is safe to
+ * issue the write here, because the read has already been
+ * issued and the contents won't change.
+ *
+ * We can verify the case of both the clone and Direct I/O
+ * write by making sure the first dirty record for the dbuf
+ * has no ARC buffer associated with it.
*/
- ASSERT(dr->dt.dl.dr_brtwrite &&
- dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
+ dbuf_dirty_record_t *dr_head =
+ list_head(&db->db_dirty_records);
+ ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
+ ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
+ ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
@@ -4608,8 +4742,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immediate write,
- * wait for the synchronous IO to complete.
+ * If this buffer is in the middle of an immediate write, wait for the
+ * synchronous IO to complete.
+ *
+ * This is also valid even with Direct I/O writes setting a dirty
+	 * record's override state to DR_IN_DMU_SYNC, because all
+ * Direct I/O writes happen in open-context.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -4913,8 +5051,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_level == 0) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ /* no dr_data if this is a NO_FILL or Direct I/O */
if (dr->dt.dl.dr_data != NULL &&
dr->dt.dl.dr_data != db->db_buf) {
+ ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
+ ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
@@ -5180,7 +5322,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* The BP for this block has been provided by open context
- * (by dmu_sync() or dmu_buf_write_embedded()).
+ * (by dmu_sync(), dmu_write_direct(),
+ * or dmu_buf_write_embedded()).
*/
abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
@@ -5219,7 +5362,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
- dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+ dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
children_ready_cb, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
@@ -5239,7 +5382,7 @@ EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
-EXPORT_SYMBOL(dmu_buf_will_clone);
+EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
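For reference, the locking contract around the new dmu_buf_get_bp_from_dbuf() is that db_mtx must be held across the call and for as long as the returned BP is dereferenced, since it may point into a dirty record. The helper below is a hypothetical sketch (not part of the patch) mirroring how dmu_read_abd() in dmu_direct.c uses it:

    static int
    example_get_latest_bp(dmu_buf_impl_t *db, blkptr_t *out)
    {
    	blkptr_t *bp;
    	int err;

    	mutex_enter(&db->db_mtx);
    	/* Wait out any in-flight read before trusting db_state. */
    	while (db->db_state == DB_READ)
    		cv_wait(&db->db_changed, &db->db_mtx);

    	/* May return a dirty record's overridden BP (clone or Direct I/O). */
    	err = dmu_buf_get_bp_from_dbuf(db, &bp);
    	if (err == 0 && bp != NULL)
    		*out = *bp;	/* copy while the dirty record is still live */
    	mutex_exit(&db->db_mtx);

    	return (err);
    }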
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b3eda8ea5..3f87cfe6b 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -609,8 +609,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp[i] = &db->db;
}
- if (!read)
- zfs_racct_write(length, nblks);
+ /*
+ * If we are doing O_DIRECT we still hold the dbufs, even for reads,
+ * but we do not issue any reads here. We do not want to account for
+ * writes in this case.
+ *
+ * O_DIRECT write/read accounting takes place in
+ * dmu_{write/read}_abd().
+ */
+ if (!read && ((flags & DMU_DIRECTIO) == 0))
+ zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
if (zs)
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
@@ -897,7 +905,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
/*
* Get the next "chunk" of file data to free. We traverse the file from
- * the end so that the file gets shorter over time (if we crashes in the
+ * the end so that the file gets shorter over time (if we crash in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
*
@@ -1168,7 +1176,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
/*
* Deal with odd block sizes, where there can't be data past the first
- * block. If we ever do the tail block optimization, we will need to
+ * block. If we ever do the tail block optimization, we will need to
* handle that here as well.
*/
if (dn->dn_maxblkid == 0) {
@@ -1178,6 +1186,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
size = newsz;
}
+ if (size == 0)
+ return (0);
+
+ /* Allow Direct I/O when requested and properly aligned */
+ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
+ zfs_dio_aligned(offset, size, PAGESIZE)) {
+ abd_t *data = abd_get_from_buf(buf, size);
+ err = dmu_read_abd(dn, offset, size, data, flags);
+ abd_free(data);
+ return (err);
+ }
+
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
int i;
@@ -1286,22 +1306,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
}
/*
- * Note: Lustre is an external consumer of this interface.
+ * This interface is not used internally by ZFS but is provided for
+ * use by Lustre which is built on the DMU interfaces.
*/
-void
-dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
+int
+dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx, uint32_t flags)
{
dmu_buf_t **dbp;
int numbufs;
+ int error;
if (size == 0)
- return;
+ return (0);
+
+ /* Allow Direct I/O when requested and properly aligned */
+ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
+ zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
+ abd_t *data = abd_get_from_buf((void *)buf, size);
+ error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+ abd_free(data);
+ return (error);
+ }
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
+
+int
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
}
void
@@ -1365,6 +1404,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
dmu_buf_t **dbp;
int numbufs, i, err;
+ if (uio->uio_extflg & UIO_DIRECT)
+ return (dmu_read_uio_direct(dn, uio, size));
+
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
@@ -1453,23 +1495,53 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
dmu_buf_t **dbp;
int numbufs;
int err = 0;
- int i;
+ uint64_t write_size;
- err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+top:
+ write_size = size;
+
+ /*
+	 * We only allow Direct I/O writes to happen if they are block
+	 * size aligned. Otherwise, we pass the write off to the ARC.
+ */
+ if ((uio->uio_extflg & UIO_DIRECT) &&
+ (write_size >= dn->dn_datablksz)) {
+ if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
+ dn->dn_datablksz)) {
+ return (dmu_write_uio_direct(dn, uio, size, tx));
+ } else if (write_size > dn->dn_datablksz &&
+ zfs_dio_offset_aligned(zfs_uio_offset(uio),
+ dn->dn_datablksz)) {
+ write_size =
+ dn->dn_datablksz * (write_size / dn->dn_datablksz);
+ err = dmu_write_uio_direct(dn, uio, write_size, tx);
+ if (err == 0) {
+ size -= write_size;
+ goto top;
+ } else {
+ return (err);
+ }
+ } else {
+ write_size =
+ P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
+ }
+ }
+
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);
- for (i = 0; i < numbufs; i++) {
+ for (int i = 0; i < numbufs; i++) {
uint64_t tocpy;
int64_t bufoff;
dmu_buf_t *db = dbp[i];
- ASSERT(size > 0);
+ ASSERT(write_size > 0);
offset_t off = zfs_uio_offset(uio);
bufoff = off - db->db_offset;
- tocpy = MIN(db->db_size - bufoff, size);
+ tocpy = MIN(db->db_size - bufoff, write_size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
@@ -1489,10 +1561,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
if (err)
break;
+ write_size -= tocpy;
size -= tocpy;
}
+ IMPLY(err == 0, write_size == 0);
+
dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
+ goto top;
+ }
+
return (err);
}
@@ -1731,7 +1811,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
* same size as the dbuf.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
- zfs_racct_write(blksz, 1);
+ zfs_racct_write(os->os_spa, blksz, 1, 0);
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
@@ -1761,23 +1841,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
return (err);
}
-typedef struct {
- dbuf_dirty_record_t *dsa_dr;
- dmu_sync_cb_t *dsa_done;
- zgd_t *dsa_zgd;
- dmu_tx_t *dsa_tx;
-} dmu_sync_arg_t;
-
-static void
+void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
dmu_sync_arg_t *dsa = varg;
- dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
- blkptr_t *bp = zio->io_bp;
if (zio->io_error == 0) {
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ blkptr_t *bp = zio->io_bp;
+
if (BP_IS_HOLE(bp)) {
+ dmu_buf_t *db = NULL;
+ if (dr)
+ db = &(dr->dr_dbuf->db);
+ else
+ db = dsa->dsa_zgd->zgd_db;
/*
* A block of zeros may compress to a hole, but the
* block size still needs to be known for replay.
@@ -1796,7 +1875,7 @@ dmu_sync_late_arrival_ready(zio_t *zio)
dmu_sync_ready(zio, NULL, zio->io_private);
}
-static void
+void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
@@ -1809,7 +1888,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
*/
- if (zio->io_error == 0) {
+ if (zgd && zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
@@ -1848,10 +1927,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
}
+
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
- dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+ if (dsa->dsa_done)
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
kmem_free(dsa, sizeof (*dsa));
}
@@ -2120,9 +2201,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dsa->dsa_tx = NULL;
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
- dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
- &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+ dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
+ dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
+ dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
+ &zb));
return (0);
}
@@ -2385,6 +2467,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_nopwrite = nopwrite;
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -2594,7 +2677,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
ASSERT(db->db_blkid != DMU_SPILL_BLKID);
ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
- dmu_buf_will_clone(dbuf, tx);
+ dmu_buf_will_clone_or_dio(dbuf, tx);
mutex_enter(&db->db_mtx);
@@ -2817,8 +2900,15 @@ EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_long_object);
EXPORT_SYMBOL(dmu_read);
EXPORT_SYMBOL(dmu_read_by_dnode);
+EXPORT_SYMBOL(dmu_read_uio);
+EXPORT_SYMBOL(dmu_read_uio_dbuf);
+EXPORT_SYMBOL(dmu_read_uio_dnode);
EXPORT_SYMBOL(dmu_write);
EXPORT_SYMBOL(dmu_write_by_dnode);
+EXPORT_SYMBOL(dmu_write_by_dnode_flags);
+EXPORT_SYMBOL(dmu_write_uio);
+EXPORT_SYMBOL(dmu_write_uio_dbuf);
+EXPORT_SYMBOL(dmu_write_uio_dnode);
EXPORT_SYMBOL(dmu_prealloc);
EXPORT_SYMBOL(dmu_object_info);
EXPORT_SYMBOL(dmu_object_info_from_dnode);
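The Direct I/O gating in dmu_write_uio_dnode() above only issues the block-aligned portion of a write directly and passes the rest to the ARC. A simplified, self-contained sketch of that size arithmetic (illustrative only; the real code also carves off a buffered chunk when the starting offset is unaligned within a block):

    #include <stdint.h>

    /* Bytes at the front of a write that are eligible for Direct I/O. */
    static uint64_t
    dio_prefix_bytes(uint64_t offset, uint64_t size, uint64_t blksz)
    {
    	if (size < blksz || offset % blksz != 0)
    		return (0);			/* too small or unaligned: ARC */
    	return (blksz * (size / blksz));	/* whole blocks go direct */
    }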
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
new file mode 100644
index 000000000..91a7fd8df
--- /dev/null
+++ b/module/zfs/dmu_direct.c
@@ -0,0 +1,395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_racct.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dmu_objset.h>
+
+static abd_t *
+make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
+ uint64_t size)
+{
+ size_t buf_size = db->db.db_size;
+ abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
+ size_t buf_off = 0;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (offset > db->db.db_offset) {
+ size_t pre_size = offset - db->db.db_offset;
+ pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
+ buf_size -= pre_size;
+ buf_off = 0;
+ } else {
+ buf_off = db->db.db_offset - offset;
+ size -= buf_off;
+ }
+
+ if (size < buf_size) {
+ size_t post_size = buf_size - size;
+ post_buf = abd_alloc_for_io(post_size, B_TRUE);
+ buf_size -= post_size;
+ }
+
+ ASSERT3U(buf_size, >, 0);
+ abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);
+
+ if (pre_buf || post_buf) {
+ mbuf = abd_alloc_gang();
+ if (pre_buf)
+ abd_gang_add(mbuf, pre_buf, B_TRUE);
+ abd_gang_add(mbuf, buf, B_TRUE);
+ if (post_buf)
+ abd_gang_add(mbuf, post_buf, B_TRUE);
+ } else {
+ mbuf = buf;
+ }
+
+ return (mbuf);
+}
+
+static void
+dmu_read_abd_done(zio_t *zio)
+{
+ abd_free(zio->io_abd);
+}
+
+static void
+dmu_write_direct_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dmu_write_direct_done(zio_t *zio)
+{
+ dmu_sync_arg_t *dsa = zio->io_private;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
+ db->db_state = DB_UNCACHED;
+ mutex_exit(&db->db_mtx);
+
+ dmu_sync_done(zio, NULL, zio->io_private);
+
+ if (zio->io_error != 0) {
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+ ASSERT3U(zio->io_error, ==, EIO);
+
+ /*
+ * In the event of an I/O error this block has been freed in
+ * zio_done() through zio_dva_unallocate(). Calling
+ * dmu_sync_done() above set dr_override_state to
+ * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
+ * dbuf_unoverride(), it will skip doing zio_free() to free
+ * this block as that was already taken care of.
+ *
+ * Since we are undirtying the record in open-context, we must
+ * have a hold on the db, so it should never be evicted after
+ * calling dbuf_undirty().
+ */
+ mutex_enter(&db->db_mtx);
+ VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
+ mutex_exit(&db->db_mtx);
+ }
+
+ kmem_free(zio->io_bp, sizeof (blkptr_t));
+ zio->io_bp = NULL;
+}
+
+int
+dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
+{
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ zbookmark_phys_t zb;
+ dbuf_dirty_record_t *dr_head;
+
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ zio_prop_t zp;
+ dmu_write_policy(os, DB_DNODE(db), db->db_level,
+ WP_DMU_SYNC | WP_DIRECT_WR, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * Dirty this dbuf with DB_NOFILL since we will not have any data
+ * associated with the dbuf.
+ */
+ dmu_buf_will_clone_or_dio(&db->db, tx);
+
+ mutex_enter(&db->db_mtx);
+
+ uint64_t txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
+ ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
+
+ dr_head = list_head(&db->db_dirty_records);
+ ASSERT3U(dr_head->dr_txg, ==, txg);
+ dr_head->dt.dl.dr_diowrite = B_TRUE;
+ dr_head->dr_accounted = db->db.db_size;
+
+ blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ if (db->db_blkptr != NULL) {
+ /*
+ * Fill in bp with the current block pointer so that
+ * the nopwrite code can check if we're writing the same
+ * data that's already on disk.
+ */
+ *bp = *db->db_blkptr;
+ } else {
+ memset(bp, 0, sizeof (blkptr_t));
+ }
+
+ /*
+ * Disable nopwrite if the current block pointer could change
+ * before this TXG syncs.
+ */
+ if (list_next(&db->db_dirty_records, dr_head) != NULL)
+ zp.zp_nopwrite = B_FALSE;
+
+ ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
+ dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+
+ mutex_exit(&db->db_mtx);
+
+ dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
+
+ dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr_head;
+ dsa->dsa_tx = tx;
+
+ zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
+ db->db.db_size, db->db.db_size, &zp,
+ dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
+
+ if (pio == NULL)
+ return (zio_wait(zio));
+
+ zio_nowait(zio);
+
+ return (0);
+}
+
+int
+dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+ abd_t *data, uint32_t flags, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ spa_t *spa = dn->dn_objset->os_spa;
+ int numbufs, err;
+
+ ASSERT(flags & DMU_DIRECTIO);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset,
+ size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ return (err);
+
+ zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ for (int i = 0; i < numbufs && err == 0; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+
+ abd_t *abd = abd_get_offset_size(data,
+ db->db.db_offset - offset, dn->dn_datablksz);
+
+ zfs_racct_write(spa, db->db.db_size, 1, flags);
+ err = dmu_write_direct(pio, db, abd, tx);
+ ASSERT0(err);
+ }
+
+ err = zio_wait(pio);
+
+ /*
+ * The dbuf must be held until the Direct I/O write has completed in
+	 * the event there were any errors and dbuf_undirty() was called.
+ */
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+int
+dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+ abd_t *data, uint32_t flags)
+{
+ objset_t *os = dn->dn_objset;
+ spa_t *spa = os->os_spa;
+ dmu_buf_t **dbp;
+ int numbufs, err;
+
+ ASSERT(flags & DMU_DIRECTIO);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset,
+ size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ return (err);
+
+ zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ for (int i = 0; i < numbufs; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ abd_t *mbuf;
+ zbookmark_phys_t zb;
+ blkptr_t *bp;
+
+ mutex_enter(&db->db_mtx);
+
+ SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ /*
+ * If there is another read for this dbuf, we will wait for
+ * that to complete first before checking the db_state below.
+ */
+ while (db->db_state == DB_READ)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ err = dmu_buf_get_bp_from_dbuf(db, &bp);
+ if (err) {
+ mutex_exit(&db->db_mtx);
+ goto error;
+ }
+
+ /*
+ * There is no need to read if this is a hole or the data is
+ * cached. This will not be considered a direct read for IO
+ * accounting in the same way that an ARC hit is not counted.
+ */
+ if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
+ size_t aoff = offset < db->db.db_offset ?
+ db->db.db_offset - offset : 0;
+ size_t boff = offset > db->db.db_offset ?
+ offset - db->db.db_offset : 0;
+ size_t len = MIN(size - aoff, db->db.db_size - boff);
+
+ if (db->db_state == DB_CACHED) {
+ /*
+			 * We need to untransform the ARC buf data
+ * before we copy it over.
+ */
+ err = dmu_buf_untransform_direct(db, spa);
+ ASSERT0(err);
+ abd_copy_from_buf_off(data,
+ (char *)db->db.db_data + boff, aoff, len);
+ } else {
+ abd_zero_off(data, aoff, len);
+ }
+
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+
+ mbuf = make_abd_for_dbuf(db, data, offset, size);
+ ASSERT3P(mbuf, !=, NULL);
+
+ /*
+ * The dbuf mutex (db_mtx) must be held when creating the ZIO
+ * for the read. The BP returned from
+ * dmu_buf_get_bp_from_dbuf() could be from a pending block
+ * clone or a yet to be synced Direct I/O write that is in the
+ * dbuf's dirty record. When zio_read() is called, zio_create()
+ * will make a copy of the BP. However, if zio_read() is called
+ * without the mutex being held then the dirty record from the
+ * dbuf could be freed in dbuf_write_done() resulting in garbage
+ * being set for the zio BP.
+ */
+ zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
+ dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL, &zb);
+ mutex_exit(&db->db_mtx);
+
+ zfs_racct_read(spa, db->db.db_size, 1, flags);
+ zio_nowait(cio);
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (zio_wait(rio));
+
+error:
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ (void) zio_wait(rio);
+ return (err);
+}
+
+#ifdef _KERNEL
+int
+dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+{
+ offset_t offset = zfs_uio_offset(uio);
+ offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
+ int err;
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3U(page_index, <, uio->uio_dio.npages);
+
+ abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
+ offset & (PAGESIZE - 1), size);
+ err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
+ abd_free(data);
+
+ if (err == 0)
+ zfs_uioskip(uio, size);
+
+ return (err);
+}
+
+int
+dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+ offset_t offset = zfs_uio_offset(uio);
+ offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
+ int err;
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3U(page_index, <, uio->uio_dio.npages);
+
+ abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
+ offset & (PAGESIZE - 1), size);
+ err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+ abd_free(data);
+
+ if (err == 0)
+ zfs_uioskip(uio, size);
+
+ return (err);
+}
+#endif /* _KERNEL */
+
+EXPORT_SYMBOL(dmu_read_uio_direct);
+EXPORT_SYMBOL(dmu_write_uio_direct);
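make_abd_for_dbuf() pads the caller's buffer out to a full block so the read ZIO always covers the whole dbuf. The padding sizes reduce to simple interval arithmetic; the sketch below (hypothetical names, no ABD allocation) assumes the user range [offset, offset + size) overlaps the block [db_offset, db_offset + db_size):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
    	size_t pre;		/* bytes to zero-fill before the user data */
    	size_t user;		/* bytes taken from the user buffer */
    	size_t user_off;	/* where those bytes start in the user buffer */
    	size_t post;		/* bytes to zero-fill after the user data */
    } dbuf_overlap_t;

    static dbuf_overlap_t
    dbuf_overlap(uint64_t db_offset, size_t db_size, uint64_t offset, size_t size)
    {
    	dbuf_overlap_t o = { 0 };
    	size_t span = db_size;

    	if (offset > db_offset) {
    		/* The user range starts inside the block. */
    		o.pre = offset - db_offset;
    		span -= o.pre;
    	} else {
    		/* The block starts inside the user range. */
    		o.user_off = db_offset - offset;
    		size -= o.user_off;
    	}
    	if (size < span) {
    		/* The user range ends inside the block. */
    		o.post = span - size;
    		span -= o.post;
    	}
    	o.user = span;
    	return (o);
    }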
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 8f4fefa4f..f030fba22 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -351,6 +351,20 @@ smallblk_changed_cb(void *arg, uint64_t newval)
}
static void
+direct_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||
+ newval == ZFS_DIRECT_ALWAYS);
+
+ os->os_direct = newval;
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_SPECIAL_SMALL_BLOCKS),
smallblk_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DIRECT),
+ direct_changed_cb, os);
+ }
}
if (err != 0) {
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index 17ed2a620..45a2f0626 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = {
{ "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
{ "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
{ "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "arc_read_count", KSTAT_DATA_UINT64 },
+ { "arc_read_bytes", KSTAT_DATA_UINT64 },
+ { "arc_write_count", KSTAT_DATA_UINT64 },
+ { "arc_write_bytes", KSTAT_DATA_UINT64 },
+ { "direct_read_count", KSTAT_DATA_UINT64 },
+ { "direct_read_bytes", KSTAT_DATA_UINT64 },
+ { "direct_write_count", KSTAT_DATA_UINT64 },
+ { "direct_write_bytes", KSTAT_DATA_UINT64 },
};
#define SPA_IOSTATS_ADD(stat, val) \
@@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
}
}
+void
+spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+
+ if (ksp == NULL)
+ return;
+
+ spa_iostats_t *iostats = ksp->ks_data;
+ if (flags & DMU_DIRECTIO) {
+ SPA_IOSTATS_ADD(direct_read_count, iops);
+ SPA_IOSTATS_ADD(direct_read_bytes, size);
+ } else {
+ SPA_IOSTATS_ADD(arc_read_count, iops);
+ SPA_IOSTATS_ADD(arc_read_bytes, size);
+ }
+}
+
+void
+spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+
+ if (ksp == NULL)
+ return;
+
+ spa_iostats_t *iostats = ksp->ks_data;
+ if (flags & DMU_DIRECTIO) {
+ SPA_IOSTATS_ADD(direct_write_count, iops);
+ SPA_IOSTATS_ADD(direct_write_bytes, size);
+ } else {
+ SPA_IOSTATS_ADD(arc_write_count, iops);
+ SPA_IOSTATS_ADD(arc_write_bytes, size);
+ }
+}
+
static int
spa_iostats_update(kstat_t *ksp, int rw)
{
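The new counters land in the per-pool iostats kstat alongside the existing trim statistics. On Linux that kstat is normally readable as a text file; the path below assumes /proc/spl/kstat/zfs/<pool>/iostats and a pool named "tank", so treat it as a sketch rather than a guaranteed interface:

    #include <stdio.h>

    int
    main(void)
    {
    	/* Path is an assumption; adjust for your pool name and platform. */
    	FILE *f = fopen("/proc/spl/kstat/zfs/tank/iostats", "r");
    	char line[256];

    	if (f == NULL) {
    		perror("fopen");
    		return (1);
    	}
    	while (fgets(line, sizeof (line), f) != NULL)
    		fputs(line, stdout);	/* direct_write_bytes, arc_read_count, ... */
    	fclose(f);
    	return (0);
    }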
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 6ae0a1412..9305bd894 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -118,6 +118,11 @@ static unsigned int zfs_slow_io_events_per_second = 20;
static unsigned int zfs_deadman_events_per_second = 1;
/*
+ * Rate limit Direct I/O write verify failures to this many per second.
+ */
+static unsigned int zfs_dio_write_verify_events_per_second = 20;
+
+/*
* Rate limit checksum events after this many checksum errors per second.
*/
static unsigned int zfs_checksum_events_per_second = 20;
@@ -153,6 +158,17 @@ int zfs_nocacheflush = 0;
uint_t zfs_vdev_max_auto_ashift = 14;
uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+/*
+ * VDEV checksum verification for Direct I/O writes. This is necessary for
+ * Linux, because anonymous pages cannot be placed under write protection
+ * during Direct I/O writes.
+ */
+#if !defined(__FreeBSD__)
+uint_t zfs_vdev_direct_write_verify = 1;
+#else
+uint_t zfs_vdev_direct_write_verify = 0;
+#endif
+
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{
@@ -673,6 +689,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
1);
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
1);
+ zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
+ &zfs_dio_write_verify_events_per_second, 1);
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);
@@ -1182,6 +1200,7 @@ vdev_free(vdev_t *vd)
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_deadman_rl);
+ zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
if (vd == spa->spa_root_vdev)
@@ -4475,6 +4494,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++)
@@ -6503,7 +6523,14 @@ ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
"Rate limit hung IO (deadman) events to this many per second");
+ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
+	"Rate limit Direct I/O write verify events to this many per second");
+
/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
+	"Direct I/O writes will perform checksum verification before "
+	"committing the write");
+
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
"Rate limit checksum events to this many checksum errors per second "
"(do not set below ZED threshold).");
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 47346dd5a..9d12bc2eb 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
/* IO delays */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+ /* Direct I/O write verify errors */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS,
+ vs->vs_dio_verify_errors);
+
/* Add extended stats nvlist to main nvlist */
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
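Consumers that already walk the extended vdev stats nvlist can pick up the new counter with the standard nvlist accessors. A hedged sketch (the header locations and the assumption that nvx is the ZPOOL_CONFIG_VDEV_STATS_EX nvlist fetched from the vdev config are the caller's responsibility):

    #include <libnvpair.h>
    #include <sys/fs/zfs.h>

    /* Pull the Direct I/O verify-error count out of a vdev's extended stats. */
    static uint64_t
    vdev_dio_verify_errors(nvlist_t *nvx)
    {
    	uint64_t errs = 0;

    	/* Leaves the default value in place if the key is absent. */
    	(void) nvlist_lookup_uint64(nvx,
    	    ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, &errs);
    	return (errs);
    }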
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index f7cecc9af..25b05abd3 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, vs->vs_checksum_errors,
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
DATA_TYPE_UINT64, vs->vs_slow_ios,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
NULL);
}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 53366ad49..e69b98896 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -160,7 +160,6 @@
#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
-#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 399f5a011..8d0aebbec 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768;
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
- zil_callback_t callback, void *callback_data)
+ boolean_t o_direct, zil_callback_t callback, void *callback_data)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
uint32_t blocksize = zp->z_blksz;
@@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
return;
}
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
write_state = WR_INDIRECT;
else if (!spa_has_slogs(zilog->zl_spa) &&
resid >= zfs_immediate_write_sz)
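With the new o_direct argument, zfs_log_write() always picks WR_INDIRECT for Direct I/O, since the data block is already on disk and only its pointer needs to be logged. A distilled sketch of the selection (simplified; the real code also chooses between WR_COPIED and WR_NEED_COPY for small buffered writes):

    static itx_wr_state_t
    pick_write_state(boolean_t o_direct, boolean_t throughput_bias,
        boolean_t has_slog, ssize_t resid, ssize_t immediate_write_sz)
    {
    	if (throughput_bias || o_direct)
    		return (WR_INDIRECT);	/* log only the block pointer */
    	if (!has_slog && resid >= immediate_write_sz)
    		return (WR_INDIRECT);
    	return (WR_NEED_COPY);		/* simplified buffered-write default */
    }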
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index f3db953ea..f9cc5b010 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -35,7 +35,6 @@
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
-#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
@@ -75,6 +74,14 @@ int zfs_bclone_enabled = 1;
static int zfs_bclone_wait_dirty = 0;
/*
+ * Enable Direct I/O. If this setting is 0, then all I/O requests will be
+ * directed through the ARC acting as though the dataset property direct was
+ * set to disabled.
+ */
+static int zfs_dio_enabled = 1;
+
+/*
* Maximum bytes to read per chunk in zfs_read().
*/
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
@@ -203,6 +210,77 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
}
/*
+ * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
+ * the "direct" dataset property). When inherited by the property only apply
+ * the O_DIRECT flag to correctly aligned IO requests. The rationale for this
+ * is that it allows the property to be safely set on a dataset without forcing
+ * all of the applications to be aware of the alignment restrictions. When
+ * O_DIRECT is explicitly requested by an application return EINVAL if the
+ * request is unaligned. In all cases, if the range for this request has
+ * been mmap'ed then we will perform buffered I/O to keep the mapped region
+ * synchronized with the ARC.
+ *
+ * It is possible that a file's pages could be mmap'ed after it is checked
+ * here. If so, that is handled accordingly in zfs_write(). See comments in the
+ * following area for how this is handled:
+ * zfs_write() -> update_pages()
+ */
+static int
+zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
+ int *ioflagp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ int ioflag = *ioflagp;
+ int error = 0;
+
+ if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
+ zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
+ /*
+ * Direct I/O is disabled or the region is mmap'ed. In either
+		 * case the I/O request will just be directed through the ARC.
+ */
+ ioflag &= ~O_DIRECT;
+ goto out;
+ } else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
+ zfs_uio_page_aligned(uio) &&
+ zfs_uio_aligned(uio, PAGE_SIZE)) {
+ if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
+ (rw == UIO_READ)) {
+ ioflag |= O_DIRECT;
+ }
+ } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
+ /*
+ * Direct I/O was requested through the direct=always, but it
+ * is not properly PAGE_SIZE aligned. The request will be
+ * directed through the ARC.
+ */
+ ioflag &= ~O_DIRECT;
+ }
+
+ if (ioflag & O_DIRECT) {
+ if (!zfs_uio_page_aligned(uio) ||
+ !zfs_uio_aligned(uio, PAGE_SIZE)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ error = zfs_uio_get_dio_pages_alloc(uio, rw);
+ if (error) {
+ goto out;
+ }
+ }
+
+ IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
+ ASSERT0(error);
+
+out:
+ *ioflagp = ioflag;
+ return (error);
+}
+
+/*
* Read bytes from specified file into supplied buffer.
*
* IN: zp - inode of file to be read from.
@@ -286,24 +364,58 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = 0;
goto out;
}
-
ASSERT(zfs_uio_offset(uio) < zp->z_size);
+
+ /*
+ * Setting up Direct I/O if requested.
+ */
+ error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
+ if (error) {
+ goto out;
+ }
+
#if defined(__linux__)
ssize_t start_offset = zfs_uio_offset(uio);
#endif
+ ssize_t chunk_size = zfs_vnops_read_chunk_size;
ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
ssize_t start_resid = n;
+ ssize_t dio_remaining_resid = 0;
+
+ if (uio->uio_extflg & UIO_DIRECT) {
+ /*
+		 * All pages for an O_DIRECT request have already been mapped
+ * so there's no compelling reason to handle this uio in
+ * smaller chunks.
+ */
+ chunk_size = DMU_MAX_ACCESS;
+
+ /*
+ * In the event that the O_DIRECT request is reading the entire
+		 * file, it is possible the file's length is not page-size
+		 * aligned. However, lower layers expect that the Direct I/O
+		 * request is page-aligned. In this case, as much of the file
+		 * as can be read using Direct I/O is read that way, and the remaining
+ *
+ * This is still consistent with the semantics of Direct I/O in
+ * ZFS as at a minimum the I/O request must be page-aligned.
+ */
+ dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
+ if (dio_remaining_resid != 0)
+ n -= dio_remaining_resid;
+ }
while (n > 0) {
- ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
- P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
+ ssize_t nbytes = MIN(n, chunk_size -
+ P2PHASE(zfs_uio_offset(uio), chunk_size));
#ifdef UIO_NOCOPY
if (zfs_uio_segflg(uio) == UIO_NOCOPY)
error = mappedread_sf(zp, nbytes, uio);
else
#endif
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
- zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
+ zfs_uio_offset(uio) + nbytes - 1)) {
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -332,12 +444,40 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
n -= nbytes;
}
+ if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
+ dio_remaining_resid != 0) {
+ /*
+ * Temporarily remove the UIO_DIRECT flag from the UIO so the
+ * remainder of the file can be read using the ARC.
+ */
+ uio->uio_extflg &= ~UIO_DIRECT;
+
+ if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
+ error = mappedread(zp, dio_remaining_resid, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
+ dio_remaining_resid);
+ }
+ uio->uio_extflg |= UIO_DIRECT;
+
+ if (error != 0)
+ n += dio_remaining_resid;
+ } else if (error && (uio->uio_extflg & UIO_DIRECT)) {
+ n += dio_remaining_resid;
+ }
int64_t nread = start_resid - n;
+
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
- task_io_account_read(nread);
out:
zfs_rangelock_exit(lr);
+ /*
+ * Cleanup for Direct I/O if requested.
+ */
+ if (uio->uio_extflg & UIO_DIRECT)
+ zfs_uio_free_dio_pages(uio, UIO_READ);
+
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -422,6 +562,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
int error = 0, error1;
ssize_t start_resid = zfs_uio_resid(uio);
uint64_t clear_setid_bits_txg = 0;
+ boolean_t o_direct_defer = B_FALSE;
/*
* Fasttrack empty write
@@ -475,6 +616,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}
/*
+ * Setting up Direct I/O if requested.
+ */
+ error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
+ if (error) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(error));
+ }
+
+ /*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
*/
@@ -504,6 +654,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
woff = zp->z_size;
}
zfs_uio_setoffset(uio, woff);
+ /*
+ * We need to update the starting offset as well because it is
+ * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
+ * layers.
+ */
+ zfs_uio_setsoffset(uio, woff);
} else {
/*
* Note that if the file block size will change as a result of
@@ -540,6 +696,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
const uint64_t projid = zp->z_projid;
/*
+ * In the event we are increasing the file block size
+ * (lr_length == UINT64_MAX), we will direct the write to the ARC.
+ * Because zfs_grow_blocksize() will read from the ARC in order to
+ * grow the dbuf, we avoid doing Direct I/O here as that would cause
+ * data written to disk to be overwritten by data in the ARC during
+ * the sync phase. Besides writing data twice to disk, we also
+	 * want to avoid consistency concerns between data in the ARC and
+ * on disk while growing the file's blocksize.
+ *
+ * We will only temporarily remove Direct I/O and put it back after
+ * we have grown the blocksize. We do this in the event a request
+ * is larger than max_blksz, so further requests to
+	 * dmu_write_uio_dbuf() will still issue the requests using
+	 * Direct I/O.
+	 *
+	 * As an example:
+	 * The first block of the file is being written as a 4k request with
+	 * a recordsize of 1K. The first 1K issued in the loop below will go
+ * through the ARC; however, the following 3 1K requests will
+ * use Direct I/O.
+ */
+ if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
+ uio->uio_extflg &= ~UIO_DIRECT;
+ o_direct_defer = B_TRUE;
+ }
+
+ /*
* Write the file in reasonable size chunks. Each chunk is written
* in a separate transaction; this keeps the intent log records small
* and allows us to do more fine-grained space accounting.
@@ -580,6 +763,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
ssize_t nbytes = n;
if (n >= blksz && woff >= zp->z_size &&
P2PHASE(woff, blksz) == 0 &&
+ !(uio->uio_extflg & UIO_DIRECT) &&
(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/*
* This write covers a full block. "Borrow" a buffer
@@ -705,9 +889,30 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_uioskip(uio, nbytes);
tx_bytes = nbytes;
}
+ /*
+ * There is a window where a file's pages can be mmap'ed after
+ * zfs_setup_direct() is called. This is due to the fact that
+ * the rangelock in this function is acquired after calling
+ * zfs_setup_direct(). This is done so that
+ * zfs_uio_prefaultpages() does not attempt to fault in pages
+ * on Linux for Direct I/O requests. This is not necessary as
+ * the pages are pinned in memory and can not be faulted out.
+ * Ideally, the rangelock would be held before calling
+ * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
+ * this can lead to a deadlock as zfs_getpage() also acquires
+ * the rangelock as a RL_WRITER and prefaulting the pages can
+ * lead to zfs_getpage() being called.
+ *
+ * In the case of the pages being mapped after
+ * zfs_setup_direct() is called, the call to update_pages()
+ * will still be made to make sure there is consistency between
+		 * the ARC and the Linux page cache. This is an unfortunate
+		 * situation as the data will be read back into the ARC after
+		 * the Direct I/O write has completed, but this is the penalty
+ * for writing to a mmap'ed region of a file using Direct I/O.
+ */
if (tx_bytes &&
- zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
- !(ioflag & O_DIRECT)) {
+ zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
}
@@ -756,10 +961,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* the TX_WRITE records logged here.
*/
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
- NULL, NULL);
+ uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
+ NULL);
dmu_tx_commit(tx);
+ /*
+ * Direct I/O was deferred in order to grow the first block.
+ * At this point it can be re-enabled for subsequent writes.
+ */
+ if (o_direct_defer) {
+ ASSERT(ioflag & O_DIRECT);
+ uio->uio_extflg |= UIO_DIRECT;
+ o_direct_defer = B_FALSE;
+ }
+
if (error != 0)
break;
ASSERT3S(tx_bytes, ==, nbytes);
@@ -767,10 +983,22 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
pfbytes -= nbytes;
}
+ if (o_direct_defer) {
+ ASSERT(ioflag & O_DIRECT);
+ uio->uio_extflg |= UIO_DIRECT;
+ o_direct_defer = B_FALSE;
+ }
+
zfs_znode_update_vfs(zp);
zfs_rangelock_exit(lr);
/*
+ * Cleanup for Direct I/O if requested.
+ */
+ if (uio->uio_extflg & UIO_DIRECT)
+ zfs_uio_free_dio_pages(uio, UIO_WRITE);
+
+ /*
* If we're in replay mode, or we made no progress, or the
* uio data is inaccessible return an error. Otherwise, it's
* at least a partial write, so it's successful.
@@ -784,9 +1012,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
if (commit)
zil_commit(zilog, zp->z_id);
- const int64_t nwritten = start_resid - zfs_uio_resid(uio);
+ int64_t nwritten = start_resid - zfs_uio_resid(uio);
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
- task_io_account_write(nwritten);
zfs_exit(zfsvfs, FTAG);
return (0);
@@ -846,7 +1073,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
- dmu_buf_t *db;
zgd_t *zgd;
int error = 0;
uint64_t zp_gen;
@@ -890,8 +1116,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+ size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
@@ -929,18 +1155,44 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
zil_fault_io = 0;
}
#endif
+
+ dmu_buf_t *dbp;
if (error == 0)
error = dmu_buf_hold_noread(os, object, offset, zgd,
- &db);
+ &dbp);
if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
+ zgd->zgd_db = dbp;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
+ boolean_t direct_write = B_FALSE;
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr =
+ dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
+ if (dr != NULL && dr->dt.dl.dr_diowrite)
+ direct_write = B_TRUE;
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * All Direct I/O writes will have already completed and
+ * the block pointer can be immediately stored in the
+ * log record.
+ */
+ if (direct_write) {
+ /*
+ * A Direct I/O write always covers an entire
+ * block.
+ */
+ ASSERT3U(dbp->db_size, ==, zp->z_blksz);
+ lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
+ zfs_get_done(zgd, 0);
+ return (0);
+ }
- zgd->zgd_db = db;
+ blkptr_t *bp = &lr->lr_blkptr;
zgd->zgd_bp = bp;
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
+ ASSERT3U(dbp->db_offset, ==, offset);
+ ASSERT3U(dbp->db_size, ==, size);
error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
@@ -975,7 +1227,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
return (error);
}
-
static void
zfs_get_done(zgd_t *zgd, int error)
{
@@ -1559,3 +1810,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
"Wait for dirty blocks when cloning");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
+ "Enable Direct I/O");
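From an application's perspective, zfs_setup_direct() means an explicit O_DIRECT request must be page-aligned in buffer address, offset, and length, or EINVAL is returned under direct=standard; sub-block-size writes may still be routed through the ARC even with O_DIRECT set. A minimal Linux userspace example that meets those constraints (the file path is a placeholder):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
    	long psz = sysconf(_SC_PAGESIZE);
    	void *buf;
    	int fd = open("/tank/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

    	if (fd == -1 || posix_memalign(&buf, psz, psz) != 0) {
    		perror("setup");
    		return (1);
    	}
    	memset(buf, 0xab, psz);
    	/* Page-aligned buffer, offset, and length: eligible for Direct I/O. */
    	if (pwrite(fd, buf, psz, 0) == -1)
    		perror("pwrite");	/* EINVAL would indicate misalignment */
    	free(buf);
    	close(fd);
    	return (0);
    }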
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 53992931e..66a8a9fef 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(*errorp, ==, EIO);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ }
+
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
+ enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ?
+ ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;
- ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
- zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
- zp->zp_compress >= ZIO_COMPRESS_OFF &&
- zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
- DMU_OT_IS_VALID(zp->zp_type) &&
- zp->zp_level < 32 &&
- zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, pipeline);
zio->io_ready = ready;
zio->io_children_ready = children_ready;
@@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ } else if (type == ZIO_TYPE_WRITE &&
+ pio->io_prop.zp_direct_write == B_TRUE) {
+ /*
+		 * By default we will only verify checksums for Direct I/O
+ * writes for Linux. FreeBSD is able to place user pages under
+ * write protection before issuing them to the ZIO pipeline.
+ *
+ * Checksum validation errors will only be reported through
+ * the top-level VDEV, which is set by this child ZIO.
+ */
+ ASSERT3P(bp, !=, NULL);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
}
if (vd->vdev_ops->vdev_op_leaf) {
@@ -3104,6 +3117,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_nopwrite = B_FALSE;
zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_byteorder = gio->io_prop.zp_byteorder;
+ zp.zp_direct_write = B_FALSE;
memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -3577,6 +3591,13 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+ /*
+ * Deduplication will not take place for Direct I/O writes. The
+ * ddt_tree will be emptied in syncing context. Direct I/O writes take
+	 * place in open-context. A Direct I/O write cannot attempt to
+	 * modify the ddt_tree while issuing a write.
+ */
+ ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp);
@@ -4509,6 +4530,19 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_vsd = NULL;
}
+ /*
+ * If a Direct I/O write checksum verify error has occurred then this
+ * I/O should not attempt to be issued again. Instead the EIO will
+ * be returned.
+ */
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ ASSERT3U(zio->io_error, ==, EIO);
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ return (zio);
+ }
+
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
@@ -4822,6 +4856,49 @@ zio_checksum_verify(zio_t *zio)
return (zio);
}
+static zio_t *
+zio_dio_checksum_verify(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ int error;
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+
+ if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0)
+ goto out;
+
+ if ((error = zio_checksum_error(zio, NULL)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_dio_verify_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ zio->io_error = SET_ERROR(EIO);
+ zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+
+ /*
+ * The EIO error must be propagated up to the logical
+ * parent ZIO in zio_notify_parent() so it can be
+ * returned to dmu_write_abd().
+ */
+ zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0);
+ }
+ }
+
+out:
+ return (zio);
+}
+
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
@@ -5152,7 +5229,8 @@ zio_done(zio_t *zio)
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
- !vdev_is_dead(zio->io_vd)) {
+ !vdev_is_dead(zio->io_vd) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
@@ -5167,6 +5245,7 @@ zio_done(zio_t *zio)
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
@@ -5188,7 +5267,8 @@ zio_done(zio_t *zio)
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (IO_IS_ALLOCATING(zio) &&
- !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
@@ -5239,6 +5319,14 @@ zio_done(zio_t *zio)
if (zio->io_reexecute) {
/*
+ * A Direct I/O write that has a checksum verify error should
+ * not attempt to reexecute. Instead, EAGAIN should just be
+		 * propagated back up so the write can then be issued
+ * through the ARC.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+
+ /*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
@@ -5398,6 +5486,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_vdev_io_done,
zio_vdev_io_assess,
zio_checksum_verify,
+ zio_dio_checksum_verify,
zio_done
};
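When zio_dio_checksum_verify() detects that the user buffer changed while in flight, the write fails with ZIO_FLAG_DIO_CHKSUM_ERR set and is deliberately not reexecuted; the expectation is that the caller retries the same range as a buffered (ARC) write. The sketch below is conceptual only: in the patch the retry decision is made higher up the stack, and the specific error code surfaced to the caller is inspected before falling back.

    /* Hypothetical wrapper: retry a failed Direct I/O write through the ARC. */
    static int
    dio_write_with_fallback(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
        dmu_tx_t *tx)
    {
    	int error = dmu_write_uio_direct(dn, uio, size, tx);

    	if (error != 0) {
    		/*
    		 * Clear the Direct I/O flag and reissue the same range as
    		 * a buffered write through the DMU/ARC path.
    		 */
    		uio->uio_extflg &= ~UIO_DIRECT;
    		error = dmu_write_uio_dnode(dn, uio, size, tx);
    	}
    	return (error);
    }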