Diffstat (limited to 'module/os/linux/zfs')
-rw-r--r--  module/os/linux/zfs/abd_os.c        211
-rw-r--r--  module/os/linux/zfs/zfs_racct.c      29
-rw-r--r--  module/os/linux/zfs/zfs_uio.c       295
-rw-r--r--  module/os/linux/zfs/zfs_vfsops.c      1
-rw-r--r--  module/os/linux/zfs/zfs_vnops_os.c   44
-rw-r--r--  module/os/linux/zfs/zpl_file.c       71
6 files changed, 592 insertions, 59 deletions
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 60287ccdd..dae4107e0 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -186,6 +186,7 @@ static int zfs_abd_scatter_min_size = 512 * 3;
abd_t *abd_zero_scatter = NULL;
struct page;
+
/*
* abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
* point to ZERO_PAGE if it is available or it will be an allocated zero'd
@@ -453,14 +454,21 @@ abd_free_chunks(abd_t *abd)
if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
- abd_for_each_sg(abd, sg, nr_pages, i) {
- page = sg_page(sg);
- abd_unmark_zfs_page(page);
- order = compound_order(page);
- __free_pages(page, order);
- ASSERT3U(sg->length, <=, PAGE_SIZE << order);
- ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ /*
+ * Scatter ABDs may be constructed by abd_alloc_from_pages() from
+	 * an array of pages, in which case the underlying pages are not
+	 * owned by the ABD and should not be freed here.
+ */
+ if (!abd_is_from_pages(abd)) {
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ page = sg_page(sg);
+ abd_unmark_zfs_page(page);
+ order = compound_order(page);
+ __free_pages(page, order);
+ ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
}
+
abd_free_sg_table(abd);
}
@@ -551,17 +559,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
void
abd_verify_scatter(abd_t *abd)
{
- size_t n;
- int i = 0;
- struct scatterlist *sg = NULL;
-
ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
ABD_SCATTER(abd).abd_sgl->length);
- n = ABD_SCATTER(abd).abd_nents;
+
+#ifdef ZFS_DEBUG
+ struct scatterlist *sg = NULL;
+ size_t n = ABD_SCATTER(abd).abd_nents;
+ int i = 0;
+
abd_for_each_sg(abd, sg, n, i) {
ASSERT3P(sg_page(sg), !=, NULL);
}
+#endif
}
static void
@@ -687,14 +697,77 @@ abd_free_linear_page(abd_t *abd)
{
/* Transform it back into a scatter ABD for freeing */
struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+
+ /* When backed by user page unmap it */
+ if (abd_is_from_pages(abd))
+ zfs_kunmap(sg_page(sg));
+
abd->abd_flags &= ~ABD_FLAG_LINEAR;
abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
ABD_SCATTER(abd).abd_nents = 1;
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_sgl = sg;
abd_free_chunks(abd);
+}
+
+/*
+ * Allocate a scatter ABD structure from user pages. The pages must be
+ * pinned with get_user_pages, or similar, but need not be mapped via
+ * the kmap interfaces.
+ */
+abd_t *
+abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
+{
+ uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct sg_table table;
+
+ VERIFY3U(size, <=, DMU_MAX_ACCESS);
+ ASSERT3U(offset, <, PAGE_SIZE);
+ ASSERT3P(pages, !=, NULL);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd_t *abd = abd_alloc_struct(0);
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
+ abd->abd_size = size;
+
+ while (sg_alloc_table_from_pages(&table, pages, npages, offset,
+ size, __GFP_NOWARN | GFP_NOIO) != 0) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ if ((offset + size) <= PAGE_SIZE) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's constructed
+ * from a user page can be represented this way as long as the
+ * page is mapped to a virtual address. This allows us to
+	 * page is mapped to a virtual address. This allows us to
+	 * apply an offset into the mapped page.
+	 *
+	 * Note that kmap() must be used, not kmap_atomic(), because
+	 * the mapping needs to be set up on all CPUs. Using kmap()
+	 * also enables the use of highmem pages when required.
+ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ zfs_kmap(sg_page(table.sgl));
+ ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
+ } else {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ ABD_SCATTER(abd).abd_offset = offset;
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+ }
- abd_update_scatter_stats(abd, ABDSTAT_DECR);
+ return (abd);
}
/*
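As a usage sketch (hypothetical caller, not part of this change), a
page-aligned user buffer could be pinned and handed to
abd_alloc_from_pages() like so; the pinning flags and error handling
are simplified:

/*
 * Hypothetical helper: pin a page-aligned user buffer and wrap it in
 * a scatter ABD. A real caller must unpin the pages on a partial
 * failure instead of just returning NULL.
 */
static abd_t *
dio_wrap_user_buffer(unsigned long uaddr, uint64_t size)
{
	long npages = DIV_ROUND_UP(size, PAGE_SIZE);
	struct page **pages;

	/* Direct I/O requires page alignment, so the offset is 0. */
	ASSERT0(uaddr & (PAGE_SIZE - 1));

	pages = vmem_alloc(npages * sizeof (struct page *), KM_SLEEP);
	if (get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages) != npages)
		return (NULL);

	/* The pages are pinned but never kmap()'d by the caller. */
	return (abd_alloc_from_pages(pages, 0, size));
}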
@@ -746,6 +819,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
ABD_SCATTER(abd).abd_offset = new_offset;
ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+ if (abd_is_from_pages(sabd))
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES;
+
return (abd);
}
@@ -874,6 +950,115 @@ abd_cache_reap_now(void)
}
/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, 0);
+ /*
+	 * In the event the ABD is composed of a single user page from Direct
+	 * I/O we can not directly return the raw buffer. This is a consequence
+	 * of not being able to write protect the page: its contents could be
+	 * changed by the user at any time.
+ */
+ if (abd_is_from_pages(abd)) {
+ buf = zio_buf_alloc(n);
+ } else if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+#endif
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+
+ /*
+ * In the event the ABD is composed of a single user page from Direct
+ * I/O we must copy the data into the newly allocated buffer. This is
+ * a consequence of the fact that we can not write protect the user
+ * page, so there is a risk the contents of the page could be changed
+ * by the user at any moment.
+ */
+ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD. If you want any changes you made to
+ * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
+ * ABD is not constructed from user pages for Direct I/O, an ASSERT checks
+ * that the contents of the buffer have not changed since it was borrowed.
+ * We can not make that assertion for an ABD composed of user pages, because
+ * the pages can not be placed under write protection and the user could
+ * have changed their contents at any time.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
+ if (abd_is_from_pages(abd)) {
+ zio_buf_free(buf, n);
+ } else if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else if (abd_is_gang(abd)) {
+#ifdef ZFS_DEBUG
+ /*
+	 * With a gang ABD we must be careful not to ASSERT0 on any
+	 * child ABD that contains user pages from Direct I/O. To
+	 * handle this, iterate through the gang ABD and only verify
+	 * the children that are not built from user pages.
+ */
+ void *cmp_buf = buf;
+
+ for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ if (!abd_is_from_pages(cabd)) {
+ ASSERT0(abd_cmp_buf(cabd, cmp_buf,
+ cabd->abd_size));
+ }
+ cmp_buf = (char *)cmp_buf + cabd->abd_size;
+ }
+#endif
+ zio_buf_free(buf, n);
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
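A minimal sketch of the intended borrow/return round trip (the
memset() stands in for arbitrary work on the borrowed buffer):

static void
zero_abd_via_borrow(abd_t *abd, size_t size)
{
	/*
	 * Borrow a contiguous view of the ABD. For an ABD backed by
	 * user pages this is always a private zio_buf copy, so the
	 * user can not race with the modification below.
	 */
	void *buf = abd_borrow_buf_copy(abd, size);

	memset(buf, 0, size);

	/* Copy the modification back and release the borrowed buffer. */
	abd_return_buf_copy(abd, buf, size);
}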
+/*
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
* It yields the next page struct and data offset and size within it, without
* mapping it into the address space.
diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c
index ce623ef9d..ce197caa4 100644
--- a/module/os/linux/zfs/zfs_racct.c
+++ b/module/os/linux/zfs/zfs_racct.c
@@ -25,14 +25,35 @@
#include <sys/zfs_racct.h>
+#ifdef _KERNEL
+#include <linux/task_io_accounting_ops.h>
+
+void
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ task_io_account_read(size);
+ spa_iostats_read_add(spa, size, iops, flags);
+}
+
void
-zfs_racct_read(uint64_t size, uint64_t iops)
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
- (void) size, (void) iops;
+ task_io_account_write(size);
+ spa_iostats_write_add(spa, size, iops, flags);
}
+#else
+
void
-zfs_racct_write(uint64_t size, uint64_t iops)
+zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
{
- (void) size, (void) iops;
+ (void) spa, (void) size, (void) iops, (void) flags;
}
+
+void
+zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ (void) spa, (void) size, (void) iops, (void) flags;
+}
+
+#endif /* _KERNEL */
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
index a99a1ba88..637f968f8 100644
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -41,12 +41,19 @@
#ifdef _KERNEL
+#include <sys/errno.h>
+#include <sys/vmem.h>
+#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/sysmacros.h>
#include <sys/string.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove);
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
- if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
- /* There's never a need to fault in kernel pages */
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
+ (uio->uio_extflg & UIO_DIRECT)) {
+ /*
+ * There's never a need to fault in kernel pages or Direct I/O
+	 * write pages. Direct I/O write pages have already been pinned,
+	 * so a fault can never occur for them.
+ */
return (0);
#if defined(HAVE_VFS_IOV_ITER)
} else if (uio->uio_segflg == UIO_ITER) {
@@ -437,9 +449,288 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
uio->uio_iovcnt--;
}
}
+
uio->uio_loffset += n;
uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);
+/*
+ * Check if the uio is page-aligned in memory.
+ */
+boolean_t
+zfs_uio_page_aligned(zfs_uio_t *uio)
+{
+ boolean_t aligned = B_TRUE;
+
+ if (uio->uio_segflg == UIO_USERSPACE ||
+ uio->uio_segflg == UIO_SYSSPACE) {
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+
+ for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
+ uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
+ size_t size = iov->iov_len - skip;
+ if ((addr & (PAGE_SIZE - 1)) ||
+ (size & (PAGE_SIZE - 1))) {
+ aligned = B_FALSE;
+ break;
+ }
+ skip = 0;
+ }
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ unsigned long alignment =
+ iov_iter_alignment(uio->uio_iter);
+ aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
+#endif
+ } else {
+ /* Currently not supported */
+ aligned = B_FALSE;
+ }
+
+ return (aligned);
+}
+
+
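For example, with 4 KiB pages the test above reduces to masking with
PAGE_SIZE - 1; an equivalent single-iovec form of the same check
(illustrative only, not part of this change):

static boolean_t
iovec_is_page_aligned(const struct iovec *iov)
{
	uintptr_t addr = (uintptr_t)iov->iov_base;

	/*
	 * With PAGE_SIZE == 4096 the mask is 0xfff: a base of 0x10000
	 * with length 0x1000 passes, while a base of 0x10200 or a
	 * 512-byte length fails.
	 */
	return (((addr | iov->iov_len) & (PAGE_SIZE - 1)) == 0);
}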
+#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
+#define ZFS_MARKED_PAGE 0x0
+#define IS_ZFS_MARKED_PAGE(_p) 0
+#define zfs_mark_page(_p)
+#define zfs_unmark_page(_p)
+#define IS_ZERO_PAGE(_p) 0
+
+#else
+/*
+ * Mark pages to know if they were allocated to replace ZERO_PAGE() for
+ * Direct I/O writes.
+ */
+#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */
+#define IS_ZFS_MARKED_PAGE(_p) \
+ (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
+#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))
+
+static inline void
+zfs_mark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ZFS_MARKED_PAGE);
+}
+
+static inline void
+zfs_unmark_page(struct page *page)
+{
+ ASSERT3P(page, !=, NULL);
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
+
+static void
+zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
+{
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+ lock_page(p);
+
+ if (IS_ZERO_PAGE(p)) {
+ /*
+			 * If the user page points to the kernel's ZERO_PAGE(),
+			 * a new zero-filled page is allocated in its place so
+			 * the contents of the page can not be changed by the
+			 * user while a Direct I/O write is taking place.
+ */
+ gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
+ __GFP_ZERO | GFP_KERNEL;
+
+ ASSERT0(IS_ZFS_MARKED_PAGE(p));
+ unlock_page(p);
+ put_page(p);
+
+			uio->uio_dio.pages[i] =
+			    __page_cache_alloc(gfp_zero_page);
+			zfs_mark_page(uio->uio_dio.pages[i]);
+ } else {
+ unlock_page(p);
+ }
+ }
+}
+
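A hypothetical userspace program that exercises this path: every page
of an untouched anonymous mapping resolves to the kernel's shared
ZERO_PAGE(), so an O_DIRECT write from such a buffer pins the zero
page itself (the file path is illustrative):

#define _GNU_SOURCE	/* O_DIRECT */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 1 << 20;
	/* Never written, so read faults map every page to ZERO_PAGE(). */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("/tank/fs/file", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	ssize_t n = write(fd, buf, len);

	close(fd);
	munmap(buf, len);
	return (n == (ssize_t)len ? 0 : 1);
}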
+void
+zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3P(uio->uio_dio.pages, !=, NULL);
+
+ for (long i = 0; i < uio->uio_dio.npages; i++) {
+ struct page *p = uio->uio_dio.pages[i];
+
+ if (IS_ZFS_MARKED_PAGE(p)) {
+ zfs_unmark_page(p);
+ __free_page(p);
+ continue;
+ }
+
+ put_page(p);
+ }
+
+ vmem_free(uio->uio_dio.pages,
+ uio->uio_dio.npages * sizeof (struct page *));
+}
+
+/*
+ * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's
+ * iov_iter_get_pages().
+ */
+static int
+zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ long *numpages)
+{
+ unsigned long addr = (unsigned long)(v.iov_base);
+ size_t len = v.iov_len;
+ unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ long res = zfs_get_user_pages(
+ P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ,
+ &uio->uio_dio.pages[uio->uio_dio.npages]);
+ if (res < 0) {
+ return (SET_ERROR(-res));
+ } else if (len != (res * PAGE_SIZE)) {
+ return (SET_ERROR(EFAULT));
+ }
+
+ ASSERT3S(len, ==, res * PAGE_SIZE);
+ *numpages = res;
+ return (0);
+}
+
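A worked example of the arithmetic above (values are illustrative and
assume 4 KiB pages):

static void
iov_step_arith_example(void)
{
	unsigned long addr = 0x7f0000002000UL;	/* page-aligned base */
	size_t len = 8192;			/* two pages */
	unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);

	/* A page-aligned 8 KiB iovec pins exactly two whole pages. */
	ASSERT3U(P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), ==, addr);
	ASSERT3U(n, ==, 2);

	/*
	 * If zfs_get_user_pages() pins only one page (res == 1), then
	 * len != res * PAGE_SIZE and the step fails with EFAULT,
	 * leaving the caller to release the page that was pinned.
	 */
}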
+static int
+zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ const struct iovec *iovp = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ size_t len = uio->uio_resid - skip;
+
+ ASSERT(uio->uio_segflg != UIO_SYSSPACE);
+
+ for (int i = 0; i < uio->uio_iovcnt; i++) {
+ struct iovec iov;
+ long numpages = 0;
+
+ if (iovp->iov_len == 0) {
+ iovp++;
+ skip = 0;
+ continue;
+ }
+ iov.iov_len = MIN(len, iovp->iov_len - skip);
+ iov.iov_base = iovp->iov_base + skip;
+ int error = zfs_uio_iov_step(iov, rw, uio, &numpages);
+
+ if (error)
+ return (error);
+
+ uio->uio_dio.npages += numpages;
+ len -= iov.iov_len;
+ skip = 0;
+ iovp++;
+ }
+
+ ASSERT0(len);
+
+ return (0);
+}
+
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ size_t skip = uio->uio_skip;
+ size_t wanted = uio->uio_resid - uio->uio_skip;
+ ssize_t rollback = 0;
+ ssize_t cnt;
+ unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
+
+ while (wanted) {
+#if defined(HAVE_IOV_ITER_GET_PAGES2)
+ cnt = iov_iter_get_pages2(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &skip);
+#else
+ cnt = iov_iter_get_pages(uio->uio_iter,
+ &uio->uio_dio.pages[uio->uio_dio.npages],
+ wanted, maxpages, &skip);
+#endif
+ if (cnt < 0) {
+ iov_iter_revert(uio->uio_iter, rollback);
+ return (SET_ERROR(-cnt));
+ }
+ uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
+ rollback += cnt;
+ wanted -= cnt;
+ skip = 0;
+#if !defined(HAVE_IOV_ITER_GET_PAGES2)
+ /*
+		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does
+		 * not advance the iov_iter on success, so advance it here.
+ */
+ iov_iter_advance(uio->uio_iter, cnt);
+#endif
+
+ }
+ ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
+ iov_iter_revert(uio->uio_iter, rollback);
+
+ return (0);
+}
+#endif /* HAVE_VFS_IOV_ITER */
+
+/*
+ * This function pins user pages. If the pages can not all be pinned
+ * successfully, an error value is returned.
+ *
+ * On success, 0 is returned.
+ */
+int
+zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
+{
+ int error = 0;
+ long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
+ size_t size = npages * sizeof (struct page *);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+ error = zfs_uio_get_dio_pages_iov(uio, rw);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
+ error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
+#endif
+ } else {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ ASSERT3S(uio->uio_dio.npages, >=, 0);
+
+ if (error) {
+ for (long i = 0; i < uio->uio_dio.npages; i++)
+ put_page(uio->uio_dio.pages[i]);
+ vmem_free(uio->uio_dio.pages, size);
+ return (error);
+ } else {
+ ASSERT3S(uio->uio_dio.npages, ==, npages);
+ }
+
+ if (rw == UIO_WRITE) {
+ zfs_uio_dio_check_for_zero_page(uio);
+ }
+
+ uio->uio_extflg |= UIO_DIRECT;
+
+ return (0);
+}
+
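Putting the pieces together, a Direct I/O request would drive the
pin/unpin lifecycle roughly as follows (a sketch only; the actual
dispatch into the DMU read/write paths is elided):

static int
dio_issue(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	/* Pin the user pages and flag the uio as Direct I/O. */
	int error = zfs_uio_get_dio_pages_alloc(uio, rw);
	if (error)
		return (error);

	/* ... wrap uio->uio_dio.pages in an ABD and issue the I/O ... */

	/* Unpin the pages (and free any substituted zero pages). */
	zfs_uio_free_dio_pages(uio, rw);
	return (0);
}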
#endif /* _KERNEL */
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index a52f08868..22a4ad1ef 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -59,6 +59,7 @@
#include <sys/objlist.h>
#include <sys/zpl.h>
#include <linux/vfs_compat.h>
+#include <linux/fs.h>
#include "zfs_comutil.h"
enum {
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 9803c7fec..77e59a3ba 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -296,6 +296,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
+
/*
* If filemap_fault() retries there exists a window
* where the page will be unlocked and not up to date.
@@ -3866,7 +3867,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
}
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
- for_sync ? zfs_putpage_sync_commit_cb :
+ B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
zfs_putpage_async_commit_cb, pp);
dmu_tx_commit(tx);
@@ -4009,6 +4010,7 @@ zfs_inactive(struct inode *ip)
static int
zfs_fillpage(struct inode *ip, struct page *pp)
{
+ znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
@@ -4020,7 +4022,7 @@ zfs_fillpage(struct inode *ip, struct page *pp)
io_len = i_size - io_off;
void *va = kmap(pp);
- int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+ int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
io_len, va, DMU_READ_PREFETCH);
if (io_len != PAGE_SIZE)
memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
@@ -4058,11 +4060,49 @@ zfs_getpage(struct inode *ip, struct page *pp)
zfsvfs_t *zfsvfs = ITOZSB(ip);
znode_t *zp = ITOZ(ip);
int error;
+ loff_t i_size = i_size_read(ip);
+ u_offset_t io_off = page_offset(pp);
+ size_t io_len = PAGE_SIZE;
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
+ ASSERT3U(io_off, <, i_size);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
+
+ /*
+	 * It is important to hold the rangelock here because a Direct I/O
+	 * write or block clone may be taking place at the same time that a
+	 * page is being faulted in through filemap_fault(). With Direct I/O
+	 * writes and block cloning, db->db_data will be set to NULL with
+	 * dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the rangelock
+	 * is not held, there is a race between faulting in a page and writing
+	 * out a Direct I/O write or block clone, and a NULL pointer
+	 * dereference can occur in dmu_read_impl() on db->db_data during the
+	 * memcpy operation when zfs_fillpage() calls dmu_read().
+ */
+ zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+ io_off, io_len, RL_READER);
+ if (lr == NULL) {
+ /*
+ * It is important to drop the page lock before grabbing the
+		 * rangelock to avoid a deadlock between here and
+ * zfs_write() -> update_pages(). update_pages() holds both the
+ * rangelock and the page lock.
+ */
+ get_page(pp);
+ unlock_page(pp);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
+ io_len, RL_READER);
+ lock_page(pp);
+ put_page(pp);
+ }
error = zfs_fillpage(ip, pp);
+ zfs_rangelock_exit(lr);
+
if (error == 0)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
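The same back-off pattern in miniature, using pthreads for
illustration (not code from this change): one path holds the page
lock and needs the rangelock, while update_pages() takes the
rangelock first and then the page lock.

#include <pthread.h>

static pthread_mutex_t page_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t range_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Called with page_mtx held; returns with both locks held. */
static void
take_range_with_page_held(void)
{
	if (pthread_mutex_trylock(&range_mtx) != 0) {
		/* Back off so the opposite lock order can make progress. */
		pthread_mutex_unlock(&page_mtx);
		pthread_mutex_lock(&range_mtx);	/* safe to block now */
		pthread_mutex_lock(&page_mtx);
	}
}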
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 9dec52215..6b16faa2b 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -322,14 +322,14 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t read = count - uio.uio_resid;
kiocb->ki_pos += read;
@@ -384,14 +384,14 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_write(ITOZ(ip), &uio,
+ ret = -zfs_write(ITOZ(ip), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;
@@ -422,14 +422,14 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
- filp->f_flags | zfs_io_flags(kiocb), cr);
+ ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+	    filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t read = count - uio.uio_resid;
kiocb->ki_pos += read;
@@ -467,53 +467,57 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
crhold(cr);
cookie = spl_fstrans_mark();
- int error = -zfs_write(ITOZ(ip), &uio,
+ ret = -zfs_write(ITOZ(ip), &uio,
filp->f_flags | zfs_io_flags(kiocb), cr);
spl_fstrans_unmark(cookie);
crfree(cr);
- if (error < 0)
- return (error);
+ if (ret < 0)
+ return (ret);
ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;
return (wrote);
}
+
#endif /* HAVE_VFS_RW_ITERATE */
-#if defined(HAVE_VFS_RW_ITERATE)
static ssize_t
-zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+zpl_direct_IO_impl(void)
{
- if (rw == WRITE)
- return (zpl_iter_write(kiocb, iter));
- else
- return (zpl_iter_read(kiocb, iter));
+ /*
+ * All O_DIRECT requests should be handled by
+	 * zpl_{iter/aio}_{write/read}(). The generic kernel code should never
+	 * call the direct_IO address_space_operations function, so this code
+	 * path is made fatal if it is ever executed.
+ */
+ PANIC(0);
+ return (0);
}
+
+#if defined(HAVE_VFS_RW_ITERATE)
#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(rw, kiocb, iter));
+ return (zpl_direct_IO_impl());
}
#else
-#error "Unknown direct IO interface"
+#error "Unknown Direct I/O interface"
#endif
#else /* HAVE_VFS_RW_ITERATE */
@@ -523,26 +527,16 @@ static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
loff_t pos, unsigned long nr_segs)
{
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iov, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iov, nr_segs, pos));
+ return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
- const struct iovec *iovp = iov_iter_iovec(iter);
- unsigned long nr_segs = iter->nr_segs;
-
- ASSERT3S(pos, ==, kiocb->ki_pos);
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+ return (zpl_direct_IO_impl());
}
#else
-#error "Unknown direct IO interface"
+#error "Unknown Direct I/O interface"
#endif
#endif /* HAVE_VFS_RW_ITERATE */
@@ -627,6 +621,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
(size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
spl_fstrans_unmark(cookie);
+
if (error)
return (error);