Diffstat (limited to 'module')
30 files changed, 2256 insertions, 302 deletions
diff --git a/module/Kbuild.in b/module/Kbuild.in index 0472a9348..d96347bad 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -327,6 +327,7 @@ ZFS_OBJS := \ ddt_stats.o \ ddt_zap.o \ dmu.o \ + dmu_direct.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 9161204c9..188f5ad2d 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -257,6 +257,7 @@ SRCS+= abd.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ + dmu_direct.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c index 17886cbeb..74cbe36bb 100644 --- a/module/os/freebsd/spl/spl_uio.c +++ b/module/os/freebsd/spl/spl_uio.c @@ -44,6 +44,10 @@ #include <sys/uio_impl.h> #include <sys/vnode.h> #include <sys/zfs_znode.h> +#include <sys/byteorder.h> +#include <sys/lock.h> +#include <sys/vm.h> +#include <vm/vm_map.h> static void zfs_freeuio(struct uio *uio) @@ -115,3 +119,200 @@ zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) ASSERT3U(zfs_uio_rw(uio), ==, dir); return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio))); } + +/* + * Check if the uio is page-aligned in memory. + */ +boolean_t +zfs_uio_page_aligned(zfs_uio_t *uio) +{ + const struct iovec *iov = GET_UIO_STRUCT(uio)->uio_iov; + + for (int i = zfs_uio_iovcnt(uio); i > 0; iov++, i--) { + uintptr_t addr = (uintptr_t)iov->iov_base; + size_t size = iov->iov_len; + if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void +zfs_uio_set_pages_to_stable(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT3S(uio->uio_dio.npages, >, 0); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + ASSERT3P(page, !=, NULL); + + MPASS(page == PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(page))); + vm_page_busy_acquire(page, VM_ALLOC_SBUSY); + pmap_remove_write(page); + } +} + +static void +zfs_uio_release_stable_pages(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + + ASSERT3P(page, !=, NULL); + vm_page_sunbusy(page); + } +} + +/* + * If the operation is marked as read, then we are stating the pages will be + * written to and must be given write access. + */ +static int +zfs_uio_hold_pages(unsigned long start, size_t len, int nr_pages, + zfs_uio_rw_t rw, vm_page_t *pages) +{ + vm_map_t map; + vm_prot_t prot; + int count; + + map = &curthread->td_proc->p_vmspace->vm_map; + ASSERT3S(len, >, 0); + + prot = rw == UIO_READ ? 
(VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ; + count = vm_fault_quick_hold_pages(map, start, len, prot, pages, + nr_pages); + + return (count); +} + +void +zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT(zfs_uio_rw(uio) == rw); + + if (rw == UIO_WRITE) + zfs_uio_release_stable_pages(uio); + + vm_page_unhold_pages(&uio->uio_dio.pages[0], + uio->uio_dio.npages); + + kmem_free(uio->uio_dio.pages, + uio->uio_dio.npages * sizeof (vm_page_t)); +} + +static int +zfs_uio_get_user_pages(unsigned long start, int nr_pages, + size_t len, zfs_uio_rw_t rw, vm_page_t *pages) +{ + int count; + + count = zfs_uio_hold_pages(start, len, nr_pages, rw, pages); + + if (count != nr_pages) { + if (count > 0) + vm_page_unhold_pages(pages, count); + return (0); + } + + ASSERT3S(count, ==, nr_pages); + + return (count); +} + +static int +zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages) +{ + unsigned long addr = (unsigned long)(v.iov_base); + size_t len = v.iov_len; + int n = DIV_ROUND_UP(len, PAGE_SIZE); + + int res = zfs_uio_get_user_pages( + P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, len, + zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]); + + if (res != n) + return (SET_ERROR(EFAULT)); + + ASSERT3U(len, ==, res * PAGE_SIZE); + *numpages = res; + return (0); +} + +static int +zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) +{ + const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov; + size_t len = zfs_uio_resid(uio); + + for (int i = 0; i < zfs_uio_iovcnt(uio); i++) { + struct iovec iov; + int numpages = 0; + + if (iovp->iov_len == 0) { + iovp++; + continue; + } + iov.iov_len = MIN(len, iovp->iov_len); + iov.iov_base = iovp->iov_base; + int error = zfs_uio_iov_step(iov, uio, &numpages); + + if (error) + return (error); + + uio->uio_dio.npages += numpages; + len -= iov.iov_len; + iovp++; + } + + ASSERT0(len); + + return (0); +} + +/* + * This function holds user pages into the kernel. In the event that the user + * pages are not successfully held an error value is returned. + * + * On success, 0 is returned. + */ +int +zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + int error = 0; + int npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE); + size_t size = npages * sizeof (vm_page_t); + + ASSERT(zfs_uio_rw(uio) == rw); + + uio->uio_dio.pages = kmem_alloc(size, KM_SLEEP); + + error = zfs_uio_get_dio_pages_impl(uio); + + if (error) { + vm_page_unhold_pages(&uio->uio_dio.pages[0], + uio->uio_dio.npages); + kmem_free(uio->uio_dio.pages, size); + return (error); + } + + ASSERT3S(uio->uio_dio.npages, >, 0); + + /* + * Since we will be writing the user pages we must make sure that + * they are stable. That way the contents of the pages can not change + * while we are doing: compression, checksumming, encryption, parity + * calculations or deduplication. 
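+ * + * An illustrative failure mode (hypothetical scenario, not part of this + * change): without stable pages the user could rewrite the buffer after + * the checksum has been computed but before the data is written, so the + * checksum on disk would never match the block's contents and every + * subsequent read of that block would fail verification.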
+ */ + if (zfs_uio_rw(uio) == UIO_WRITE) + zfs_uio_set_pages_to_stable(uio); + + uio->uio_extflg |= UIO_DIRECT; + + return (0); +} diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index f24ea3dc7..f20dc5d8c 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -32,6 +32,7 @@ #include <sys/zio.h> #include <sys/zfs_context.h> #include <sys/zfs_znode.h> +#include <sys/vm.h> typedef struct abd_stats { kstat_named_t abdstat_struct_size; @@ -135,7 +136,9 @@ abd_size_alloc_linear(size_t size) void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { - uint_t n = abd_scatter_chunkcnt(abd); + uint_t n; + + n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = (n << PAGE_SHIFT) - abd->abd_size; if (op == ABDSTAT_INCR) { @@ -198,10 +201,16 @@ abd_free_chunks(abd_t *abd) { uint_t i, n; - n = abd_scatter_chunkcnt(abd); - for (i = 0; i < n; i++) { - kmem_cache_free(abd_chunk_cache, - ABD_SCATTER(abd).abd_chunks[i]); + /* + * Scatter ABDs may be constructed by abd_alloc_from_pages() from + * an array of pages, in which case they should not be freed. + */ + if (!abd_is_from_pages(abd)) { + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + kmem_cache_free(abd_chunk_cache, + ABD_SCATTER(abd).abd_chunks[i]); + } } } @@ -342,11 +351,8 @@ abd_fini(void) void abd_free_linear_page(abd_t *abd) { - /* - * FreeBSD does not have scatter linear pages - * so there is an error. - */ - VERIFY(0); + ASSERT3P(abd->abd_u.abd_linear.sf, !=, NULL); + zfs_unmap_page(abd->abd_u.abd_linear.sf); } /* @@ -365,6 +371,26 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) return (abd_alloc_linear(size, is_metadata)); } +static abd_t * +abd_get_offset_from_pages(abd_t *abd, abd_t *sabd, size_t chunkcnt, + size_t new_offset) +{ + ASSERT(abd_is_from_pages(sabd)); + + /* + * Set the child chunks to point at the parent chunks as + * the chunks are just pages and we don't want to copy them. + */ + size_t parent_offset = new_offset / PAGE_SIZE; + ASSERT3U(parent_offset, <, abd_scatter_chunkcnt(sabd)); + for (int i = 0; i < chunkcnt; i++) + ABD_SCATTER(abd).abd_chunks[i] = + ABD_SCATTER(sabd).abd_chunks[parent_offset + i]; + + abd->abd_flags |= ABD_FLAG_FROM_PAGES; + return (abd); +} + abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) @@ -399,6 +425,11 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK; + if (abd_is_from_pages(sabd)) { + return (abd_get_offset_from_pages(abd, sabd, chunkcnt, + new_offset)); + } + /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT], @@ -408,6 +439,44 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, } /* + * Allocate a scatter ABD structure from user pages. + */ +abd_t * +abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size) +{ + VERIFY3U(size, <=, DMU_MAX_ACCESS); + ASSERT3U(offset, <, PAGE_SIZE); + ASSERT3P(pages, !=, NULL); + + abd_t *abd = abd_alloc_struct(size); + abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES; + abd->abd_size = size; + + if ((offset + size) <= PAGE_SIZE) { + /* + * There is only a single page worth of data, so we will just + * use a linear ABD. We have to make sure to take into account + * the offset though. In all other cases our offset will be 0 + * as we are always PAGE_SIZE aligned.
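+ * + * Worked example (values hypothetical): offset = 512 and size = 2048 + * gives offset + size = 2560 <= PAGE_SIZE, so the data fits in pages[0] + * and a linear ABD mapped at byte 512 of that page suffices; offset = 0 + * with size = 8192 instead takes the scatter branch below and references + * two pages directly.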
+ */ + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; + ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0], + &abd->abd_u.abd_linear.sf) + offset; + } else { + ABD_SCATTER(abd).abd_offset = offset; + ASSERT0(ABD_SCATTER(abd).abd_offset); + + /* + * Setting the ABD's abd_chunks to point to the user pages. + */ + for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++) + ABD_SCATTER(abd).abd_chunks[i] = pages[i]; + } + + return (abd); +} + +/* * Initialize the abd_iter. */ void @@ -468,6 +537,16 @@ abd_iter_map(struct abd_iter *aiter) if (abd_is_linear(abd)) { aiter->iter_mapsize = abd->abd_size - offset; paddr = ABD_LINEAR_BUF(abd); + } else if (abd_is_from_pages(abd)) { + aiter->sf = NULL; + offset += ABD_SCATTER(abd).abd_offset; + size_t index = offset / PAGE_SIZE; + offset &= PAGE_MASK; + aiter->iter_mapsize = MIN(PAGE_SIZE - offset, + abd->abd_size - aiter->iter_pos); + paddr = zfs_map_page( + ABD_SCATTER(aiter->iter_abd).abd_chunks[index], + &aiter->sf); } else { offset += ABD_SCATTER(abd).abd_offset; paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT]; @@ -490,6 +569,12 @@ abd_iter_unmap(struct abd_iter *aiter) ASSERT3U(aiter->iter_mapsize, >, 0); } + if (abd_is_from_pages(aiter->iter_abd) && + !abd_is_linear_page(aiter->iter_abd)) { + ASSERT3P(aiter->sf, !=, NULL); + zfs_unmap_page(aiter->sf); + } + aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } @@ -499,3 +584,67 @@ abd_cache_reap_now(void) { kmem_cache_reap_soon(abd_chunk_cache); } + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, 0); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead.
+ */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} diff --git a/module/os/freebsd/zfs/zfs_racct.c b/module/os/freebsd/zfs/zfs_racct.c index 883255bc1..2989a9af9 100644 --- a/module/os/freebsd/zfs/zfs_racct.c +++ b/module/os/freebsd/zfs/zfs_racct.c @@ -27,7 +27,7 @@ #include <sys/racct.h> void -zfs_racct_read(uint64_t size, uint64_t iops) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { curthread->td_ru.ru_inblock += iops; #ifdef RACCT @@ -40,10 +40,12 @@ zfs_racct_read(uint64_t size, uint64_t iops) #else (void) size; #endif /* RACCT */ + + spa_iostats_read_add(spa, size, iops, flags); } void -zfs_racct_write(uint64_t size, uint64_t iops) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { curthread->td_ru.ru_oublock += iops; #ifdef RACCT @@ -56,4 +58,6 @@ zfs_racct_write(uint64_t size, uint64_t iops) #else (void) size; #endif /* RACCT */ + + spa_iostats_write_add(spa, size, iops, flags); } diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 01b964f98..5dbca10a3 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -4131,7 +4131,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, * but that would make the locking messier */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, - len, commit, NULL, NULL); + len, commit, B_FALSE, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { @@ -4266,6 +4266,8 @@ ioflags(int ioflags) flags |= O_APPEND; if (ioflags & IO_NDELAY) flags |= O_NONBLOCK; + if (ioflags & IO_DIRECT) + flags |= O_DIRECT; if (ioflags & IO_SYNC) flags |= O_SYNC; @@ -4285,9 +4287,36 @@ static int zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; + int error = 0; zfs_uio_init(&uio, ap->a_uio); - return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), - ap->a_cred)); + error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), + ap->a_cred); + /* + * XXX We occasionally get an EFAULT for Direct I/O reads on + * FreeBSD 13. This still needs to be resolved. The EFAULT comes + * from: + * zfs_uio_get_dio_pages_alloc() -> + * zfs_uio_get_dio_pages_impl() -> + * zfs_uio_iov_step() -> + * zfs_uio_get_user_pages(). + * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O + * read fails to map in the user pages (returning EFAULT) the + * Direct I/O request is broken up into two separate IO requests + * and issued separately using Direct I/O.
+ */ +#ifdef ZFS_DEBUG + if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) { +#if 0 + printf("%s(%d): Direct I/O read returning EFAULT " + "uio = %p, zfs_uio_offset(uio) = %lu " + "zfs_uio_resid(uio) = %lu\n", + __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), + zfs_uio_resid(&uio)); +#endif + } + +#endif + return (error); } #ifndef _SYS_SYSPROTO_H_ diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index ddb20b031..c3be4730d 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -922,6 +922,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) if (commit) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); + return (error); } diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 60287ccdd..dae4107e0 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -186,6 +186,7 @@ static int zfs_abd_scatter_min_size = 512 * 3; abd_t *abd_zero_scatter = NULL; struct page; + /* * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will * point to ZERO_PAGE if it is available or it will be an allocated zero'd @@ -453,14 +454,21 @@ abd_free_chunks(abd_t *abd) if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - abd_unmark_zfs_page(page); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + /* + * Scatter ABDs may be constructed by abd_alloc_from_pages() from + * an array of pages, in which case they should not be freed. + */ + if (!abd_is_from_pages(abd)) { + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } } + abd_free_sg_table(abd); } @@ -551,17 +559,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; + +#ifdef ZFS_DEBUG + struct scatterlist *sg = NULL; + size_t n = ABD_SCATTER(abd).abd_nents; + int i = 0; + abd_for_each_sg(abd, sg, n, i) { ASSERT3P(sg_page(sg), !=, NULL); } +#endif } static void @@ -687,14 +697,77 @@ abd_free_linear_page(abd_t *abd) { /* Transform it back into a scatter ABD for freeing */ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + + /* When backed by a user page, unmap it */ + if (abd_is_from_pages(abd)) + zfs_kunmap(sg_page(sg)); + abd->abd_flags &= ~ABD_FLAG_LINEAR; abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; ABD_SCATTER(abd).abd_nents = 1; ABD_SCATTER(abd).abd_offset = 0; ABD_SCATTER(abd).abd_sgl = sg; abd_free_chunks(abd); +} + +/* + * Allocate a scatter ABD structure from user pages. The pages must be + * pinned with get_user_pages, or similar, but need not be mapped via + * the kmap interfaces.
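+ * + * A hedged usage sketch (the caller shown is illustrative only): a + * page-aligned Direct I/O request that has already pinned its pages with + * zfs_uio_get_dio_pages_alloc() could wrap them as + * + * abd_t *abd = abd_alloc_from_pages(uio->uio_dio.pages, 0, + * zfs_uio_resid(uio));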
+ */ +abd_t * +abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) +{ + uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE); + struct sg_table table; + + VERIFY3U(size, <=, DMU_MAX_ACCESS); + ASSERT3U(offset, <, PAGE_SIZE); + ASSERT3P(pages, !=, NULL); + + /* + * Even if this buf is filesystem metadata, we only track that we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd_t *abd = abd_alloc_struct(0); + abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER; + abd->abd_size = size; + + while (sg_alloc_table_from_pages(&table, pages, npages, offset, + size, __GFP_NOWARN | GFP_NOIO) != 0) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + if ((offset + size) <= PAGE_SIZE) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's constructed + * from a user page can be represented this way as long as the + * page is mapped to a virtual address. This allows us to + * apply an offset into the mapped page. + * + * Note that kmap() must be used, not kmap_atomic(), because + * the mapping needs to be set up on all CPUs. Using kmap() + * also enables the use of highmem pages when required. + */ + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + zfs_kmap(sg_page(table.sgl)); + ABD_LINEAR_BUF(abd) = sg_virt(table.sgl); + } else { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + ABD_SCATTER(abd).abd_offset = offset; + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + + ASSERT0(ABD_SCATTER(abd).abd_offset); + } - abd_update_scatter_stats(abd, ABDSTAT_DECR); + return (abd); } /* @@ -746,6 +819,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, ABD_SCATTER(abd).abd_offset = new_offset; ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + if (abd_is_from_pages(sabd)) + abd->abd_flags |= ABD_FLAG_FROM_PAGES; + return (abd); } @@ -874,6 +950,115 @@ abd_cache_reap_now(void) } /* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, 0); + /* + * In the event the ABD is composed of a single user page from Direct + * I/O we can not directly return the raw buffer. This is a consequence + * of not being able to write protect the page and the contents of the + * page can be changed at any time by the user. + */ + if (abd_is_from_pages(abd)) { + buf = zio_buf_alloc(n); + } else if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + + /* + * In the event the ABD is composed of a single user page from Direct + * I/O we must make sure to copy the data over into the newly allocated + * buffer. This is a consequence of the fact that we can not write + * protect the user page and there is a risk the contents of the page + * could be changed by the user at any moment.
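+ * + * (Illustrative: skipping this copy would let a concurrent user-space + * store to the mapped page change the borrowed buffer underneath the + * consumer that borrowed it.)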
+ */ + if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD. If you want any changes you made to + * buf to be copied back to abd, use abd_return_buf_copy() instead. If the + * ABD is not constructed from user pages for Direct I/O then an ASSERT + * checks to make sure the contents of the buffer have not changed since it was + * borrowed. We can not ASSERT that the contents of the buffer have not changed + * if it is composed of user pages because the pages can not be placed under + * write protection and the user could have possibly changed the contents in + * the pages at any time. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_from_pages(abd)) { + zio_buf_free(buf, n); + } else if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else if (abd_is_gang(abd)) { +#ifdef ZFS_DEBUG + /* + * We have to be careful with gang ABD's that we do not ASSERT0 + * for any ABD's that contain user pages from Direct I/O. In + * order to handle this, we just iterate through the gang ABD + * and only verify ABDs that are not from user pages. + */ + void *cmp_buf = buf; + + for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + if (!abd_is_from_pages(cabd)) { + ASSERT0(abd_cmp_buf(cabd, cmp_buf, + cabd->abd_size)); + } + cmp_buf = (char *)cmp_buf + cabd->abd_size; + } +#endif + zio_buf_free(buf, n); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* * This is abd_iter_page(), the function underneath abd_iterate_page_func(). * It yields the next page struct and data offset and size within it, without * mapping it into the address space.
diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c index ce623ef9d..ce197caa4 100644 --- a/module/os/linux/zfs/zfs_racct.c +++ b/module/os/linux/zfs/zfs_racct.c @@ -25,14 +25,35 @@ #include <sys/zfs_racct.h> +#ifdef _KERNEL +#include <linux/task_io_accounting_ops.h> + +void +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + task_io_account_read(size); + spa_iostats_read_add(spa, size, iops, flags); +} + void -zfs_racct_read(uint64_t size, uint64_t iops) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { - (void) size, (void) iops; + task_io_account_write(size); + spa_iostats_write_add(spa, size, iops, flags); } +#else + void -zfs_racct_write(uint64_t size, uint64_t iops) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { - (void) size, (void) iops; + (void) spa, (void) size, (void) iops, (void) flags; } + +void +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + (void) spa, (void) size, (void) iops, (void) flags; +} + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index a99a1ba88..637f968f8 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -41,12 +41,19 @@ #ifdef _KERNEL +#include <sys/errno.h> +#include <sys/vmem.h> +#include <sys/sysmacros.h> #include <sys/types.h> #include <sys/uio_impl.h> #include <sys/sysmacros.h> #include <sys/string.h> +#include <sys/zfs_refcount.h> +#include <sys/zfs_debug.h> #include <linux/kmap_compat.h> #include <linux/uaccess.h> +#include <linux/pagemap.h> +#include <linux/mman.h> /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove); int zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) { - if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) { - /* There's never a need to fault in kernel pages */ + if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC || + (uio->uio_extflg & UIO_DIRECT)) { + /* + * There's never a need to fault in kernel pages or Direct I/O + * write pages. Direct I/O write pages have been pinned, so a + * fault can never occur for them. + */ return (0); #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { @@ -437,9 +449,288 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) uio->uio_iovcnt--; } } + uio->uio_loffset += n; uio->uio_resid -= n; } EXPORT_SYMBOL(zfs_uioskip); +/* + * Check if the uio is page-aligned in memory.
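+ * + * Example (hypothetical values): an iovec with iov_base 0x7f0000001000 + * and iov_len 8192 is accepted, while iov_base 0x7f0000001200 or + * iov_len 6144 causes B_FALSE to be returned.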
+ */ +boolean_t +zfs_uio_page_aligned(zfs_uio_t *uio) +{ + boolean_t aligned = B_TRUE; + + if (uio->uio_segflg == UIO_USERSPACE || + uio->uio_segflg == UIO_SYSSPACE) { + const struct iovec *iov = uio->uio_iov; + size_t skip = uio->uio_skip; + + for (int i = uio->uio_iovcnt; i > 0; iov++, i--) { + uintptr_t addr = (uintptr_t)(iov->iov_base + skip); + size_t size = iov->iov_len - skip; + if ((addr & (PAGE_SIZE - 1)) || + (size & (PAGE_SIZE - 1))) { + aligned = B_FALSE; + break; + } + skip = 0; + } +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + unsigned long alignment = + iov_iter_alignment(uio->uio_iter); + aligned = IS_P2ALIGNED(alignment, PAGE_SIZE); +#endif + } else { + /* Currently not supported */ + aligned = B_FALSE; + } + + return (aligned); +} + + +#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64) +#define ZFS_MARKED_PAGE 0x0 +#define IS_ZFS_MARKED_PAGE(_p) 0 +#define zfs_mark_page(_p) +#define zfs_unmark_page(_p) +#define IS_ZERO_PAGE(_p) 0 + +#else +/* + * Mark pages to know if they were allocated to replace ZERO_PAGE() for + * Direct I/O writes. + */ +#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */ +#define IS_ZFS_MARKED_PAGE(_p) \ + (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE) +#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0)) + +static inline void +zfs_mark_page(struct page *page) +{ + ASSERT3P(page, !=, NULL); + get_page(page); + SetPagePrivate(page); + set_page_private(page, ZFS_MARKED_PAGE); +} + +static inline void +zfs_unmark_page(struct page *page) +{ + ASSERT3P(page, !=, NULL); + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */ + +static void +zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + + for (long i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; + lock_page(p); + + if (IS_ZERO_PAGE(p)) { + /* + * If the user page points to the kernel's ZERO_PAGE(), + * a new zero filled page will just be allocated so the + * contents of the page can not be changed by the user + * while a Direct I/O write is taking place. + */ + gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO | + __GFP_ZERO | GFP_KERNEL; + + ASSERT0(IS_ZFS_MARKED_PAGE(p)); + unlock_page(p); + put_page(p); + + p = __page_cache_alloc(gfp_zero_page); + zfs_mark_page(p); + uio->uio_dio.pages[i] = p; + } else { + unlock_page(p); + } + } +} + +void +zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3P(uio->uio_dio.pages, !=, NULL); + + for (long i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; + + if (IS_ZFS_MARKED_PAGE(p)) { + zfs_unmark_page(p); + __free_page(p); + continue; + } + + put_page(p); + } + + vmem_free(uio->uio_dio.pages, + uio->uio_dio.npages * sizeof (struct page *)); +} + +/* + * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's + * iov_iter_get_pages().
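+ * + * For an iovec at addr with length len it pins the + * DIV_ROUND_UP(len, PAGE_SIZE) pages backing [addr, addr + len) and + * returns EFAULT unless every one of those pages could be pinned.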
+ */ +static int +zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, + long *numpages) +{ + unsigned long addr = (unsigned long)(v.iov_base); + size_t len = v.iov_len; + unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE); + + long res = zfs_get_user_pages( + P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ, + &uio->uio_dio.pages[uio->uio_dio.npages]); + if (res < 0) { + return (SET_ERROR(-res)); + } else if (len != (res * PAGE_SIZE)) { + return (SET_ERROR(EFAULT)); + } + + ASSERT3S(len, ==, res * PAGE_SIZE); + *numpages = res; + return (0); +} + +static int +zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + const struct iovec *iovp = uio->uio_iov; + size_t skip = uio->uio_skip; + size_t len = uio->uio_resid - skip; + + ASSERT(uio->uio_segflg != UIO_SYSSPACE); + + for (int i = 0; i < uio->uio_iovcnt; i++) { + struct iovec iov; + long numpages = 0; + + if (iovp->iov_len == 0) { + iovp++; + skip = 0; + continue; + } + iov.iov_len = MIN(len, iovp->iov_len - skip); + iov.iov_base = iovp->iov_base + skip; + int error = zfs_uio_iov_step(iov, rw, uio, &numpages); + + if (error) + return (error); + + uio->uio_dio.npages += numpages; + len -= iov.iov_len; + skip = 0; + iovp++; + } + + ASSERT0(len); + + return (0); +} + +#if defined(HAVE_VFS_IOV_ITER) +static int +zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + size_t skip = uio->uio_skip; + size_t wanted = uio->uio_resid - uio->uio_skip; + ssize_t rollback = 0; + ssize_t cnt; + unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); + + while (wanted) { +#if defined(HAVE_IOV_ITER_GET_PAGES2) + cnt = iov_iter_get_pages2(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &skip); +#else + cnt = iov_iter_get_pages(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &skip); +#endif + if (cnt < 0) { + iov_iter_revert(uio->uio_iter, rollback); + return (SET_ERROR(-cnt)); + } + uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE); + rollback += cnt; + wanted -= cnt; + skip = 0; +#if !defined(HAVE_IOV_ITER_GET_PAGES2) + /* + * iov_iter_get_pages2() advances the iov_iter on success. + */ + iov_iter_advance(uio->uio_iter, cnt); +#endif + + } + ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); + iov_iter_revert(uio->uio_iter, rollback); + + return (0); +} +#endif /* HAVE_VFS_IOV_ITER */ + +/* + * This function pins user pages. In the event that the user pages were not + * successfully pinned an error value is returned. + * + * On success, 0 is returned. 
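+ * + * A minimal calling sketch (error handling elided; all names are from + * this change): + * + * if (zfs_uio_page_aligned(uio) && + * zfs_uio_get_dio_pages_alloc(uio, UIO_WRITE) == 0) { + * (issue the Direct I/O write) + * zfs_uio_free_dio_pages(uio, UIO_WRITE); + * }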
+ */ +int +zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + int error = 0; + long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); + size_t size = npages * sizeof (struct page *); + + if (uio->uio_segflg == UIO_USERSPACE) { + uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); + error = zfs_uio_get_dio_pages_iov(uio, rw); +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); + error = zfs_uio_get_dio_pages_iov_iter(uio, rw); +#endif + } else { + return (SET_ERROR(EOPNOTSUPP)); + } + + ASSERT3S(uio->uio_dio.npages, >=, 0); + + if (error) { + for (long i = 0; i < uio->uio_dio.npages; i++) + put_page(uio->uio_dio.pages[i]); + vmem_free(uio->uio_dio.pages, size); + return (error); + } else { + ASSERT3S(uio->uio_dio.npages, ==, npages); + } + + if (rw == UIO_WRITE) { + zfs_uio_dio_check_for_zero_page(uio); + } + + uio->uio_extflg |= UIO_DIRECT; + + return (0); +} + #endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index a52f08868..22a4ad1ef 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -59,6 +59,7 @@ #include <sys/objlist.h> #include <sys/zpl.h> #include <linux/vfs_compat.h> +#include <linux/fs.h> #include "zfs_comutil.h" enum { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9803c7fec..77e59a3ba 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -296,6 +296,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { + /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. @@ -3866,7 +3867,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, - for_sync ? zfs_putpage_sync_commit_cb : + B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); @@ -4009,6 +4010,7 @@ zfs_inactive(struct inode *ip) static int zfs_fillpage(struct inode *ip, struct page *pp) { + znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); @@ -4020,7 +4022,7 @@ zfs_fillpage(struct inode *ip, struct page *pp) io_len = i_size - io_off; void *va = kmap(pp); - int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); @@ -4058,11 +4060,49 @@ zfs_getpage(struct inode *ip, struct page *pp) zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *zp = ITOZ(ip); int error; + loff_t i_size = i_size_read(ip); + u_offset_t io_off = page_offset(pp); + size_t io_len = PAGE_SIZE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); + ASSERT3U(io_off, <, i_size); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; + + /* + * It is important to hold the rangelock here because it is possible + * a Direct I/O write or block clone might be taking place at the same + * time that a page is being faulted in through filemap_fault(). With + * Direct I/O writes and block cloning db->db_data will be set to NULL + * with dbuf_clear_data() in dmu_buf_will_clone_or_dio().
If the + * rangelock is not held, then there is a race between faulting in a + * page and writing out a Direct I/O write or block cloning. Without + * the rangelock a NULL pointer dereference can occur in + * dmu_read_impl() for db->db_data during the memcpy operation when + * zfs_fillpage() calls dmu_read(). + */ + zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock, + io_off, io_len, RL_READER); + if (lr == NULL) { + /* + * It is important to drop the page lock before grabbing the + * rangelock to avoid another deadlock between here and + * zfs_write() -> update_pages(). update_pages() holds both the + * rangelock and the page lock. + */ + get_page(pp); + unlock_page(pp); + lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, + io_len, RL_READER); + lock_page(pp); + put_page(pp); + } error = zfs_fillpage(ip, pp); + zfs_rangelock_exit(lr); + if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 9dec52215..6b16faa2b 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -322,14 +322,14 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) crhold(cr); cookie = spl_fstrans_mark(); - int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); + if (ret < 0) + return (ret); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; @@ -384,14 +384,14 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) crhold(cr); cookie = spl_fstrans_mark(); - int error = -zfs_write(ITOZ(ip), &uio, + ret = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); + if (ret < 0) + return (ret); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; @@ -422,14 +422,14 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = spl_fstrans_mark(); - int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); + if (ret < 0) + return (ret); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; @@ -467,53 +467,57 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = spl_fstrans_mark(); - int error = -zfs_write(ITOZ(ip), &uio, + ret = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); - if (error < 0) - return (error); + if (ret < 0) + return (ret); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } + #endif /* HAVE_VFS_RW_ITERATE */ -#if defined(HAVE_VFS_RW_ITERATE) static ssize_t -zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) +zpl_direct_IO_impl(void) { - if (rw == WRITE) - return (zpl_iter_write(kiocb, iter)); - else - return (zpl_iter_read(kiocb, iter)); + /* + * All O_DIRECT requests should be handled by + * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code + * should call the direct_IO address_space_operations function. We set + * this code path to be fatal if it is executed.
+ */ + PANIC(0); + return (0); } + +#if defined(HAVE_VFS_RW_ITERATE) #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { - return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - return (zpl_direct_IO_impl(rw, kiocb, iter)); + return (zpl_direct_IO_impl()); } #else -#error "Unknown direct IO interface" +#error "Unknown Direct I/O interface" #endif #else /* HAVE_VFS_RW_ITERATE */ @@ -523,26 +527,16 @@ static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - if (rw == WRITE) - return (zpl_aio_write(kiocb, iov, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iov, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - const struct iovec *iovp = iov_iter_iovec(iter); - unsigned long nr_segs = iter->nr_segs; - - ASSERT3S(pos, ==, kiocb->ki_pos); - if (rw == WRITE) - return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #else -#error "Unknown direct IO interface" +#error "Unknown Direct I/O interface" #endif #endif /* HAVE_VFS_RW_ITERATE */ @@ -627,6 +621,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); + if (error) return (error); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 764993b45..10ac13a89 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -395,6 +395,13 @@ zfs_prop_init(void) { NULL } }; + static const zprop_index_t direct_table[] = { + { "disabled", ZFS_DIRECT_DISABLED }, + { "standard", ZFS_DIRECT_STANDARD }, + { "always", ZFS_DIRECT_ALWAYS }, + { NULL } + }; + struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_DATASET_PROPERTIES); @@ -479,6 +486,10 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "default | full | geom | dev | none", "VOLMODE", volmode_table, sfeatures); + zprop_register_index(ZFS_PROP_DIRECT, "direct", + ZFS_DIRECT_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "disabled | standard | always", "DIRECT", direct_table, + sfeatures); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c index e2d4d1aef..622323bbb 100644 --- a/module/zcommon/zfs_valstr.c +++ b/module/zcommon/zfs_valstr.c @@ -218,6 +218,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "NP", "NOPWRITE" }, { '.', "EX", "REEXECUTED" }, { '.', "DG", "DELEGATED" }, + { '.', "DC", "DIO_CHKSUM_ERR" }, ) /* END CSTYLED */ @@ -252,6 +253,7 @@ _VALSTR_BITFIELD_IMPL(zio_stage, { 'V', "VD", "VDEV_IO_DONE" }, { 'V', "VA", "VDEV_IO_ASSESS" }, { 'C', "CV", "CHECKSUM_VERIFY" }, + { 'C', "DC", "DIO_CHECKSUM_VERIFY" }, { 'X', "X 
", "DONE" }, ) /* END CSTYLED */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index c8c4d2270..529deeecf 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -89,8 +89,8 @@ * functions. * * As an additional feature, linear and scatter ABD's can be stitched together - * by using the gang ABD type (abd_alloc_gang_abd()). This allows for - * multiple ABDs to be viewed as a singular ABD. + * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs + * to be viewed as a singular ABD. * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. @@ -109,11 +109,15 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + if (abd_is_from_pages(abd)) { + ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS); + } else { + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + } ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | - ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD)); + ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { @@ -136,7 +140,7 @@ abd_verify(abd_t *abd) #endif } -static void +void abd_init_struct(abd_t *abd) { list_link_init(&abd->abd_gang_link); @@ -238,6 +242,7 @@ abd_free_linear(abd_t *abd) abd_free_linear_page(abd); return; } + if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { @@ -520,6 +525,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) */ abd->abd_flags |= ABD_FLAG_LINEAR; + /* + * User pages from Direct I/O requests may be in a single page + * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag + * that here for abd. This is required because we have to be + * careful when borrowing the buffer from the ABD because we + * can not place user pages under write protection on Linux. + * See the comments in abd_os.c for abd_borrow_buf(), + * abd_borrow_buf_copy(), abd_return_buf() and + * abd_return_buf_copy(). + */ + if (abd_is_from_pages(sabd)) { + abd->abd_flags |= ABD_FLAG_FROM_PAGES | + ABD_FLAG_LINEAR_PAGE; + } + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else if (abd_is_gang(sabd)) { size_t left = size; @@ -648,70 +668,6 @@ abd_to_buf(abd_t *abd) return (ABD_LINEAR_BUF(abd)); } -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } -#ifdef ZFS_DEBUG - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); -#endif - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. 
If you want any changes you made to buf to - be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); -#ifdef ZFS_DEBUG - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -#endif - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - void abd_release_ownership_of_buf(abd_t *abd) { diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 714a30e86..b5bcd367b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5961,7 +5961,7 @@ top: ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - zfs_racct_read(size, 1); + zfs_racct_read(spa, size, 1, 0); } /* Check if the spa even has l2 configured */ diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 914260e74..27a04c2af 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -217,8 +217,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name) } void -dataset_kstats_update_write_kstats(dataset_kstats_t *dk, - int64_t nwritten) +dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) { ASSERT3S(nwritten, >=, 0); @@ -230,8 +229,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk, } void -dataset_kstats_update_read_kstats(dataset_kstats_t *dk, - int64_t nread) +dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread) { ASSERT3S(nread, >=, 0); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 099883ba2..df9368fc8 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -628,7 +628,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) * L2ARC. */ boolean_t -dbuf_is_l2cacheable(dmu_buf_impl_t *db) +dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp) { if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || (db->db_objset->os_secondary_cache == @@ -636,10 +636,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db) if (l2arc_exclude_special == 0) return (B_TRUE); - blkptr_t *bp = db->db_blkptr; - if (bp == NULL || BP_IS_HOLE(bp)) + /* + * bp must be checked in the event it was passed from + * dbuf_read_impl() as the result of the BP being set from + * a Direct I/O write in dbuf_read(). See comments in + * dbuf_read(). + */ + blkptr_t *db_bp = bp == NULL ?
db->db_blkptr : bp; + + if (db_bp == NULL || BP_IS_HOLE(db_bp)) return (B_FALSE); - uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; @@ -1380,6 +1387,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); + /* * All reads are synchronous, so we must have a hold on the dbuf */ @@ -1570,12 +1578,11 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) */ static int dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, - db_lock_type_t dblt, const void *tag) + db_lock_type_t dblt, blkptr_t *bp, const void *tag) { zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp = NULL; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1589,43 +1596,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - /* - * If we have a pending block clone, we don't want to read the - * underlying block, but the content of the block being cloned, - * pointed by the dirty record, so we have the most recent data. - * If there is no dirty record, then we hit a race in a sync - * process when the dirty record is already removed, while the - * dbuf is not yet destroyed. Such case is equivalent to uncached. - */ - if (db->db_state == DB_NOFILL) { - dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); - if (dr != NULL) { - if (!dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; - bpp = &bp; - } - } - - if (bpp == NULL && db->db_blkptr != NULL) { - bp = *db->db_blkptr; - bpp = &bp; - } - - err = dbuf_read_hole(db, dn, bpp); + err = dbuf_read_hole(db, dn, bp); if (err == 0) goto early_unlock; - ASSERT(bpp != NULL); + ASSERT(bp != NULL); /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ - if (BP_IS_REDACTED(bpp)) { + if (BP_IS_REDACTED(bp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); @@ -1640,9 +1622,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ - if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) { spa_log_error(db->db_objset->os_spa, &zb, - BP_GET_LOGICAL_BIRTH(bpp)); + BP_GET_LOGICAL_BIRTH(bp)); err = SET_ERROR(EIO); goto early_unlock; } @@ -1653,7 +1635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, if (!DBUF_IS_CACHEABLE(db)) aflags |= ARC_FLAG_UNCACHED; - else if (dbuf_is_l2cacheable(db)) + else if (dbuf_is_l2cacheable(db, bp)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -1661,17 +1643,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; - if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; + /* - * The zio layer will copy the provided blkptr later, but we have our - * own copy so that we can release the parent's rwlock. 
We have to - * do that so that if dbuf_read_done is called synchronously (on + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ + blkptr_t copy = *bp; dmu_buf_unlock_parent(db, dblt, tag); - return (arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, &copy, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb)); @@ -1844,13 +1828,30 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (pio == NULL && (db->db_state == DB_NOFILL || - (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { - spa_t *spa = dn->dn_objset->os_spa; - pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - need_wait = B_TRUE; + blkptr_t *bp; + + /* + * If a block clone or Direct I/O write has occurred we will + * get the dirty record's overridden BP so we get the most + * recent data. + */ + err = dmu_buf_get_bp_from_dbuf(db, &bp); + + if (!err) { + if (pio == NULL && (db->db_state == DB_NOFILL || + (bp != NULL && !BP_IS_HOLE(bp)))) { + spa_t *spa = dn->dn_objset->os_spa; + pio = + zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + need_wait = B_TRUE; + } + + err = + dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG); + } else { + mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, FTAG); } - err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); /* dbuf_read_impl drops db_mtx and parent's rwlock. */ miss = (db->db_state != DB_CACHED); } @@ -1918,6 +1919,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); + /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only @@ -1936,16 +1938,20 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); - if (dr->dt.dl.dr_brtwrite) { + if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) { ASSERT0P(dr->dt.dl.dr_data); dr->dt.dl.dr_data = db->db_buf; } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE; + dr->dt.dl.dr_diowrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* + * In the event that Direct I/O was used, we do not + * need to release the buffer from the ARC. + * * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do @@ -2084,6 +2090,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) */ dmu_buf_will_dirty(&db->db, tx); + VERIFY3P(db->db_buf, !=, NULL); + /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); @@ -2532,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; boolean_t brtwrite; + boolean_t diowrite; ASSERT(txg != 0); @@ -2557,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(dr->dr_dbuf == db); brtwrite = dr->dt.dl.dr_brtwrite; + diowrite = dr->dt.dl.dr_diowrite; if (brtwrite) { + ASSERT3B(diowrite, ==, B_FALSE); /* * We are freeing a block that we cloned in the same * transaction group.
@@ -2598,10 +2609,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) + if (dr->dt.dl.dr_data != db->db_buf) { + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); arc_buf_destroy(dr->dt.dl.dr_data, db); + } } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -2610,7 +2622,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || brtwrite || + ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); @@ -2670,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we * want to make sure dbuf_read() will read the pending cloned block and * not the uderlying block that is being replaced. dbuf_undirty() will - * do dbuf_unoverride(), so we will end up with cloned block content, - * without overridden BP. + * do brt_pending_remove() before removing the dirty record. */ (void) dbuf_read(db, NULL, flags); if (undirty) { @@ -2701,23 +2712,126 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } +/* + * Normally the db_blkptr points to the most recent on-disk content for the + * dbuf (and anything newer will be cached in the dbuf). However, a pending + * block clone or not yet synced Direct I/O write will have a dirty record BP + * pointing to the most recent data. + */ +int +dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + int error = 0; + + if (db->db_level != 0) { + *bp = db->db_blkptr; + return (0); + } + + *bp = db->db_blkptr; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr && db->db_state == DB_NOFILL) { + /* Block clone */ + if (!dr->dt.dl.dr_brtwrite) + error = EIO; + else + *bp = &dr->dt.dl.dr_overridden_by; + } else if (dr && db->db_state == DB_UNCACHED) { + /* Direct I/O write */ + if (dr->dt.dl.dr_diowrite) + *bp = &dr->dt.dl.dr_overridden_by; + } + + return (error); +} + +/* + * Direct I/O reads can read directly from the ARC, but the data has + * to be untransformed in order to copy it over into user pages. + */ +int +dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa) +{ + int err = 0; + DB_DNODE_ENTER(db); + dnode_t *dn = DB_DNODE(db); + + ASSERT3S(db->db_state, ==, DB_CACHED); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* + * Ensure that this block's dnode has been decrypted if + * the caller has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, dn, 0); + + /* + * If the arc buf is compressed or encrypted and the caller + * requested uncompressed data, we need to untransform it + * before returning. We also call arc_untransform() on any + * unauthenticated blocks, which will verify their MAC if + * the key is now available. 
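+ * + * Put differently (illustrative): a Direct I/O read that hits a + * compressed or encrypted ARC buffer can not hand the raw transformed + * bytes to the caller, so the buffer is untransformed in place here + * before it is copied into the pinned user pages.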
+ +/* + * Direct I/O reads can read directly from the ARC, but the data has + * to be untransformed in order to copy it over into user pages. + */ +int +dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa) +{ + int err = 0; + DB_DNODE_ENTER(db); + dnode_t *dn = DB_DNODE(db); + + ASSERT3S(db->db_state, ==, DB_CACHED); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* + * Ensure that this block's dnode has been decrypted if + * the caller has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, dn, 0); + + /* + * If the arc buf is compressed or encrypted and the caller + * requested uncompressed data, we need to untransform it + * before returning. We also call arc_untransform() on any + * unauthenticated blocks, which will verify their MAC if + * the key is now available. + */ + if (err == 0 && db->db_buf != NULL && + (arc_is_encrypted(db->db_buf) || + arc_is_unauthenticated(db->db_buf) || + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + dbuf_fix_old_data(db, spa_syncing_txg(spa)); + err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); + dbuf_set_data(db, db->db_buf); + } + DB_DNODE_EXIT(db); + DBUF_STAT_BUMP(hash_hits); + + return (err); +} + void -dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx) { + /* + * Block clones and Direct I/O writes always happen in open-context. + */ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT0(db->db_level); + ASSERT(!dmu_tx_is_syncing(tx)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - /* - * Block cloning: We are going to clone into this block, so undirty - * modifications done to this block so far in this txg. This includes - * writes and clones into this block. - */ mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - VERIFY(!dbuf_undirty(db, tx)); + + /* + * We are going to clone or issue a Direct I/O write on this block, so + * undirty modifications done to this block so far in this txg. This + * includes writes and clones into this block. + * + * If there is a dirty record associated with this txg from a previous + * Direct I/O write, then space accounting cleanup takes place. It is + * important to go ahead and free up the space accounting through + * dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting + * for determining if a write can occur in zfs_write() happens through + * dmu_tx_assign(). This can cause an issue with Direct I/O writes in + * the case of overwriting the same block, because all DVA allocations + * are being done in open-context. Constantly allowing Direct I/O + * overwrites to the same block can exhaust the pool's available space + * leading to ENOSPC errors at the DVA allocation part of the ZIO + * pipeline, which will eventually suspend the pool. By cleaning up + * space accounting now, the ENOSPC error can be avoided. + * + * Since we are undirtying the record in open-context, we must have a + * hold on the db, so it should never be evicted after calling + * dbuf_undirty(). + */ + VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); + if (db->db_buf != NULL) { /* * If there is an associated ARC buffer with this dbuf we can @@ -2728,6 +2842,11 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(db->db_buf, db); + /* + * Setting the dbuf's data pointers to NULL will force all + * future reads down to the devices to get the most up-to-date + * version of the data after a Direct I/O write has completed.
+ */ db->db_buf = NULL; dbuf_clear_data(db); } @@ -2736,7 +2855,8 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT3P(db->db.db_data, ==, NULL); db->db_state = DB_NOFILL; - DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + DTRACE_SET_STATE(db, + "allocating NOFILL buffer for clone or direct I/O write"); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -2773,21 +2893,28 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) dmu_tx_private_ok(tx)); mutex_enter(&db->db_mtx); - if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (db->db_state == DB_NOFILL || + (db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) { /* - * Block cloning: We will be completely overwriting a block - * cloned in this transaction group, so let's undirty the - * pending clone and mark the block as uncached. This will be - * as if the clone was never done. But if the fill can fail - * we should have a way to return back to the cloned data. + * If the fill can fail we should have a way to return back to + * the cloned or Direct I/O write data. */ - if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + if (canfail && dr) { mutex_exit(&db->db_mtx); dmu_buf_will_dirty(db_fake, tx); return; } - VERIFY(!dbuf_undirty(db, tx)); - db->db_state = DB_UNCACHED; + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + if (dr && dr->dt.dl.dr_brtwrite) { + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; + } } mutex_exit(&db->db_mtx); @@ -4080,7 +4207,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) } else { mutex_exit(&db->db_mtx); } - } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -4540,24 +4666,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_enter(&db->db_mtx); /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. + * To be synced, we must be dirtied. But we might have been freed + * after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); + ASSERT3P(db->db.db_data, ==, NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else if (db->db_state == DB_READ) { /* - * This buffer has a clone we need to write, and an in-flight - * read on the BP we're about to clone. Its safe to issue the - * write here because the read has already been issued and the - * contents won't change. + * This buffer was either cloned or had a Direct I/O write + * occur and has an in-flight read on the BP. It is safe to + * issue the write here, because the read has already been + * issued and the contents won't change. + * + * We can verify the case of both the clone and Direct I/O + * write by making sure the first dirty record for the dbuf + * has no ARC buffer associated with it.
*/ - ASSERT(dr->dt.dl.dr_brtwrite && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN); + dbuf_dirty_record_t *dr_head = + list_head(&db->db_dirty_records); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL); + ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -4608,8 +4742,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immediate write, - * wait for the synchronous IO to complete. + * If this buffer is in the middle of an immediate write, wait for the + * synchronous IO to complete. + * + * This is also valid even with Direct I/O writes setting a dirty + * record's override state to DR_IN_DMU_SYNC, because all + * Direct I/O writes happen in open-context. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -4913,8 +5051,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + + /* no dr_data if this is a NO_FILL or Direct I/O */ if (dr->dt.dl.dr_data != NULL && dr->dt.dl.dr_data != db->db_buf) { + ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE); + ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE); arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { @@ -5180,7 +5322,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context - * (by dmu_sync() or dmu_buf_write_embedded()). + * (by dmu_sync(), dmu_write_direct(), + * or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; @@ -5219,7 +5362,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), - dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, + dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } @@ -5239,7 +5382,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); -EXPORT_SYMBOL(dmu_buf_will_clone); +EXPORT_SYMBOL(dmu_buf_will_clone_or_dio); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b3eda8ea5..3f87cfe6b 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -609,8 +609,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbp[i] = &db->db; } - if (!read) - zfs_racct_write(length, nblks); + /* + * If we are doing O_DIRECT we still hold the dbufs, even for reads, + * but we do not issue any reads here. We do not want to account for + * writes in this case. + * + * O_DIRECT write/read accounting takes place in + * dmu_{write/read}_abd(). + */ + if (!read && ((flags & DMU_DIRECTIO) == 0)) + zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags); if (zs) dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
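The accounting split introduced here recurs throughout the patch: the DMU_DIRECTIO flag decides whether an operation is charged to the ARC counters or to the new direct counters (see the spa_iostats changes further down). A runnable sketch of that dispatch; the SIM_ flag and counter struct are stand-ins for the kstat machinery:

#include <stdint.h>
#include <stdio.h>

#define SIM_DMU_DIRECTIO (1 << 0)   /* stand-in for the DMU_DIRECTIO flag */

struct sim_iostats {
	uint64_t arc_write_count, arc_write_bytes;
	uint64_t direct_write_count, direct_write_bytes;
};

/* Charge a write to the ARC or Direct I/O counters based on flags. */
static void sim_write_add(struct sim_iostats *io, uint64_t size,
    uint64_t iops, uint32_t flags)
{
	if (flags & SIM_DMU_DIRECTIO) {
		io->direct_write_count += iops;
		io->direct_write_bytes += size;
	} else {
		io->arc_write_count += iops;
		io->arc_write_bytes += size;
	}
}

int main(void)
{
	struct sim_iostats io = { 0 };

	sim_write_add(&io, 131072, 1, 0);                /* buffered write */
	sim_write_add(&io, 131072, 1, SIM_DMU_DIRECTIO); /* direct write */
	printf("arc: %llu bytes, direct: %llu bytes\n",
	    (unsigned long long)io.arc_write_bytes,
	    (unsigned long long)io.direct_write_bytes);
	return (0);
}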
@@ -897,7 +905,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) /* * Get the next "chunk" of file data to free. We traverse the file from - * the end so that the file gets shorter over time (if we crashes in the + * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * @@ -1168,7 +1176,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, /* * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to + * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { @@ -1178,6 +1186,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, size = newsz; } + if (size == 0) + return (0); + + /* Allow Direct I/O when requested and properly aligned */ + if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) && + zfs_dio_aligned(offset, size, PAGESIZE)) { + abd_t *data = abd_get_from_buf(buf, size); + err = dmu_read_abd(dn, offset, size, data, flags); + abd_free(data); + return (err); + } + while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; @@ -1286,22 +1306,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } /* - * Note: Lustre is an external consumer of this interface. + * This interface is not used internally by ZFS but is provided for + * use by Lustre which is built on the DMU interfaces. */ -void -dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +int +dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx, uint32_t flags) { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) - return; + return (0); + + /* Allow Direct I/O when requested and properly aligned */ + if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) && + zfs_dio_aligned(offset, size, dn->dn_datablksz)) { + abd_t *data = abd_get_from_buf((void *)buf, size); + error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); + abd_free(data); + return (error); + } VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} + +int +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0)); } void @@ -1365,6 +1404,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) dmu_buf_t **dbp; int numbufs, i, err; + if (uio->uio_extflg & UIO_DIRECT) + return (dmu_read_uio_direct(dn, uio, size)); + /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. @@ -1453,23 +1495,53 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_buf_t **dbp; int numbufs; int err = 0; - int i; + uint64_t write_size; - err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, +top: + write_size = size; + + /* + * We only allow Direct I/O writes to happen if the request is block + * size aligned. Otherwise, we pass the write off to the ARC.
+ */ + if ((uio->uio_extflg & UIO_DIRECT) && + (write_size >= dn->dn_datablksz)) { + if (zfs_dio_aligned(zfs_uio_offset(uio), write_size, + dn->dn_datablksz)) { + return (dmu_write_uio_direct(dn, uio, size, tx)); + } else if (write_size > dn->dn_datablksz && + zfs_dio_offset_aligned(zfs_uio_offset(uio), + dn->dn_datablksz)) { + write_size = + dn->dn_datablksz * (write_size / dn->dn_datablksz); + err = dmu_write_uio_direct(dn, uio, write_size, tx); + if (err == 0) { + size -= write_size; + goto top; + } else { + return (err); + } + } else { + write_size = + P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz); + } + } + + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < numbufs; i++) { + for (int i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; - ASSERT(size > 0); + ASSERT(write_size > 0); offset_t off = zfs_uio_offset(uio); bufoff = off - db->db_offset; - tocpy = MIN(db->db_size - bufoff, size); + tocpy = MIN(db->db_size - bufoff, write_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -1489,10 +1561,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) if (err) break; + write_size -= tocpy; size -= tocpy; } + IMPLY(err == 0, write_size == 0); + dmu_buf_rele_array(dbp, numbufs, FTAG); + + if ((uio->uio_extflg & UIO_DIRECT) && size > 0) { + goto top; + } + return (err); } @@ -1731,7 +1811,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { - zfs_racct_write(blksz, 1); + zfs_racct_write(os->os_spa, blksz, 1, 0); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { @@ -1761,23 +1841,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, return (err); } -typedef struct { - dbuf_dirty_record_t *dsa_dr; - dmu_sync_cb_t *dsa_done; - zgd_t *dsa_zgd; - dmu_tx_t *dsa_tx; -} dmu_sync_arg_t; - -static void +void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { + dbuf_dirty_record_t *dr = dsa->dsa_dr; + blkptr_t *bp = zio->io_bp; + if (BP_IS_HOLE(bp)) { + dmu_buf_t *db = NULL; + if (dr) + db = &(dr->dr_dbuf->db); + else + db = dsa->dsa_zgd->zgd_db; /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. @@ -1796,7 +1875,7 @@ dmu_sync_late_arrival_ready(zio_t *zio) dmu_sync_ready(zio, NULL, zio->io_private); } -static void +void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; @@ -1809,7 +1888,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. 
*/ - if (zio->io_error == 0) { + if (zgd && zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } @@ -1848,10 +1927,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } + cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + if (dsa->dsa_done) + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } @@ -2120,9 +2201,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, - dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), + dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL, + dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, + &zb)); return (0); } @@ -2385,6 +2467,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; + zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -2594,7 +2677,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - dmu_buf_will_clone(dbuf, tx); + dmu_buf_will_clone_or_dio(dbuf, tx); mutex_enter(&db->db_mtx); @@ -2817,8 +2900,15 @@ EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); +EXPORT_SYMBOL(dmu_read_uio); +EXPORT_SYMBOL(dmu_read_uio_dbuf); +EXPORT_SYMBOL(dmu_read_uio_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); +EXPORT_SYMBOL(dmu_write_by_dnode_flags); +EXPORT_SYMBOL(dmu_write_uio); +EXPORT_SYMBOL(dmu_write_uio_dbuf); +EXPORT_SYMBOL(dmu_write_uio_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c new file mode 100644 index 000000000..91a7fd8df --- /dev/null +++ b/module/zfs/dmu_direct.c @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/zfs_racct.h> +#include <sys/dsl_dataset.h> +#include <sys/dmu_objset.h> + +static abd_t * +make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset, + uint64_t size) +{ + size_t buf_size = db->db.db_size; + abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL; + size_t buf_off = 0; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (offset > db->db.db_offset) { + size_t pre_size = offset - db->db.db_offset; + pre_buf = abd_alloc_for_io(pre_size, B_TRUE); + buf_size -= pre_size; + buf_off = 0; + } else { + buf_off = db->db.db_offset - offset; + size -= buf_off; + } + + if (size < buf_size) { + size_t post_size = buf_size - size; + post_buf = abd_alloc_for_io(post_size, B_TRUE); + buf_size -= post_size; + } + + ASSERT3U(buf_size, >, 0); + abd_t *buf = abd_get_offset_size(data, buf_off, buf_size); + + if (pre_buf || post_buf) { + mbuf = abd_alloc_gang(); + if (pre_buf) + abd_gang_add(mbuf, pre_buf, B_TRUE); + abd_gang_add(mbuf, buf, B_TRUE); + if (post_buf) + abd_gang_add(mbuf, post_buf, B_TRUE); + } else { + mbuf = buf; + } + + return (mbuf); +} + +static void +dmu_read_abd_done(zio_t *zio) +{ + abd_free(zio->io_abd); +} + +static void +dmu_write_direct_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +static void +dmu_write_direct_done(zio_t *zio) +{ + dmu_sync_arg_t *dsa = zio->io_private; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + + abd_free(zio->io_abd); + + mutex_enter(&db->db_mtx); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(dr->dt.dl.dr_data, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + db->db_state = DB_UNCACHED; + mutex_exit(&db->db_mtx); + + dmu_sync_done(zio, NULL, zio->io_private); + + if (zio->io_error != 0) { + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + ASSERT3U(zio->io_error, ==, EIO); + + /* + * In the event of an I/O error this block has been freed in + * zio_done() through zio_dva_unallocate(). Calling + * dmu_sync_done() above set dr_override_state to + * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls + * dbuf_unoverride(), it will skip doing zio_free() to free + * this block as that was already taken care of. + * + * Since we are undirtying the record in open-context, we must + * have a hold on the db, so it should never be evicted after + * calling dbuf_undirty(). + */ + mutex_enter(&db->db_mtx); + VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE); + mutex_exit(&db->db_mtx); + } + + kmem_free(zio->io_bp, sizeof (blkptr_t)); + zio->io_bp = NULL; +} + +int +dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) +{ + objset_t *os = db->db_objset; + dsl_dataset_t *ds = dmu_objset_ds(os); + zbookmark_phys_t zb; + dbuf_dirty_record_t *dr_head; + + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + DB_DNODE_ENTER(db); + zio_prop_t zp; + dmu_write_policy(os, DB_DNODE(db), db->db_level, + WP_DMU_SYNC | WP_DIRECT_WR, &zp); + DB_DNODE_EXIT(db); + + /* + * Dirty this dbuf with DB_NOFILL since we will not have any data + * associated with the dbuf. 
+ */ + dmu_buf_will_clone_or_dio(&db->db, tx); + + mutex_enter(&db->db_mtx); + + uint64_t txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa)); + ASSERT3U(txg, >, spa_syncing_txg(os->os_spa)); + + dr_head = list_head(&db->db_dirty_records); + ASSERT3U(dr_head->dr_txg, ==, txg); + dr_head->dt.dl.dr_diowrite = B_TRUE; + dr_head->dr_accounted = db->db.db_size; + + blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + if (db->db_blkptr != NULL) { + /* + * Fill in bp with the current block pointer so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. + */ + *bp = *db->db_blkptr; + } else { + memset(bp, 0, sizeof (blkptr_t)); + } + + /* + * Disable nopwrite if the current block pointer could change + * before this TXG syncs. + */ + if (list_next(&db->db_dirty_records, dr_head) != NULL) + zp.zp_nopwrite = B_FALSE; + + ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); + dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + + mutex_exit(&db->db_mtx); + + dmu_objset_willuse_space(os, dr_head->dr_accounted, tx); + + dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr_head; + dsa->dsa_tx = tx; + + zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data, + db->db.db_size, db->db.db_size, &zp, + dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb); + + if (pio == NULL) + return (zio_wait(zio)); + + zio_nowait(zio); + + return (0); +} + +int +dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + spa_t *spa = dn->dn_objset->os_spa; + int numbufs, err; + + ASSERT(flags & DMU_DIRECTIO); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, flags); + if (err) + return (err); + + zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; i < numbufs && err == 0; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + + abd_t *abd = abd_get_offset_size(data, + db->db.db_offset - offset, dn->dn_datablksz); + + zfs_racct_write(spa, db->db.db_size, 1, flags); + err = dmu_write_direct(pio, db, abd, tx); + ASSERT0(err); + } + + err = zio_wait(pio); + + /* + * The dbuf must be held until the Direct I/O write has completed in + * the event there were any errors and dbuf_undirty() was called. + */ + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +}
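make_abd_for_dbuf() (earlier in this file) pads a user buffer out to full block boundaries by assembling a gang ABD: a zero pre-buffer, an offset view of the user ABD, and a zero post-buffer, so that a whole-block read can land partly in user pages. The size arithmetic is the subtle part; a runnable sketch of just that math, with db_offset/db_size standing in for the dbuf fields:

#include <stdint.h>
#include <stdio.h>

/*
 * For a block covering [db_offset, db_offset + db_size) and a request
 * covering [req_off, req_off + req_len), compute the gang ABD layout:
 * zero padding before the user slice, the user slice itself, and zero
 * padding after. Invariant: pre + user + post == db_size per block.
 */
static void gang_layout(uint64_t db_offset, uint64_t db_size,
    uint64_t req_off, uint64_t req_len)
{
	uint64_t pre = (req_off > db_offset) ? req_off - db_offset : 0;
	uint64_t uoff = (db_offset > req_off) ? db_offset - req_off : 0;
	uint64_t user = db_size - pre;

	if (user > req_len - uoff)
		user = req_len - uoff;
	printf("block@%llu: pre=%llu user=%llu post=%llu\n",
	    (unsigned long long)db_offset, (unsigned long long)pre,
	    (unsigned long long)user,
	    (unsigned long long)(db_size - pre - user));
}

int main(void)
{
	/* 128K recordsize; request covers [4K, 4K + 128K), two blocks. */
	gang_layout(0, 131072, 4096, 131072);
	gang_layout(131072, 131072, 4096, 131072);
	return (0);
}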
+ +int +dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags) +{ + objset_t *os = dn->dn_objset; + spa_t *spa = os->os_spa; + dmu_buf_t **dbp; + int numbufs, err; + + ASSERT(flags & DMU_DIRECTIO); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, flags); + if (err) + return (err); + + zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; i < numbufs; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + abd_t *mbuf; + zbookmark_phys_t zb; + blkptr_t *bp; + + mutex_enter(&db->db_mtx); + + SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + /* + * If there is another read for this dbuf, we will wait for + * that to complete first before checking the db_state below. + */ + while (db->db_state == DB_READ) + cv_wait(&db->db_changed, &db->db_mtx); + + err = dmu_buf_get_bp_from_dbuf(db, &bp); + if (err) { + mutex_exit(&db->db_mtx); + goto error; + } + + /* + * There is no need to read if this is a hole or the data is + * cached. This will not be considered a direct read for IO + * accounting in the same way that an ARC hit is not counted. + */ + if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) { + size_t aoff = offset < db->db.db_offset ? + db->db.db_offset - offset : 0; + size_t boff = offset > db->db.db_offset ? + offset - db->db.db_offset : 0; + size_t len = MIN(size - aoff, db->db.db_size - boff); + + if (db->db_state == DB_CACHED) { + /* + * We need to untransform the ARC buf data + * before we copy it over. + */ + err = dmu_buf_untransform_direct(db, spa); + ASSERT0(err); + abd_copy_from_buf_off(data, + (char *)db->db.db_data + boff, aoff, len); + } else { + abd_zero_off(data, aoff, len); + } + + mutex_exit(&db->db_mtx); + continue; + } + + mbuf = make_abd_for_dbuf(db, data, offset, size); + ASSERT3P(mbuf, !=, NULL); + + /* + * The dbuf mutex (db_mtx) must be held when creating the ZIO + * for the read. The BP returned from + * dmu_buf_get_bp_from_dbuf() could be from a pending block + * clone or a yet to be synced Direct I/O write that is in the + * dbuf's dirty record. When zio_read() is called, zio_create() + * will make a copy of the BP. However, if zio_read() is called + * without the mutex being held then the dirty record from the + * dbuf could be freed in dbuf_write_done() resulting in garbage + * being set for the zio BP. + */ + zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size, + dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL, &zb); + mutex_exit(&db->db_mtx); + + zfs_racct_read(spa, db->db.db_size, 1, flags); + zio_nowait(cio); + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (zio_wait(rio)); + +error: + dmu_buf_rele_array(dbp, numbufs, FTAG); + (void) zio_wait(rio); + return (err); +} + +#ifdef _KERNEL +int +dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size) +{ + offset_t offset = zfs_uio_offset(uio); + offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; + int err; + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3U(page_index, <, uio->uio_dio.npages); + + abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index], + offset & (PAGESIZE - 1), size); + err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO); + abd_free(data); + + if (err == 0) + zfs_uioskip(uio, size); + + return (err); +} + +int +dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + offset_t offset = zfs_uio_offset(uio); + offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; + int err; + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3U(page_index, <, uio->uio_dio.npages); + + abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index], + offset & (PAGESIZE - 1), size); + err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); + abd_free(data); + + if (err == 0) + zfs_uioskip(uio, size); + + return (err); +} +#endif /* _KERNEL */ + +EXPORT_SYMBOL(dmu_read_uio_direct); +EXPORT_SYMBOL(dmu_write_uio_direct);
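Both UIO entry points above locate their position in the pre-pinned page array with the same arithmetic: the distance from the uio's starting offset, shifted down by the page shift, plus an intra-page offset. A runnable check of that math; PAGESHIFT/PAGESIZE are hardcoded to 4K here, and soffset stands for zfs_uio_soffset():

#include <stdint.h>
#include <stdio.h>

#define SIM_PAGESHIFT 12
#define SIM_PAGESIZE  (1ULL << SIM_PAGESHIFT)

int main(void)
{
	/* The uio started at soffset; every page from there is pinned. */
	uint64_t soffset = 0x20000; /* start of the original request */
	uint64_t offset = 0x23000;  /* current position after partial I/O */

	uint64_t page_index = (offset - soffset) >> SIM_PAGESHIFT;
	uint64_t page_off = offset & (SIM_PAGESIZE - 1);

	/* 0x3000 bytes in = page 3 of the pinned array, offset 0 within. */
	printf("page_index=%llu page_off=%llu\n",
	    (unsigned long long)page_index, (unsigned long long)page_off);
	return (0);
}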
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 8f4fefa4f..f030fba22 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -351,6 +351,20 @@ smallblk_changed_cb(void *arg, uint64_t newval) } static void +direct_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD || + newval == ZFS_DIRECT_ALWAYS); + + os->os_direct = newval; +} + +static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ZFS_PROP_SPECIAL_SMALL_BLOCKS), smallblk_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DIRECT), + direct_changed_cb, os); + } } if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 17ed2a620..45a2f0626 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = { { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, + { "arc_read_count", KSTAT_DATA_UINT64 }, + { "arc_read_bytes", KSTAT_DATA_UINT64 }, + { "arc_write_count", KSTAT_DATA_UINT64 }, + { "arc_write_bytes", KSTAT_DATA_UINT64 }, + { "direct_read_count", KSTAT_DATA_UINT64 }, + { "direct_read_bytes", KSTAT_DATA_UINT64 }, + { "direct_write_count", KSTAT_DATA_UINT64 }, + { "direct_write_bytes", KSTAT_DATA_UINT64 }, }; #define SPA_IOSTATS_ADD(stat, val) \ @@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type, } } +void +spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + + if (ksp == NULL) + return; + + spa_iostats_t *iostats = ksp->ks_data; + if (flags & DMU_DIRECTIO) { + SPA_IOSTATS_ADD(direct_read_count, iops); + SPA_IOSTATS_ADD(direct_read_bytes, size); + } else { + SPA_IOSTATS_ADD(arc_read_count, iops); + SPA_IOSTATS_ADD(arc_read_bytes, size); + } +} + +void +spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + + if (ksp == NULL) + return; + + spa_iostats_t *iostats = ksp->ks_data; + if (flags & DMU_DIRECTIO) { + SPA_IOSTATS_ADD(direct_write_count, iops); + SPA_IOSTATS_ADD(direct_write_bytes, size); + } else { + SPA_IOSTATS_ADD(arc_write_count, iops); + SPA_IOSTATS_ADD(arc_write_bytes, size); + } +} + static int spa_iostats_update(kstat_t *ksp, int rw) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 6ae0a1412..9305bd894 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -118,6 +118,11 @@ static unsigned int zfs_slow_io_events_per_second = 20; static unsigned int zfs_deadman_events_per_second = 1; /* + * Rate limit direct write IO verify failures to this many per second. + */ +static unsigned int zfs_dio_write_verify_events_per_second = 20; + +/* * Rate limit checksum events after this many checksum errors per second. */ static unsigned int zfs_checksum_events_per_second = 20; @@ -153,6 +158,17 @@ int zfs_nocacheflush = 0; uint_t zfs_vdev_max_auto_ashift = 14; uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; +/* + * VDEV checksum verification for Direct I/O writes. This is necessary on + * Linux, because anonymous pages cannot be placed under write protection + * during Direct I/O writes. + */ +#if !defined(__FreeBSD__) +uint_t zfs_vdev_direct_write_verify = 1; +#else +uint_t zfs_vdev_direct_write_verify = 0; +#endif + void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{ @@ -673,6 +689,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); + zfs_ratelimit_init(&vd->vdev_dio_verify_rl, + &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); @@ -1182,6 +1200,7 @@ vdev_free(vdev_t *vd) zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); + zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) @@ -4475,6 +4494,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) @@ -6503,7 +6523,14 @@ ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, + "Rate limit Direct I/O write verify events to this many per second"); + /* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, + "Direct I/O writes will perform checksum verification before " + "committing the write"); + ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 47346dd5a..9d12bc2eb 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + /* Direct I/O write verify errors */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, + vs->vs_dio_verify_errors); + /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index f7cecc9af..25b05abd3 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, vs->vs_checksum_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, DATA_TYPE_UINT64, vs->vs_slow_ios, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS, + DATA_TYPE_UINT64, vs->vs_dio_verify_errors, NULL); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 53366ad49..e69b98896 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -160,7 +160,6 @@ #include <sys/types.h> #include <sys/param.h> #include <sys/errno.h> -#include <sys/uio_impl.h> #include <sys/file.h> #include <sys/kmem.h> #include <sys/cmn_err.h>
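The vdev_label.c hunk above exports the new per-vdev counter to userland inside the extended stats nvlist. A consumer-side sketch of reading it back, assuming the usual libnvpair fnvlist_* APIs and the ZPOOL_CONFIG_* key macros; a fake nvlist is built here in place of one obtained from a real pool handle:

#include <libnvpair.h>  /* OpenZFS libnvpair; link with -lnvpair */
#include <sys/fs/zfs.h> /* ZPOOL_CONFIG_* key names */
#include <stdio.h>

int main(void)
{
	nvlist_t *nv = fnvlist_alloc();
	nvlist_t *nvx = fnvlist_alloc();

	/* Stand-in for a vdev config produced by the kernel. */
	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, 3);
	fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);

	/* Consumer side: walk into the extended stats and read the counter. */
	nvlist_t *stats = fnvlist_lookup_nvlist(nv,
	    ZPOOL_CONFIG_VDEV_STATS_EX);
	uint64_t errs = fnvlist_lookup_uint64(stats,
	    ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS);
	printf("dio verify errors: %llu\n", (unsigned long long)errs);

	fnvlist_free(nvx);
	fnvlist_free(nv);
	return (0);
}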
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 399f5a011..8d0aebbec 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, - zil_callback_t callback, void *callback_data) + boolean_t o_direct, zil_callback_t callback, void *callback_data) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); uint32_t blocksize = zp->z_blksz; @@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, return; } - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index f3db953ea..f9cc5b010 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -35,7 +35,6 @@ #include <sys/time.h> #include <sys/sysmacros.h> #include <sys/vfs.h> -#include <sys/uio_impl.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/kmem.h> @@ -75,6 +74,14 @@ int zfs_bclone_enabled = 1; static int zfs_bclone_wait_dirty = 0; /* + * Enable Direct I/O. If this setting is 0, then all I/O requests will be + * directed through the ARC acting as though the dataset property direct was + * set to disabled. + */ +static int zfs_dio_enabled = 1; + + /* * Maximum bytes to read per chunk in zfs_read(). */ static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; @@ -203,6 +210,77 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) } /* + * Determine if Direct I/O has been requested (either via the O_DIRECT flag or + * the "direct" dataset property). When Direct I/O is inherited from the + * property, only apply the O_DIRECT flag to correctly aligned IO requests. + * The rationale for this is that it allows the property to be safely set on + * a dataset without forcing all of the applications to be aware of the + * alignment restrictions. When O_DIRECT is explicitly requested by an + * application, return EINVAL if the request is unaligned. In all cases, if + * the range for this request has been mmap'ed then we will perform buffered + * I/O to keep the mapped region synchronized with the ARC. + * + * It is possible that a file's pages could be mmap'ed after it is checked + * here. If so, that is handled accordingly in zfs_write(). See comments in + * the following area for how this is handled: + * zfs_write() -> update_pages() + */ +static int +zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, + int *ioflagp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + int ioflag = *ioflagp; + int error = 0; + + if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED || + zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { + /* + * Direct I/O is disabled or the region is mmap'ed. In either + * case the I/O request will just be directed through the ARC. + */ + ioflag &= ~O_DIRECT; + goto out; + } else if (os->os_direct == ZFS_DIRECT_ALWAYS && + zfs_uio_page_aligned(uio) && + zfs_uio_aligned(uio, PAGE_SIZE)) { + if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) || + (rw == UIO_READ)) { + ioflag |= O_DIRECT; + } + } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) { + /* + * Direct I/O was requested through direct=always, but the + * request is not properly PAGE_SIZE aligned. The request + * will be directed through the ARC. + */ + ioflag &= ~O_DIRECT; + } + + if (ioflag & O_DIRECT) { + if (!zfs_uio_page_aligned(uio) || + !zfs_uio_aligned(uio, PAGE_SIZE)) { + error = SET_ERROR(EINVAL); + goto out; + } + + error = zfs_uio_get_dio_pages_alloc(uio, rw); + if (error) { + goto out; + } + } + + IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT); + ASSERT0(error); + +out: + *ioflagp = ioflag; + return (error); +} +
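The outcome of zfs_setup_direct() can be summarized as a small truth table over (property value, O_DIRECT flag, alignment, mmap overlap). A runnable, deliberately condensed sketch of that table for the read path; the sim_* names are stand-ins, and write-path extras such as the blocksize minimum are omitted:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum sim_direct { SIM_DISABLED, SIM_STANDARD, SIM_ALWAYS };

/* Returns 0 and sets *use_direct, or EINVAL for a rejected request. */
static int sim_setup_direct(enum sim_direct prop, bool o_direct,
    bool aligned, bool mmapped, bool *use_direct)
{
	*use_direct = false;
	if (prop == SIM_DISABLED || mmapped)
		return (0);              /* buffered; O_DIRECT dropped */
	if (prop == SIM_ALWAYS) {
		*use_direct = aligned;   /* unaligned silently falls back */
		return (0);
	}
	if (o_direct) {                  /* direct=standard */
		if (!aligned)
			return (EINVAL); /* explicit O_DIRECT must align */
		*use_direct = true;
	}
	return (0);
}

int main(void)
{
	bool d;

	printf("standard, O_DIRECT, aligned:   err=%d direct=%d\n",
	    sim_setup_direct(SIM_STANDARD, true, true, false, &d), d);
	printf("standard, O_DIRECT, unaligned: err=%d\n",
	    sim_setup_direct(SIM_STANDARD, true, false, false, &d));
	printf("always, no flag, aligned:      err=%d direct=%d\n",
	    sim_setup_direct(SIM_ALWAYS, false, true, false, &d), d);
	printf("mmap'ed region:                err=%d direct=%d\n",
	    sim_setup_direct(SIM_ALWAYS, true, true, true, &d), d);
	return (0);
}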
/* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. @@ -286,24 +364,58 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = 0; goto out; } - ASSERT(zfs_uio_offset(uio) < zp->z_size); + + /* + * Set up Direct I/O if requested. + */ + error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag); + if (error) { + goto out; + } + #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif + ssize_t chunk_size = zfs_vnops_read_chunk_size; ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; + ssize_t dio_remaining_resid = 0; + + if (uio->uio_extflg & UIO_DIRECT) { + /* + * All pages for an O_DIRECT request have already been mapped + * so there's no compelling reason to handle this uio in + * smaller chunks. + */ + chunk_size = DMU_MAX_ACCESS; + + /* + * In the event that the O_DIRECT request is reading the entire + * file, it is possible the file's length is not page-size + * aligned. However, lower layers expect that the Direct I/O + * request is page-aligned. In this case, as much of the file + * as can be read using Direct I/O is read that way, and the + * remaining amount will be read through the ARC. + * + * This is still consistent with the semantics of Direct I/O in + * ZFS as at a minimum the I/O request must be page-aligned. + */ + dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t); + if (dio_remaining_resid != 0) + n -= dio_remaining_resid; + } while (n > 0) { - ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); + ssize_t nbytes = MIN(n, chunk_size - + P2PHASE(zfs_uio_offset(uio), chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), - zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { + zfs_uio_offset(uio) + nbytes - 1)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -332,12 +444,40 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) n -= nbytes; } + if (error == 0 && (uio->uio_extflg & UIO_DIRECT) && + dio_remaining_resid != 0) { + /* + * Temporarily remove the UIO_DIRECT flag from the UIO so the + * remainder of the file can be read using the ARC. + */ + uio->uio_extflg &= ~UIO_DIRECT; + + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + dio_remaining_resid - 1)) { + error = mappedread(zp, dio_remaining_resid, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, + dio_remaining_resid); + } + uio->uio_extflg |= UIO_DIRECT; + + if (error != 0) + n += dio_remaining_resid; + } else if (error && (uio->uio_extflg & UIO_DIRECT)) { + n += dio_remaining_resid; + } int64_t nread = start_resid - n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - task_io_account_read(nread); out: zfs_rangelock_exit(lr); + /* + * Cleanup for Direct I/O if requested. + */ + if (uio->uio_extflg & UIO_DIRECT) + zfs_uio_free_dio_pages(uio, UIO_READ); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error);
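The tail handling in zfs_read() hinges on one line: dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t). A runnable check of that split for a file whose length is not a page multiple; P2ALIGN is reimplemented locally here (in ZFS it lives in sys/sysmacros.h):

#include <stdio.h>
#include <sys/types.h>

/* Local stand-in for ZFS's P2ALIGN_TYPED(x, align, type). */
#define SIM_P2ALIGN(x, align) ((x) & -(ssize_t)(align))
#define SIM_PAGE_SIZE 4096

int main(void)
{
	ssize_t n = 10000;  /* resid: reading a whole 10000-byte file */
	ssize_t tail = n - SIM_P2ALIGN(n, SIM_PAGE_SIZE);

	/* 8192 bytes go through Direct I/O, 1808 through the ARC. */
	printf("direct=%zd arc-tail=%zd\n", n - tail, tail);
	return (0);
}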
@@ -422,6 +562,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); uint64_t clear_setid_bits_txg = 0; + boolean_t o_direct_defer = B_FALSE; /* * Fasttrack empty write @@ -475,6 +616,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } /* + * Set up Direct I/O if requested. + */ + error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag); + if (error) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(error)); + } + + /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. */ @@ -504,6 +654,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) woff = zp->z_size; } zfs_uio_setoffset(uio, woff); + /* + * We need to update the starting offset as well because it is + * set previously in the ZPL (Linux) and VNOPS (FreeBSD) + * layers. + */ + zfs_uio_setsoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of @@ -540,6 +696,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) const uint64_t projid = zp->z_projid; /* + * In the event we are increasing the file block size + * (lr_length == UINT64_MAX), we will direct the write to the ARC. + * Because zfs_grow_blocksize() will read from the ARC in order to + * grow the dbuf, we avoid doing Direct I/O here as that would cause + * data written to disk to be overwritten by data in the ARC during + * the sync phase. Besides writing data twice to disk, we also + * want to avoid consistency concerns between data in the ARC and + * on disk while growing the file's blocksize. + * + * We will only temporarily remove Direct I/O and put it back after + * we have grown the blocksize. We do this in the event a request + * is larger than max_blksz, so further requests to + * dmu_write_uio_dbuf() will still issue the requests using Direct + * I/O. + * + * As an example: + * The first block of the file is being written as a 4K request with + * a recordsize of 1K. The first 1K issued in the loop below will go + * through the ARC; however, the following 3 1K requests will + * use Direct I/O. + */ + if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) { + uio->uio_extflg &= ~UIO_DIRECT; + o_direct_defer = B_TRUE; + } + + /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. @@ -580,6 +763,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) ssize_t nbytes = n; if (n >= blksz && woff >= zp->z_size && P2PHASE(woff, blksz) == 0 && + !(uio->uio_extflg & UIO_DIRECT) && (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer @@ -705,9 +889,30 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } + /* + * There is a window where a file's pages can be mmap'ed after + * zfs_setup_direct() is called. This is due to the fact that + * the rangelock in this function is acquired after calling + * zfs_setup_direct(). This is done so that + * zfs_uio_prefaultpages() does not attempt to fault in pages + * on Linux for Direct I/O requests. This is not necessary as + * the pages are pinned in memory and can not be faulted out. + * Ideally, the rangelock would be held before calling + * zfs_setup_direct() and zfs_uio_prefaultpages(); however, + * this can lead to a deadlock as zfs_getpage() also acquires + * the rangelock as a RL_WRITER and prefaulting the pages can + * lead to zfs_getpage() being called. + * + * In the case of the pages being mapped after + * zfs_setup_direct() is called, the call to update_pages() + * will still be made to make sure there is consistency between + * the ARC and the Linux page cache.
This is an unfortunate + * situation as the data will be read back into the ARC after + * the Direct I/O write has completed, but this is the penalty + * for writing to an mmap'ed region of a file using Direct I/O. + */ if (tx_bytes && - zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && - !(ioflag & O_DIRECT)) { + zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } @@ -756,10 +961,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * the TX_WRITE records logged here. */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, - NULL, NULL); + uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, + NULL); dmu_tx_commit(tx); + /* + * Direct I/O was deferred in order to grow the first block. + * At this point it can be re-enabled for subsequent writes. + */ + if (o_direct_defer) { + ASSERT(ioflag & O_DIRECT); + uio->uio_extflg |= UIO_DIRECT; + o_direct_defer = B_FALSE; + } + if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); @@ -767,10 +983,22 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) pfbytes -= nbytes; } + if (o_direct_defer) { + ASSERT(ioflag & O_DIRECT); + uio->uio_extflg |= UIO_DIRECT; + o_direct_defer = B_FALSE; + } + zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* + * Cleanup for Direct I/O if requested. + */ + if (uio->uio_extflg & UIO_DIRECT) + zfs_uio_free_dio_pages(uio, UIO_WRITE); + + /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. @@ -784,9 +1012,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (commit) zil_commit(zilog, zp->z_id); - const int64_t nwritten = start_resid - zfs_uio_resid(uio); + int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - task_io_account_write(nwritten); zfs_exit(zfsvfs, FTAG); return (0);
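Note the interplay with the zfs_log_write() change earlier in this section: a Direct I/O write passes o_direct = B_TRUE, which forces WR_INDIRECT, since the data block is already on disk by the time the log record is built and the ZIL only needs to reference its block pointer. A runnable sketch of that write-state decision; the WR_COPIED/WR_NEED_COPY fallback shown here is simplified relative to the real function:

#include <stdbool.h>
#include <stdio.h>

enum wr_state { WR_SIM_INDIRECT, WR_SIM_COPIED, WR_SIM_NEED_COPY };

static enum wr_state sim_write_state(bool throughput_bias, bool o_direct,
    bool has_slog, long resid, long immediate_write_sz, bool commit)
{
	if (throughput_bias || o_direct)
		return (WR_SIM_INDIRECT);  /* data already at lr_blkptr */
	if (!has_slog && resid >= immediate_write_sz)
		return (WR_SIM_INDIRECT);
	/* Small synchronous writes get copied into the log record. */
	return (commit ? WR_SIM_COPIED : WR_SIM_NEED_COPY);
}

int main(void)
{
	printf("O_DIRECT 128K write -> %d (WR_SIM_INDIRECT=0)\n",
	    sim_write_state(false, true, true, 131072, 32768, false));
	return (0);
}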
@@ -846,7 +1073,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; - dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; @@ -890,8 +1116,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, + size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -929,18 +1155,44 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zil_fault_io = 0; } #endif + + dmu_buf_t *dbp; if (error == 0) error = dmu_buf_hold_noread(os, object, offset, zgd, - &db); + &dbp); if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; + zgd->zgd_db = dbp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; + boolean_t direct_write = B_FALSE; + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr = + dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); + if (dr != NULL && dr->dt.dl.dr_diowrite) + direct_write = B_TRUE; + mutex_exit(&db->db_mtx); + + /* + * All Direct I/O writes will have already completed and + * the block pointer can be immediately stored in the + * log record. + */ + if (direct_write) { + /* + * A Direct I/O write always covers an entire + * block. + */ + ASSERT3U(dbp->db_size, ==, zp->z_blksz); + lr->lr_blkptr = dr->dt.dl.dr_overridden_by; + zfs_get_done(zgd, 0); + return (0); + } - zgd->zgd_db = db; + blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_bp = bp; - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); + ASSERT3U(dbp->db_offset, ==, offset); + ASSERT3U(dbp->db_size, ==, size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); @@ -975,7 +1227,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, return (error); } - static void zfs_get_done(zgd_t *zgd, int error) { @@ -1559,3 +1810,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, "Wait for dirty blocks when cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW, + "Enable Direct I/O"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 53992931e..66a8a9fef 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + ASSERT3U(*errorp, ==, EIO); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + } + (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; + enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ? + ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE; - ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && - zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && - zp->zp_compress >= ZIO_COMPRESS_OFF && - zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - DMU_OT_IS_VALID(zp->zp_type) && - zp->zp_level < 32 && - zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? - ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); + ZIO_STAGE_OPEN, pipeline); zio->io_ready = ready; zio->io_children_ready = children_ready; @@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; + } else if (type == ZIO_TYPE_WRITE && + pio->io_prop.zp_direct_write == B_TRUE) { + /* + * By default we will only verify checksums for Direct I/O + * writes on Linux. FreeBSD is able to place user pages under + * write protection before issuing them to the ZIO pipeline. + * + * Checksum validation errors will only be reported through + * the top-level VDEV, which is set by this child ZIO.
+ */ + ASSERT3P(bp, !=, NULL); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { @@ -3104,6 +3117,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; + zp.zp_direct_write = B_FALSE; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -3577,6 +3591,13 @@ zio_ddt_write(zio_t *zio) ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); + /* + * Deduplication will not take place for Direct I/O writes. The + * ddt_tree will be emptied in syncing context, while Direct I/O writes + * take place in open-context, so a Direct I/O write cannot attempt to + * modify the ddt_tree while a write is being issued. + */ + ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE); ddt_enter(ddt); dde = ddt_lookup(ddt, bp); @@ -4509,6 +4530,19 @@ zio_vdev_io_assess(zio_t *zio) zio->io_vsd = NULL; } + /* + * If a Direct I/O write checksum verify error has occurred, then this + * I/O should not be issued again. Instead, EIO will be returned. + */ + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); + ASSERT3U(zio->io_error, ==, EIO); + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + return (zio); + } + + if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); @@ -4822,6 +4856,49 @@ zio_checksum_verify(zio_t *zio) return (zio); } +static zio_t * +zio_dio_checksum_verify(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + int error; + + ASSERT3P(zio->io_vd, !=, NULL); + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + + if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0) + goto out; + + if ((error = zio_checksum_error(zio, NULL)) != 0) { + zio->io_error = error; + if (error == ECKSUM) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_dio_verify_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + zio->io_error = SET_ERROR(EIO); + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + + /* + * The EIO error must be propagated up to the logical + * parent ZIO in zio_notify_parent() so it can be + * returned to dmu_write_abd(). + */ + zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; + + (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0); + } + } + +out: + return (zio); +} + + /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ @@ -5152,7 +5229,8 @@ zio_done(zio_t *zio) * device is currently unavailable.
*/ if (zio->io_error != ECKSUM && zio->io_vd != NULL && - !vdev_is_dead(zio->io_vd)) { + !vdev_is_dead(zio->io_vd) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { @@ -5167,6 +5245,7 @@ zio_done(zio_t *zio) if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the @@ -5188,7 +5267,8 @@ zio_done(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && - !(zio->io_flags & ZIO_FLAG_CANFAIL)) { + !(zio->io_flags & ZIO_FLAG_CANFAIL) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else @@ -5239,6 +5319,14 @@ zio_done(zio_t *zio) if (zio->io_reexecute) { /* + * A Direct I/O write that has a checksum verify error should + * not attempt to reexecute. Instead, EAGAIN should just be + * propagated back up so the write can be attempted again + * through the ARC. + */ + ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); + + /* + * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not @@ -5398,6 +5486,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, + zio_dio_checksum_verify, zio_done };
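Putting the error plumbing together: a failed Direct I/O write verify surfaces to the caller rather than reexecuting, and the caller is expected to retry the write through the ARC. A runnable sketch of that retry shape; the sim_* functions are stand-ins for the zfs_write()/DMU paths, and the propagated error value follows the reexecute comment above:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Pretend direct path: fails verify once, as if the user buffer changed. */
static int sim_write_direct(bool *verify_failed)
{
	if (!*verify_failed) {
		*verify_failed = true;
		return (EAGAIN); /* propagated up from the checksum stage */
	}
	return (0);
}

static int sim_write_arc(void) { return (0); } /* buffered path */

int main(void)
{
	bool failed = false;
	int err = sim_write_direct(&failed);

	if (err == EAGAIN) {
		printf("direct write verify failed; retrying via ARC\n");
		err = sim_write_arc();
	}
	printf("final error: %d\n", err);
	return (err);
}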