diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/os/freebsd/spl/sys/mutex.h | 1 | ||||
-rw-r--r-- | include/os/freebsd/spl/sys/param.h | 1 | ||||
-rw-r--r-- | include/os/freebsd/spl/sys/uio.h | 31 | ||||
-rw-r--r-- | include/os/freebsd/zfs/sys/abd_os.h | 11 | ||||
-rw-r--r-- | include/os/linux/kernel/linux/kmap_compat.h | 47 | ||||
-rw-r--r-- | include/os/linux/spl/sys/uio.h | 47 | ||||
-rw-r--r-- | include/os/linux/zfs/sys/abd_os.h | 3 | ||||
-rw-r--r-- | include/os/linux/zfs/sys/zfs_znode_impl.h | 6 | ||||
-rw-r--r-- | include/os/linux/zfs/sys/zpl.h | 1 | ||||
-rw-r--r-- | include/sys/abd.h | 7 | ||||
-rw-r--r-- | include/sys/abd_impl.h | 14 | ||||
-rw-r--r-- | include/sys/arc.h | 3 | ||||
-rw-r--r-- | include/sys/dbuf.h | 29 | ||||
-rw-r--r-- | include/sys/dmu.h | 14 | ||||
-rw-r--r-- | include/sys/dmu_impl.h | 34 | ||||
-rw-r--r-- | include/sys/dmu_objset.h | 1 | ||||
-rw-r--r-- | include/sys/fm/fs/zfs.h | 2 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 11 | ||||
-rw-r--r-- | include/sys/spa.h | 12 | ||||
-rw-r--r-- | include/sys/uio_impl.h | 37 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 10 | ||||
-rw-r--r-- | include/sys/zfs_racct.h | 7 | ||||
-rw-r--r-- | include/sys/zfs_znode.h | 2 | ||||
-rw-r--r-- | include/sys/zio.h | 2 | ||||
-rw-r--r-- | include/sys/zio_impl.h | 7 |
25 files changed, 293 insertions, 47 deletions
diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index 8cfe56c75..bbff9fe80 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -70,4 +70,5 @@ typedef enum { #define mutex_exit(lock) sx_xunlock(lock) #define mutex_owned(lock) sx_xlocked(lock) #define mutex_owner(lock) sx_xholder(lock) + #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/freebsd/spl/sys/param.h b/include/os/freebsd/spl/sys/param.h index 92724e332..96440dce0 100644 --- a/include/os/freebsd/spl/sys/param.h +++ b/include/os/freebsd/spl/sys/param.h @@ -33,6 +33,7 @@ #include <sys/types.h> #include_next <sys/param.h> #define PAGESIZE PAGE_SIZE +#define PAGESHIFT PAGE_SHIFT #define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) #ifdef _KERNEL #include <sys/systm.h> diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index b9d41903e..2bd5bdb80 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -34,13 +34,30 @@ #include_next <sys/uio.h> #include <sys/_uio.h> #include <sys/debug.h> +#include <sys/sysmacros.h> + +/* + * uio_extflg: extended flags + */ +#define UIO_DIRECT 0x0001 /* Direct I/O request */ typedef struct iovec iovec_t; typedef enum uio_seg zfs_uio_seg_t; typedef enum uio_rw zfs_uio_rw_t; +/* + * This structure is used when doing Direct I/O. 
+ */ +typedef struct { + vm_page_t *pages; + int npages; +} zfs_uio_dio_t; + typedef struct zfs_uio { struct uio *uio; + offset_t uio_soffset; + uint16_t uio_extflg; + zfs_uio_dio_t uio_dio; } zfs_uio_t; #define GET_UIO_STRUCT(u) (u)->uio @@ -52,6 +69,7 @@ typedef struct zfs_uio { #define zfs_uio_iovbase(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_base #define zfs_uio_td(u) GET_UIO_STRUCT(u)->uio_td #define zfs_uio_rw(u) GET_UIO_STRUCT(u)->uio_rw +#define zfs_uio_soffset(u) (u)->uio_soffset #define zfs_uio_fault_disable(u, set) #define zfs_uio_prefaultpages(size, u) (0) @@ -62,6 +80,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void +zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off) +{ + ASSERT3U(zfs_uio_offset(uio), ==, off); + zfs_uio_soffset(uio) = off; +} + +static inline void zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { zfs_uio_resid(uio) -= size; @@ -71,7 +96,11 @@ zfs_uio_advance(zfs_uio_t *uio, ssize_t size) static __inline void zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s) { - GET_UIO_STRUCT(uio) = uio_s; + memset(uio, 0, sizeof (zfs_uio_t)); + if (uio_s != NULL) { + GET_UIO_STRUCT(uio) = uio_s; + zfs_uio_soffset(uio) = uio_s->uio_offset; + } } int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio); diff --git a/include/os/freebsd/zfs/sys/abd_os.h b/include/os/freebsd/zfs/sys/abd_os.h index 57122ee83..be825b3b8 100644 --- a/include/os/freebsd/zfs/sys/abd_os.h +++ b/include/os/freebsd/zfs/sys/abd_os.h @@ -26,10 +26,15 @@ #ifndef _ABD_OS_H #define _ABD_OS_H +#include <sys/vm.h> +#include <vm/vm_page.h> + #ifdef __cplusplus extern "C" { #endif +struct abd; + struct abd_scatter { uint_t abd_offset; void *abd_chunks[1]; /* actually variable-length */ @@ -37,8 +42,14 @@ struct abd_scatter { struct abd_linear { void *abd_buf; +#if defined(_KERNEL) + struct sf_buf *sf; /* for LINEAR_PAGE FreeBSD */ +#endif }; +__attribute__((malloc)) +struct abd *abd_alloc_from_pages(vm_page_t *, unsigned long, uint64_t); + #ifdef 
__cplusplus } #endif diff --git a/include/os/linux/kernel/linux/kmap_compat.h b/include/os/linux/kernel/linux/kmap_compat.h index fb59c5f02..432c0e991 100644 --- a/include/os/linux/kernel/linux/kmap_compat.h +++ b/include/os/linux/kernel/linux/kmap_compat.h @@ -38,6 +38,8 @@ #define zfs_kmap_local(page) kmap_atomic(page) #define zfs_kunmap_local(addr) kunmap_atomic(addr) #endif +#define zfs_kmap(page) kmap(page) +#define zfs_kunmap(page) kunmap(page) /* 5.0 API change - no more 'type' argument for access_ok() */ #ifdef HAVE_ACCESS_OK_TYPE @@ -46,4 +48,49 @@ #define zfs_access_ok(type, addr, size) access_ok(addr, size) #endif +/* + * read returning FOLL_WRITE is due to the fact that we are stating + * that the kernel will have write access to the user pages. So, when + * a Direct I/O read request is issued, the kernel must write to the user + * pages. + * + * get_user_pages_unlocked was not available before 4.0, so we also check + * for get_user_pages on older kernels. + */ +/* 4.9 API change - the read flag is passed as gup flags */ +#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, pages, read ? FOLL_WRITE : 0) + +/* 4.8 API change - no longer takes struct task_struct as argument */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, read, 0, pages) + +/* 4.0-4.3, 4.5-4.7 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \ + pages) + +/* 4.4 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, pages, \ + read ? 
FOLL_WRITE : 0) + +/* Using get_user_pages if kernel is < 4.0 */ +#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \ + NULL) +#else +/* + * This case is unreachable. We must be able to use either + * get_user_pages_unlocked() or get_user_pages() to map user pages into + * the kernel. + */ +#error "Unknown Direct I/O interface" +#endif + #endif /* _ZFS_KMAP_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 5e6ea8d3c..5d483685e 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -33,6 +33,12 @@ #include <linux/bio.h> #include <asm/uaccess.h> #include <sys/types.h> +#include <sys/string.h> + +/* + * uio_extflg: extended flags + */ +#define UIO_DIRECT 0x0001 /* Direct I/O request */ #if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) @@ -54,6 +60,14 @@ typedef enum zfs_uio_seg { #endif } zfs_uio_seg_t; +/* + * This structure is used when doing Direct I/O. 
+ */ +typedef struct { + struct page **pages; /* Mapped pages */ + long npages; /* Number of mapped pages */ +} zfs_uio_dio_t; + typedef struct zfs_uio { union { const struct iovec *uio_iov; @@ -62,15 +76,16 @@ typedef struct zfs_uio { struct iov_iter *uio_iter; #endif }; - int uio_iovcnt; - offset_t uio_loffset; - zfs_uio_seg_t uio_segflg; + int uio_iovcnt; /* Number of iovecs */ + offset_t uio_soffset; /* Starting logical offset */ + offset_t uio_loffset; /* Current logical offset */ + zfs_uio_seg_t uio_segflg; /* Segment type */ boolean_t uio_fault_disable; - uint16_t uio_fmode; - uint16_t uio_extflg; - ssize_t uio_resid; - - size_t uio_skip; + uint16_t uio_fmode; /* Access mode (unused) */ + uint16_t uio_extflg; /* Extra flags (UIO_DIRECT) */ + ssize_t uio_resid; /* Residual unprocessed bytes */ + size_t uio_skip; /* Skipped bytes in current iovec */ + zfs_uio_dio_t uio_dio; /* Direct I/O user pages */ struct request *rq; } zfs_uio_t; @@ -83,6 +98,7 @@ typedef struct zfs_uio { #define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base #define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set +#define zfs_uio_soffset(u) (u)->uio_soffset #define zfs_uio_rlimit_fsize(z, u) (0) #define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) @@ -95,6 +111,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void +zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off) +{ + ASSERT3U(zfs_uio_offset(uio), ==, off); + zfs_uio_soffset(uio) = off; +} + +static inline void zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; @@ -117,6 +140,8 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } static inline void @@ -146,6 +171,8 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request 
*rq) } uio->rq = rq; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } #if defined(HAVE_VFS_IOV_ITER) @@ -162,8 +189,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#endif +#endif /* HAVE_VFS_IOV_ITER */ #if defined(HAVE_ITER_IOV) #define zfs_uio_iter_iov(iter) iter_iov((iter)) diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h index ce4f5a2bd..606e8bf68 100644 --- a/include/os/linux/zfs/sys/abd_os.h +++ b/include/os/linux/zfs/sys/abd_os.h @@ -55,6 +55,9 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +__attribute__((malloc)) +abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); + #ifdef __cplusplus } #endif diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index 0be2c445a..e02886518 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -184,12 +184,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip); extern void zfs_inode_destroy(struct inode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); - -#if defined(HAVE_UIO_RW) -extern caddr_t zfs_map_page(page_t *, enum seg_rw); -extern void zfs_unmap_page(page_t *, caddr_t); -#endif /* HAVE_UIO_RW */ - extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE]; #ifdef __cplusplus diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 91a4751ff..c8eefe4fe 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -32,7 +32,6 @@ 
#include <linux/exportfs.h> #include <linux/falloc.h> #include <linux/parser.h> -#include <linux/task_io_accounting_ops.h> #include <linux/vfs_compat.h> #include <linux/writeback.h> #include <linux/xattr_compat.h> diff --git a/include/sys/abd.h b/include/sys/abd.h index 567b88c0f..bd3a7bd7c 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -46,6 +46,7 @@ typedef enum abd_flags { ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */ + ABD_FLAG_FROM_PAGES = 1 << 9, /* does not own pages */ } abd_flags_t; typedef struct abd { @@ -200,6 +201,12 @@ abd_get_size(abd_t *abd) return (abd->abd_size); } +static inline boolean_t +abd_is_from_pages(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_FROM_PAGES) ? B_TRUE : B_FALSE); +} + /* * Module lifecycle * Defined in each specific OS's abd_os.c diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 1eb25d94a..35a64f862 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -43,6 +43,9 @@ typedef enum abd_stats_op { /* forward declarations */ struct scatterlist; struct page; +#if defined(__FreeBSD__) && defined(_KERNEL) +struct sf_buf; +#endif struct abd_iter { /* public interface */ @@ -70,7 +73,11 @@ struct abd_iter { size_t iter_pos; size_t iter_offset; /* offset in current sg/abd_buf, */ /* abd_offset included */ +#if defined(__FreeBSD__) && defined(_KERNEL) + struct sf_buf *sf; /* used to map in vm_page_t FreeBSD */ +#else struct scatterlist *iter_sg; /* current sg */ +#endif }; extern abd_t *abd_zero_scatter; @@ -78,6 +85,7 @@ extern abd_t *abd_zero_scatter; abd_t *abd_gang_get_offset(abd_t *, size_t *); abd_t *abd_alloc_struct(size_t); void abd_free_struct(abd_t *); +void abd_init_struct(abd_t *); /* * OS specific functions @@ -108,9 +116,9 @@ void abd_iter_page(struct abd_iter *); #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) #define ABDSTAT_BUMPDOWN(stat) 
ABDSTAT_INCR(stat, -1) -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define ABD_GANG(abd) (abd->abd_u.abd_gang) +#define ABD_SCATTER(abd) ((abd)->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) ((abd)->abd_u.abd_linear.abd_buf) +#define ABD_GANG(abd) ((abd)->abd_u.abd_gang) #ifdef __cplusplus } diff --git a/include/sys/arc.h b/include/sys/arc.h index c92b3eee6..883c07b4f 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -120,7 +120,7 @@ typedef enum arc_flags /* * Private ARC flags. These flags are private ARC only flags that - * will show up in b_flags in the arc_hdr_buf_t. These flags should + * will show up in b_flags in the arc_buf_hdr_t. These flags should * only be set by ARC code. */ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ @@ -179,7 +179,6 @@ typedef enum arc_flags ARC_FLAG_COMPRESS_4 = 1 << 28, ARC_FLAG_COMPRESS_5 = 1 << 29, ARC_FLAG_COMPRESS_6 = 1 << 30 - } arc_flags_t; typedef enum arc_buf_flags { diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 8b03b1f89..56741cd2a 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -61,17 +61,17 @@ extern "C" { /* * The simplified state transition diagram for dbufs looks like: * - * +--> READ --+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * ^ | ^ ^ - * | | | | - * | +--> FILL --+ | - * | | | - * | | | - * | +------> NOFILL -----+ - * | | + * +-------> READ ------+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +-------> FILL ------+ | + * | | | | + * | | | | + * | +------> NOFILL -----+-----> UNCACHED + * | | (Direct I/O) * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. 
It is used by dbuf_free_range @@ -176,6 +176,7 @@ typedef struct dbuf_dirty_record { uint8_t dr_copies; boolean_t dr_nopwrite; boolean_t dr_brtwrite; + boolean_t dr_diowrite; boolean_t dr_has_raw_params; /* @@ -384,7 +385,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); @@ -393,6 +394,8 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp); +int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, @@ -473,7 +476,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); +boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 928f5f2b4..22cbd7fc7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -525,6 +525,7 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 +#define WP_DIRECT_WR 0x8 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); @@ -589,6 +590,7 @@ int 
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, dmu_buf_t ***dbpp, uint32_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); + /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. @@ -873,16 +875,20 @@ int dmu_free_long_object(objset_t *os, uint64_t object); #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ +#define DMU_DIRECTIO 4 /* use Direct I/O */ + int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); + void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx, uint32_t flags); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 83ae2b76b..4eaa39940 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -35,6 +35,10 @@ #include <sys/dnode.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> +#include <sys/uio.h> +#include <sys/abd.h> +#include <sys/arc.h> +#include <sys/dbuf.h> #ifdef __cplusplus extern "C" { @@ -134,7 +138,7 @@ extern "C" { * db_data_pending * db_dirtied * db_link - * db_dirty_node (??) 
+ * db_dirty_records * db_dirtycnt * db_d.* * db.* @@ -150,8 +154,10 @@ extern "C" { * dbuf_find: none (db_holds) * dbuf_hash_insert: none (db_holds) * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) + * dmu_sync: none (db_dirty_records, db_d) * dnode_reallocate: none (db) + * dmu_write_direct: none (db_dirty_records, db_d) + * dmu_write_direct_done: none (db_dirty_records, db_d) * * dn_mtx (leaf) * protects: @@ -234,8 +240,9 @@ extern "C" { * dnode_new_blkid */ -struct objset; struct dmu_pool; +struct dmu_buf; +struct zgd; typedef struct dmu_sendstatus { list_node_t dss_link; @@ -245,9 +252,30 @@ typedef struct dmu_sendstatus { uint64_t dss_blocks; /* blocks visited during the sending process */ } dmu_sendstatus_t; +/* + * dmu_sync_{ready/done} args + */ +typedef struct { + dbuf_dirty_record_t *dsa_dr; + void (*dsa_done)(struct zgd *, int); + struct zgd *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +void dmu_sync_done(zio_t *, arc_buf_t *buf, void *varg); +void dmu_sync_ready(zio_t *, arc_buf_t *buf, void *varg); + void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); +int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *); +int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags); +int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *); +#if defined(_KERNEL) +int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t); +int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index a9123e862..587dac738 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -134,6 +134,7 @@ struct objset { zfs_cache_type_t os_secondary_cache; zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; + zfs_direct_t os_direct; zfs_redundant_metadata_type_t 
os_redundant_metadata; uint64_t os_recordsize; /* diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index c746600cd..55b150c04 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -42,6 +42,7 @@ extern "C" { #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_DEADMAN "deadman" +#define FM_EREPORT_ZFS_DIO_VERIFY "dio_verify" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" @@ -84,6 +85,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS "dio_verify_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index fc4f22cd5..3852fa031 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -193,6 +193,7 @@ typedef enum { ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, + ZFS_PROP_DIRECT, ZFS_NUM_PROPS } zfs_prop_t; @@ -533,6 +534,12 @@ typedef enum { ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; +typedef enum { + ZFS_DIRECT_DISABLED = 0, + ZFS_DIRECT_STANDARD, + ZFS_DIRECT_ALWAYS +} zfs_direct_t; + typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, @@ -790,6 +797,9 @@ typedef struct zpool_load_policy { /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" +/* Number of Direct I/O write verify errors */ +#define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors" + /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" @@ -1262,6 +1272,7 @@ typedef struct vdev_stat { uint64_t vs_physical_ashift; /* 
vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ + uint64_t vs_dio_verify_errors; /* DIO write verify errors */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ diff --git a/include/sys/spa.h b/include/sys/spa.h index aa66d489e..ca30b60c0 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -949,6 +949,14 @@ typedef struct spa_iostats { kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; + kstat_named_t arc_read_count; + kstat_named_t arc_read_bytes; + kstat_named_t arc_write_count; + kstat_named_t arc_write_bytes; + kstat_named_t direct_read_count; + kstat_named_t direct_read_bytes; + kstat_named_t direct_write_count; + kstat_named_t direct_write_bytes; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); @@ -972,6 +980,10 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); +extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, + uint32_t flags); +extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, + uint32_t flags); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index aa34edda5..90b801272 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -40,10 +40,47 @@ #define _SYS_UIO_IMPL_H #include <sys/uio.h> +#include <sys/sysmacros.h> extern int zfs_uiomove(void *, size_t, zfs_uio_rw_t, zfs_uio_t *); extern int zfs_uiocopy(void *, size_t, zfs_uio_rw_t, zfs_uio_t *, size_t *); extern void zfs_uioskip(zfs_uio_t *, size_t); +extern void zfs_uio_free_dio_pages(zfs_uio_t *, zfs_uio_rw_t); +extern int 
zfs_uio_get_dio_pages_alloc(zfs_uio_t *, zfs_uio_rw_t); +extern boolean_t zfs_uio_page_aligned(zfs_uio_t *); + +static inline boolean_t +zfs_dio_page_aligned(void *buf) +{ + return ((((uintptr_t)(buf) & (PAGESIZE - 1)) == 0) ? + B_TRUE : B_FALSE); +} + +static inline boolean_t +zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) +{ + return (IS_P2ALIGNED(offset, blksz)); +} + +static inline boolean_t +zfs_dio_size_aligned(uint64_t size, uint64_t blksz) +{ + return ((size % blksz) == 0); +} + +static inline boolean_t +zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz) +{ + return (zfs_dio_offset_aligned(offset, blksz) && + zfs_dio_size_aligned(size, blksz)); +} + +static inline boolean_t +zfs_uio_aligned(zfs_uio_t *uio, uint64_t blksz) +{ + return (zfs_dio_aligned(zfs_uio_offset(uio), zfs_uio_resid(uio), + blksz)); +} static inline void zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 57ff31e89..abd66b8ab 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -448,9 +448,14 @@ struct vdev { /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. + * + * We also rate limit Direct I/O write verify errors, since a user might + * be continually manipulating a buffer that can flood ZED with tons of + * events. 
*/ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; + zfs_ratelimit_t vdev_dio_verify_rl; zfs_ratelimit_t vdev_checksum_rl; /* @@ -649,6 +654,11 @@ extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); +/* + * VDEV checksum verification for Direct I/O writes + */ +extern uint_t zfs_vdev_direct_write_verify; + #ifdef __cplusplus } #endif diff --git a/include/sys/zfs_racct.h b/include/sys/zfs_racct.h index 0e8bd04c1..ff84cccb0 100644 --- a/include/sys/zfs_racct.h +++ b/include/sys/zfs_racct.h @@ -26,12 +26,13 @@ #ifndef _SYS_ZFS_RACCT_H #define _SYS_ZFS_RACCT_H -#include <sys/zfs_context.h> +#include <sys/types.h> +#include <sys/spa.h> /* * Platform-dependent resource accounting hooks */ -void zfs_racct_read(uint64_t size, uint64_t iops); -void zfs_racct_write(uint64_t size, uint64_t iops); +void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); +void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); #endif /* _SYS_ZFS_RACCT_H */ diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index d71144807..c852c4758 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -308,7 +308,7 @@ extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, const char *dname, znode_t *szp, znode_t *wzp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, boolean_t commit, - zil_callback_t callback, void *callback_data); + boolean_t o_direct, zil_callback_t callback, void *callback_data); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zio.h b/include/sys/zio.h index 3a756949a..628416e98 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -225,6 +225,7 @@ typedef uint64_t zio_flag_t; 
#define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 31) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -355,6 +356,7 @@ typedef struct zio_prop { boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; + boolean_t zp_direct_write; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 2c846a5d4..a5e3ab238 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -160,8 +160,9 @@ enum zio_stage { ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ + ZIO_STAGE_DIO_CHECKSUM_VERIFY = 1 << 25, /* -W---- */ - ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */ + ZIO_STAGE_DONE = 1 << 26 /* RWFCXT */ }; #define ZIO_ROOT_PIPELINE \ @@ -227,6 +228,10 @@ enum zio_stage { ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) +#define ZIO_DIRECT_WRITE_PIPELINE \ + ZIO_WRITE_PIPELINE & \ + (~ZIO_STAGE_ISSUE_ASYNC) + #define ZIO_DDT_CHILD_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_VDEV_IO_STAGES | \ |