path: root/module/zfs
author	Brian Atkinson <[email protected]>	2024-09-14 16:47:59 -0400
committer	GitHub <[email protected]>	2024-09-14 13:47:59 -0700
commit	a10e552b9992673626f7a2ffcc234337f23410c9 (patch)
tree	90825de54248238315a5478c7a824935af09bb3c /module/zfs
parent	1713aa7b4d209616fab96a68e17a6fec6837247c (diff)
Adding Direct IO Support
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads.

O_DIRECT support in ZFS will always ensure there is coherency between buffered and O_DIRECT IO requests. This ensures that all IO requests, whether buffered or direct, will see the same file contents at all times. Just as in other filesystems, O_DIRECT does not imply O_SYNC. While data is written directly to VDEV disks, metadata will not be synced until the associated TXG is synced.

For both O_DIRECT read and write requests the offset and request size must, at a minimum, be PAGE_SIZE aligned. In the event they are not, then EINVAL is returned unless the direct property is set to always (see below).

For O_DIRECT writes:
The request also must be block aligned (recordsize) or the write request will take the normal (buffered) write path. In the event that the request is block aligned and a cached copy of the buffer exists in the ARC, it will be discarded from the ARC, forcing all further reads to retrieve the data from disk.

For O_DIRECT reads:
The only alignment restrictions are PAGE_SIZE alignment. In the event that the requested data is already buffered (in the ARC), it will just be copied from the ARC into the user buffer.

For both O_DIRECT writes and reads, the O_DIRECT flag will be ignored in the event that file contents are mmap'ed. In this case, all requests that are at least PAGE_SIZE aligned will just fall back to the buffered paths. If the request is not PAGE_SIZE aligned, however, EINVAL will be returned as always, regardless of whether the file's contents are mmap'ed.

Since O_DIRECT writes go through the normal ZIO pipeline, the following operations are supported just as with normal buffered writes:
Checksum
Compression
Encryption
Erasure Coding

There is one caveat for the data integrity of O_DIRECT writes that is distinct for each of the OS's supported by ZFS.

FreeBSD - FreeBSD is able to place user pages under write protection, so any data in the user buffers written directly down to the VDEV disks is guaranteed not to change. There is no concern with data integrity and O_DIRECT writes.

Linux - Linux is not able to place anonymous user pages under write protection. Because of this, if the user decides to manipulate the page contents while the write operation is occurring, data integrity can not be guaranteed. However, there is a module parameter `zfs_vdev_direct_write_verify` that controls whether a checksum verification is run on an O_DIRECT write to a top-level VDEV before the contents of the I/O buffer are committed to disk. In the event of a checksum verification failure the write will return EIO. The number of O_DIRECT write checksum verification errors can be observed by doing `zpool status -d`, which will list all verification errors that have occurred on a top-level VDEV. Along with `zpool status`, a ZED event will be issued as `dio_verify` when a checksum verification error occurs.

ZVOLs and dedup are not currently supported with Direct I/O.

A new dataset property `direct` has been added with the following 3 allowable values:
disabled - Accepts the O_DIRECT flag, but silently ignores it and treats the request as a buffered IO request.
standard - Follows the alignment restrictions outlined above for write/read IO requests when the O_DIRECT flag is used.
always - Treats every write/read IO request as though it passed O_DIRECT and will do O_DIRECT if the alignment restrictions are met; otherwise it will redirect the request through the ARC. This property will not allow a request to fail.
There is also a module parameter zfs_dio_enabled that can be used to force all reads and writes through the ARC. Setting this module parameter to 0 behaves as if the direct dataset property were set to disabled.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Atkinson <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Matt Macy <[email protected]>
Co-authored-by: Brian Behlendorf <[email protected]>
Closes #10018
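As a minimal illustration of the alignment rules above, the hedged C sketch below opens a file on a ZFS dataset with O_DIRECT and issues a page-aligned, recordsize-aligned write followed by a read. It uses only standard POSIX calls; the file path and the assumed 128K recordsize are placeholders, and the dataset is assumed to have direct=standard.

#define _GNU_SOURCE	/* for O_DIRECT on Linux */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const size_t recordsize = 128 * 1024;	/* assumed dataset recordsize */
	void *buf;

	/* O_DIRECT buffers and offsets must be at least PAGE_SIZE aligned. */
	if (posix_memalign(&buf, sysconf(_SC_PAGESIZE), recordsize) != 0)
		return (1);
	memset(buf, 0xab, recordsize);

	/* Hypothetical path; any file on a dataset with direct=standard. */
	int fd = open("/tank/fs/testfile", O_CREAT | O_RDWR | O_DIRECT, 0644);
	if (fd == -1) {
		perror("open");
		return (1);
	}

	/*
	 * A recordsize-aligned write at a recordsize-aligned offset can take
	 * the Direct I/O path; a misaligned offset or length either falls
	 * back to the buffered (ARC) path or fails with EINVAL if it is not
	 * page aligned.
	 */
	if (pwrite(fd, buf, recordsize, 0) != (ssize_t)recordsize)
		perror("pwrite");

	/* O_DIRECT reads only require PAGE_SIZE alignment. */
	if (pread(fd, buf, recordsize, 0) != (ssize_t)recordsize)
		perror("pread");

	close(fd);
	free(buf);
	return (0);
}

Under direct=standard a non-page-aligned buffer or offset would return EINVAL, while a sub-recordsize write would simply fall back to the buffered (ARC) path.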
Diffstat (limited to 'module/zfs')
-rw-r--r--	module/zfs/abd.c	94
-rw-r--r--	module/zfs/arc.c	2
-rw-r--r--	module/zfs/dataset_kstats.c	6
-rw-r--r--	module/zfs/dbuf.c	317
-rw-r--r--	module/zfs/dmu.c	154
-rw-r--r--	module/zfs/dmu_direct.c	395
-rw-r--r--	module/zfs/dmu_objset.c	19
-rw-r--r--	module/zfs/spa_stats.c	46
-rw-r--r--	module/zfs/vdev.c	27
-rw-r--r--	module/zfs/vdev_label.c	4
-rw-r--r--	module/zfs/zfs_fm.c	2
-rw-r--r--	module/zfs/zfs_ioctl.c	1
-rw-r--r--	module/zfs/zfs_log.c	4
-rw-r--r--	module/zfs/zfs_vnops.c	294
-rw-r--r--	module/zfs/zio.c	113
15 files changed, 1250 insertions, 228 deletions
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index c8c4d2270..529deeecf 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -89,8 +89,8 @@
* functions.
*
* As an additional feature, linear and scatter ABD's can be stitched together
- * by using the gang ABD type (abd_alloc_gang_abd()). This allows for
- * multiple ABDs to be viewed as a singular ABD.
+ * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs
+ * to be viewed as a singular ABD.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
* B_FALSE.
@@ -109,11 +109,15 @@ void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
- ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ if (abd_is_from_pages(abd)) {
+ ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
+ } else {
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ }
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
- ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
+ ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@@ -136,7 +140,7 @@ abd_verify(abd_t *abd)
#endif
}
-static void
+void
abd_init_struct(abd_t *abd)
{
list_link_init(&abd->abd_gang_link);
@@ -238,6 +242,7 @@ abd_free_linear(abd_t *abd)
abd_free_linear_page(abd);
return;
}
+
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
} else {
@@ -520,6 +525,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
*/
abd->abd_flags |= ABD_FLAG_LINEAR;
+ /*
+ * User pages from Direct I/O requests may be in a single page
+ * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
+ * that here for abd. This is required because we have to be
+ * careful when borrowing the buffer from the ABD because we
+ * can not place user pages under write protection on Linux.
+ * See the comments in abd_os.c for abd_borrow_buf(),
+ * abd_borrow_buf_copy(), abd_return_buf() and
+ * abd_return_buf_copy().
+ */
+ if (abd_is_from_pages(sabd)) {
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES |
+ ABD_FLAG_LINEAR_PAGE;
+ }
+
ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
} else if (abd_is_gang(sabd)) {
size_t left = size;
@@ -648,70 +668,6 @@ abd_to_buf(abd_t *abd)
return (ABD_LINEAR_BUF(abd));
}
-/*
- * Borrow a raw buffer from an ABD without copying the contents of the ABD
- * into the buffer. If the ABD is scattered, this will allocate a raw buffer
- * whose contents are undefined. To copy over the existing data in the ABD, use
- * abd_borrow_buf_copy() instead.
- */
-void *
-abd_borrow_buf(abd_t *abd, size_t n)
-{
- void *buf;
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- buf = abd_to_buf(abd);
- } else {
- buf = zio_buf_alloc(n);
- }
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-#endif
- return (buf);
-}
-
-void *
-abd_borrow_buf_copy(abd_t *abd, size_t n)
-{
- void *buf = abd_borrow_buf(abd, n);
- if (!abd_is_linear(abd)) {
- abd_copy_to_buf(buf, abd, n);
- }
- return (buf);
-}
-
-/*
- * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * not change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
- */
-void
-abd_return_buf(abd_t *abd, void *buf, size_t n)
-{
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-#endif
- if (abd_is_linear(abd)) {
- ASSERT3P(buf, ==, abd_to_buf(abd));
- } else {
- ASSERT0(abd_cmp_buf(abd, buf, n));
- zio_buf_free(buf, n);
- }
-}
-
-void
-abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
-{
- if (!abd_is_linear(abd)) {
- abd_copy_from_buf(abd, buf, n);
- }
- abd_return_buf(abd, buf, n);
-}
-
void
abd_release_ownership_of_buf(abd_t *abd)
{
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 714a30e86..b5bcd367b 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5961,7 +5961,7 @@ top:
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
metadata, misses);
- zfs_racct_read(size, 1);
+ zfs_racct_read(spa, size, 1, 0);
}
/* Check if the spa even has l2 configured */
diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c
index 914260e74..27a04c2af 100644
--- a/module/zfs/dataset_kstats.c
+++ b/module/zfs/dataset_kstats.c
@@ -217,8 +217,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
}
void
-dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
- int64_t nwritten)
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten)
{
ASSERT3S(nwritten, >=, 0);
@@ -230,8 +229,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
}
void
-dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
- int64_t nread)
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread)
{
ASSERT3S(nread, >=, 0);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 099883ba2..df9368fc8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -628,7 +628,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
* L2ARC.
*/
boolean_t
-dbuf_is_l2cacheable(dmu_buf_impl_t *db)
+dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
{
if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
(db->db_objset->os_secondary_cache ==
@@ -636,10 +636,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db)
if (l2arc_exclude_special == 0)
return (B_TRUE);
- blkptr_t *bp = db->db_blkptr;
- if (bp == NULL || BP_IS_HOLE(bp))
+ /*
+ * bp must be checked in the event it was passed from
+ * dbuf_read_impl() as the result of the BP being set from
+ * a Direct I/O write in dbuf_read(). See comments in
+ * dbuf_read().
+ */
+ blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;
+
+ if (db_bp == NULL || BP_IS_HOLE(db_bp))
return (B_FALSE);
- uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+ uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
vdev_t *vd = NULL;
@@ -1380,6 +1387,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
mutex_enter(&db->db_mtx);
ASSERT3U(db->db_state, ==, DB_READ);
+
/*
* All reads are synchronous, so we must have a hold on the dbuf
*/
@@ -1570,12 +1578,11 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
*/
static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
- db_lock_type_t dblt, const void *tag)
+ db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
- blkptr_t bp, *bpp = NULL;
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1589,43 +1596,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
goto early_unlock;
}
- /*
- * If we have a pending block clone, we don't want to read the
- * underlying block, but the content of the block being cloned,
- * pointed by the dirty record, so we have the most recent data.
- * If there is no dirty record, then we hit a race in a sync
- * process when the dirty record is already removed, while the
- * dbuf is not yet destroyed. Such case is equivalent to uncached.
- */
- if (db->db_state == DB_NOFILL) {
- dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
- if (dr != NULL) {
- if (!dr->dt.dl.dr_brtwrite) {
- err = EIO;
- goto early_unlock;
- }
- bp = dr->dt.dl.dr_overridden_by;
- bpp = &bp;
- }
- }
-
- if (bpp == NULL && db->db_blkptr != NULL) {
- bp = *db->db_blkptr;
- bpp = &bp;
- }
-
- err = dbuf_read_hole(db, dn, bpp);
+ err = dbuf_read_hole(db, dn, bp);
if (err == 0)
goto early_unlock;
- ASSERT(bpp != NULL);
+ ASSERT(bp != NULL);
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(bpp)) {
+ if (BP_IS_REDACTED(bp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
@@ -1640,9 +1622,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
spa_log_error(db->db_objset->os_spa, &zb,
- BP_GET_LOGICAL_BIRTH(bpp));
+ BP_GET_LOGICAL_BIRTH(bp));
err = SET_ERROR(EIO);
goto early_unlock;
}
@@ -1653,7 +1635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
if (!DBUF_IS_CACHEABLE(db))
aflags |= ARC_FLAG_UNCACHED;
- else if (dbuf_is_l2cacheable(db))
+ else if (dbuf_is_l2cacheable(db, bp))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
@@ -1661,17 +1643,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
- if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
+
/*
- * The zio layer will copy the provided blkptr later, but we have our
- * own copy so that we can release the parent's rwlock. We have to
- * do that so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we need to
+ * do this now so that we can release the parent's rwlock. We have to
+ * do that now so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
+ blkptr_t copy = *bp;
dmu_buf_unlock_parent(db, dblt, tag);
- return (arc_read(zio, db->db_objset->os_spa, bpp,
+ return (arc_read(zio, db->db_objset->os_spa, &copy,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb));
@@ -1844,13 +1828,30 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
- if (pio == NULL && (db->db_state == DB_NOFILL ||
- (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
- spa_t *spa = dn->dn_objset->os_spa;
- pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- need_wait = B_TRUE;
+ blkptr_t *bp;
+
+ /*
+ * If a block clone or Direct I/O write has occurred we will
+ * get the dirty records overridden BP so we get the most
+ * recent data.
+ */
+ err = dmu_buf_get_bp_from_dbuf(db, &bp);
+
+ if (!err) {
+ if (pio == NULL && (db->db_state == DB_NOFILL ||
+ (bp != NULL && !BP_IS_HOLE(bp)))) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ pio =
+ zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ need_wait = B_TRUE;
+ }
+
+ err =
+ dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
}
- err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
miss = (db->db_state != DB_CACHED);
}
@@ -1918,6 +1919,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
+
/*
* This assert is valid because dmu_sync() expects to be called by
* a zilog's get_data while holding a range lock. This call only
@@ -1936,16 +1938,20 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
- if (dr->dt.dl.dr_brtwrite) {
+ if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
ASSERT0P(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf;
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_brtwrite = B_FALSE;
+ dr->dt.dl.dr_diowrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;
/*
+ * In the event that Direct I/O was used, we do not
+ * need to release the buffer from the ARC.
+ *
* Release the already-written buffer, so we leave it in
* a consistent dirty state. Note that all callers are
* modifying the buffer, so they will immediately do
@@ -2084,6 +2090,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
*/
dmu_buf_will_dirty(&db->db, tx);
+ VERIFY3P(db->db_buf, !=, NULL);
+
/* create the data buffer for the new block */
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
@@ -2532,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
boolean_t brtwrite;
+ boolean_t diowrite;
ASSERT(txg != 0);
@@ -2557,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_dbuf == db);
brtwrite = dr->dt.dl.dr_brtwrite;
+ diowrite = dr->dt.dl.dr_diowrite;
if (brtwrite) {
+ ASSERT3B(diowrite, ==, B_FALSE);
/*
* We are freeing a block that we cloned in the same
* transaction group.
@@ -2598,10 +2609,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
+ if (dr->dt.dl.dr_data != db->db_buf) {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -2610,7 +2622,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
@@ -2670,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
* want to make sure dbuf_read() will read the pending cloned block and
* not the uderlying block that is being replaced. dbuf_undirty() will
- * do dbuf_unoverride(), so we will end up with cloned block content,
- * without overridden BP.
+ * do brt_pending_remove() before removing the dirty record.
*/
(void) dbuf_read(db, NULL, flags);
if (undirty) {
@@ -2701,23 +2712,126 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
return (dr != NULL);
}
+/*
+ * Normally the db_blkptr points to the most recent on-disk content for the
+ * dbuf (and anything newer will be cached in the dbuf). However, a pending
+ * block clone or not yet synced Direct I/O write will have a dirty record BP
+ * pointing to the most recent data.
+ */
+int
+dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ int error = 0;
+
+ if (db->db_level != 0) {
+ *bp = db->db_blkptr;
+ return (0);
+ }
+
+ *bp = db->db_blkptr;
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+ if (dr && db->db_state == DB_NOFILL) {
+ /* Block clone */
+ if (!dr->dt.dl.dr_brtwrite)
+ error = EIO;
+ else
+ *bp = &dr->dt.dl.dr_overridden_by;
+ } else if (dr && db->db_state == DB_UNCACHED) {
+ /* Direct I/O write */
+ if (dr->dt.dl.dr_diowrite)
+ *bp = &dr->dt.dl.dr_overridden_by;
+ }
+
+ return (error);
+}
+
+/*
+ * Direct I/O reads can read directly from the ARC, but the data has
+ * to be untransformed in order to copy it over into user pages.
+ */
+int
+dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
+{
+ int err = 0;
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+
+ ASSERT3S(db->db_state, ==, DB_CACHED);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ /*
+ * Ensure that this block's dnode has been decrypted if
+ * the caller has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, dn, 0);
+
+ /*
+ * If the arc buf is compressed or encrypted and the caller
+ * requested uncompressed data, we need to untransform it
+ * before returning. We also call arc_untransform() on any
+ * unauthenticated blocks, which will verify their MAC if
+ * the key is now available.
+ */
+ if (err == 0 && db->db_buf != NULL &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_is_unauthenticated(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
+ dbuf_set_data(db, db->db_buf);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_hits);
+
+ return (err);
+}
+
void
-dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
+ /*
+ * Block clones and Direct I/O writes always happen in open-context.
+ */
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT0(db->db_level);
+ ASSERT(!dmu_tx_is_syncing(tx));
+ ASSERT0(db->db_level);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
- /*
- * Block cloning: We are going to clone into this block, so undirty
- * modifications done to this block so far in this txg. This includes
- * writes and clones into this block.
- */
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
- VERIFY(!dbuf_undirty(db, tx));
+
+ /*
+ * We are going to clone or issue a Direct I/O write on this block, so
+ * undirty modifications done to this block so far in this txg. This
+ * includes writes and clones into this block.
+ *
+ * If there is a dirty record associated with this txg from a previous Direct
+ * I/O write then space accounting cleanup takes place. It is important
+ * to go ahead and free up the space accounting through dbuf_undirty() ->
+ * dbuf_unoverride() -> zio_free(). Space accounting for determining
+ * if a write can occur in zfs_write() happens through dmu_tx_assign().
+ * This can cause an issue with Direct I/O writes in the case of
+ * overwriting the same block, because all DVA allocations are being
+ * done in open-context. Constantly allowing Direct I/O overwrites to
+ * the same block can exhaust the pools available space leading to
+ * ENOSPC errors at the DVA allocation part of the ZIO pipeline, which
+ * will eventually suspend the pool. By cleaning up space accounting
+ * now, the ENOSPC error can be avoided.
+ *
+ * Since we are undirtying the record in open-context, we must have a
+ * hold on the db, so it should never be evicted after calling
+ * dbuf_undirty().
+ */
+ VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+
if (db->db_buf != NULL) {
/*
* If there is an associated ARC buffer with this dbuf we can
@@ -2728,6 +2842,11 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
arc_buf_destroy(db->db_buf, db);
+ /*
+ * Setting the dbuf's data pointers to NULL will force all
+ * future reads down to the devices to get the most up to date
+ * version of the data after a Direct I/O write has completed.
+ */
db->db_buf = NULL;
dbuf_clear_data(db);
}
@@ -2736,7 +2855,8 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
ASSERT3P(db->db.db_data, ==, NULL);
db->db_state = DB_NOFILL;
- DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+ DTRACE_SET_STATE(db,
+ "allocating NOFILL buffer for clone or direct I/O write");
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
@@ -2773,21 +2893,28 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_tx_private_ok(tx));
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_NOFILL) {
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ if (db->db_state == DB_NOFILL ||
+ (db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
/*
- * Block cloning: We will be completely overwriting a block
- * cloned in this transaction group, so let's undirty the
- * pending clone and mark the block as uncached. This will be
- * as if the clone was never done. But if the fill can fail
- * we should have a way to return back to the cloned data.
+ * If the fill can fail we should have a way to return back to
+ * the cloned or Direct I/O write data.
*/
- if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ if (canfail && dr) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
return;
}
- VERIFY(!dbuf_undirty(db, tx));
- db->db_state = DB_UNCACHED;
+ /*
+ * Block cloning: We will be completely overwriting a block
+ * cloned in this transaction group, so let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done.
+ */
+ if (dr && dr->dt.dl.dr_brtwrite) {
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
+ }
}
mutex_exit(&db->db_mtx);
@@ -4080,7 +4207,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
} else {
mutex_exit(&db->db_mtx);
}
-
}
#pragma weak dmu_buf_refcount = dbuf_refcount
@@ -4540,24 +4666,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
/*
- * To be synced, we must be dirtied. But we
- * might have been freed after the dirty.
+ * To be synced, we must be dirtied. But we might have been freed
+ * after the dirty.
*/
if (db->db_state == DB_UNCACHED) {
/* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else if (db->db_state == DB_READ) {
/*
- * This buffer has a clone we need to write, and an in-flight
- * read on the BP we're about to clone. Its safe to issue the
- * write here because the read has already been issued and the
- * contents won't change.
+ * This buffer was either cloned or had a Direct I/O write
+ * occur and has an in-flight read on the BP. It is safe to
+ * issue the write here, because the read has already been
+ * issued and the contents won't change.
+ *
+ * We can verify the case of both the clone and Direct I/O
+ * write by making sure the first dirty record for the dbuf
+ * has no ARC buffer associated with it.
*/
- ASSERT(dr->dt.dl.dr_brtwrite &&
- dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
+ dbuf_dirty_record_t *dr_head =
+ list_head(&db->db_dirty_records);
+ ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
+ ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
+ ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
@@ -4608,8 +4742,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immediate write,
- * wait for the synchronous IO to complete.
+ * If this buffer is in the middle of an immediate write, wait for the
+ * synchronous IO to complete.
+ *
+ * This is also valid even with Direct I/O writes setting a dirty
+ * records override state into DR_IN_DMU_SYNC, because all
+ * Direct I/O writes happen in open-context.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -4913,8 +5051,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_level == 0) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ /* no dr_data if this is a NO_FILL or Direct I/O */
if (dr->dt.dl.dr_data != NULL &&
dr->dt.dl.dr_data != db->db_buf) {
+ ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
+ ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
@@ -5180,7 +5322,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* The BP for this block has been provided by open context
- * (by dmu_sync() or dmu_buf_write_embedded()).
+ * (by dmu_sync(), dmu_write_direct(),
+ * or dmu_buf_write_embedded()).
*/
abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
@@ -5219,7 +5362,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
- dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+ dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
children_ready_cb, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
@@ -5239,7 +5382,7 @@ EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
-EXPORT_SYMBOL(dmu_buf_will_clone);
+EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b3eda8ea5..3f87cfe6b 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -609,8 +609,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp[i] = &db->db;
}
- if (!read)
- zfs_racct_write(length, nblks);
+ /*
+ * If we are doing O_DIRECT we still hold the dbufs, even for reads,
+ * but we do not issue any reads here. We do not want to account for
+ * writes in this case.
+ *
+ * O_DIRECT write/read accounting takes place in
+ * dmu_{write/read}_abd().
+ */
+ if (!read && ((flags & DMU_DIRECTIO) == 0))
+ zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
if (zs)
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
@@ -897,7 +905,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
/*
* Get the next "chunk" of file data to free. We traverse the file from
- * the end so that the file gets shorter over time (if we crashes in the
+ * the end so that the file gets shorter over time (if we crash in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
*
@@ -1168,7 +1176,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
/*
* Deal with odd block sizes, where there can't be data past the first
- * block. If we ever do the tail block optimization, we will need to
+ * block. If we ever do the tail block optimization, we will need to
* handle that here as well.
*/
if (dn->dn_maxblkid == 0) {
@@ -1178,6 +1186,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
size = newsz;
}
+ if (size == 0)
+ return (0);
+
+ /* Allow Direct I/O when requested and properly aligned */
+ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
+ zfs_dio_aligned(offset, size, PAGESIZE)) {
+ abd_t *data = abd_get_from_buf(buf, size);
+ err = dmu_read_abd(dn, offset, size, data, flags);
+ abd_free(data);
+ return (err);
+ }
+
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
int i;
@@ -1286,22 +1306,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
}
/*
- * Note: Lustre is an external consumer of this interface.
+ * This interface is not used internally by ZFS but is provided for
+ * use by Lustre which is built on the DMU interfaces.
*/
-void
-dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
+int
+dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx, uint32_t flags)
{
dmu_buf_t **dbp;
int numbufs;
+ int error;
if (size == 0)
- return;
+ return (0);
+
+ /* Allow Direct I/O when requested and properly aligned */
+ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
+ zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
+ abd_t *data = abd_get_from_buf((void *)buf, size);
+ error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+ abd_free(data);
+ return (error);
+ }
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
+
+int
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
}
void
@@ -1365,6 +1404,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
dmu_buf_t **dbp;
int numbufs, i, err;
+ if (uio->uio_extflg & UIO_DIRECT)
+ return (dmu_read_uio_direct(dn, uio, size));
+
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
@@ -1453,23 +1495,53 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
dmu_buf_t **dbp;
int numbufs;
int err = 0;
- int i;
+ uint64_t write_size;
- err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+top:
+ write_size = size;
+
+ /*
+ * We only allow Direct I/O writes to happen if we are block
+ * sized aligned. Otherwise, we pass the write off to the ARC.
+ */
+ if ((uio->uio_extflg & UIO_DIRECT) &&
+ (write_size >= dn->dn_datablksz)) {
+ if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
+ dn->dn_datablksz)) {
+ return (dmu_write_uio_direct(dn, uio, size, tx));
+ } else if (write_size > dn->dn_datablksz &&
+ zfs_dio_offset_aligned(zfs_uio_offset(uio),
+ dn->dn_datablksz)) {
+ write_size =
+ dn->dn_datablksz * (write_size / dn->dn_datablksz);
+ err = dmu_write_uio_direct(dn, uio, write_size, tx);
+ if (err == 0) {
+ size -= write_size;
+ goto top;
+ } else {
+ return (err);
+ }
+ } else {
+ write_size =
+ P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
+ }
+ }
+
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);
- for (i = 0; i < numbufs; i++) {
+ for (int i = 0; i < numbufs; i++) {
uint64_t tocpy;
int64_t bufoff;
dmu_buf_t *db = dbp[i];
- ASSERT(size > 0);
+ ASSERT(write_size > 0);
offset_t off = zfs_uio_offset(uio);
bufoff = off - db->db_offset;
- tocpy = MIN(db->db_size - bufoff, size);
+ tocpy = MIN(db->db_size - bufoff, write_size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
@@ -1489,10 +1561,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
if (err)
break;
+ write_size -= tocpy;
size -= tocpy;
}
+ IMPLY(err == 0, write_size == 0);
+
dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
+ goto top;
+ }
+
return (err);
}
@@ -1731,7 +1811,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
* same size as the dbuf.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
- zfs_racct_write(blksz, 1);
+ zfs_racct_write(os->os_spa, blksz, 1, 0);
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
@@ -1761,23 +1841,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
return (err);
}
-typedef struct {
- dbuf_dirty_record_t *dsa_dr;
- dmu_sync_cb_t *dsa_done;
- zgd_t *dsa_zgd;
- dmu_tx_t *dsa_tx;
-} dmu_sync_arg_t;
-
-static void
+void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
dmu_sync_arg_t *dsa = varg;
- dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
- blkptr_t *bp = zio->io_bp;
if (zio->io_error == 0) {
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ blkptr_t *bp = zio->io_bp;
+
if (BP_IS_HOLE(bp)) {
+ dmu_buf_t *db = NULL;
+ if (dr)
+ db = &(dr->dr_dbuf->db);
+ else
+ db = dsa->dsa_zgd->zgd_db;
/*
* A block of zeros may compress to a hole, but the
* block size still needs to be known for replay.
@@ -1796,7 +1875,7 @@ dmu_sync_late_arrival_ready(zio_t *zio)
dmu_sync_ready(zio, NULL, zio->io_private);
}
-static void
+void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
@@ -1809,7 +1888,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
*/
- if (zio->io_error == 0) {
+ if (zgd && zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
@@ -1848,10 +1927,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
}
+
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
- dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+ if (dsa->dsa_done)
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
kmem_free(dsa, sizeof (*dsa));
}
@@ -2120,9 +2201,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dsa->dsa_tx = NULL;
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
- dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
- &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+ dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
+ dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
+ dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
+ &zb));
return (0);
}
@@ -2385,6 +2467,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_nopwrite = nopwrite;
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -2594,7 +2677,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
ASSERT(db->db_blkid != DMU_SPILL_BLKID);
ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
- dmu_buf_will_clone(dbuf, tx);
+ dmu_buf_will_clone_or_dio(dbuf, tx);
mutex_enter(&db->db_mtx);
@@ -2817,8 +2900,15 @@ EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_long_object);
EXPORT_SYMBOL(dmu_read);
EXPORT_SYMBOL(dmu_read_by_dnode);
+EXPORT_SYMBOL(dmu_read_uio);
+EXPORT_SYMBOL(dmu_read_uio_dbuf);
+EXPORT_SYMBOL(dmu_read_uio_dnode);
EXPORT_SYMBOL(dmu_write);
EXPORT_SYMBOL(dmu_write_by_dnode);
+EXPORT_SYMBOL(dmu_write_by_dnode_flags);
+EXPORT_SYMBOL(dmu_write_uio);
+EXPORT_SYMBOL(dmu_write_uio_dbuf);
+EXPORT_SYMBOL(dmu_write_uio_dnode);
EXPORT_SYMBOL(dmu_prealloc);
EXPORT_SYMBOL(dmu_object_info);
EXPORT_SYMBOL(dmu_object_info_from_dnode);
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
new file mode 100644
index 000000000..91a7fd8df
--- /dev/null
+++ b/module/zfs/dmu_direct.c
@@ -0,0 +1,395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_racct.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dmu_objset.h>
+
+static abd_t *
+make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
+ uint64_t size)
+{
+ size_t buf_size = db->db.db_size;
+ abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
+ size_t buf_off = 0;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (offset > db->db.db_offset) {
+ size_t pre_size = offset - db->db.db_offset;
+ pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
+ buf_size -= pre_size;
+ buf_off = 0;
+ } else {
+ buf_off = db->db.db_offset - offset;
+ size -= buf_off;
+ }
+
+ if (size < buf_size) {
+ size_t post_size = buf_size - size;
+ post_buf = abd_alloc_for_io(post_size, B_TRUE);
+ buf_size -= post_size;
+ }
+
+ ASSERT3U(buf_size, >, 0);
+ abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);
+
+ if (pre_buf || post_buf) {
+ mbuf = abd_alloc_gang();
+ if (pre_buf)
+ abd_gang_add(mbuf, pre_buf, B_TRUE);
+ abd_gang_add(mbuf, buf, B_TRUE);
+ if (post_buf)
+ abd_gang_add(mbuf, post_buf, B_TRUE);
+ } else {
+ mbuf = buf;
+ }
+
+ return (mbuf);
+}
+
+static void
+dmu_read_abd_done(zio_t *zio)
+{
+ abd_free(zio->io_abd);
+}
+
+static void
+dmu_write_direct_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dmu_write_direct_done(zio_t *zio)
+{
+ dmu_sync_arg_t *dsa = zio->io_private;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
+ db->db_state = DB_UNCACHED;
+ mutex_exit(&db->db_mtx);
+
+ dmu_sync_done(zio, NULL, zio->io_private);
+
+ if (zio->io_error != 0) {
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+ ASSERT3U(zio->io_error, ==, EIO);
+
+ /*
+ * In the event of an I/O error this block has been freed in
+ * zio_done() through zio_dva_unallocate(). Calling
+ * dmu_sync_done() above set dr_override_state to
+ * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
+ * dbuf_unoverride(), it will skip doing zio_free() to free
+ * this block as that was already taken care of.
+ *
+ * Since we are undirtying the record in open-context, we must
+ * have a hold on the db, so it should never be evicted after
+ * calling dbuf_undirty().
+ */
+ mutex_enter(&db->db_mtx);
+ VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
+ mutex_exit(&db->db_mtx);
+ }
+
+ kmem_free(zio->io_bp, sizeof (blkptr_t));
+ zio->io_bp = NULL;
+}
+
+int
+dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
+{
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ zbookmark_phys_t zb;
+ dbuf_dirty_record_t *dr_head;
+
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ zio_prop_t zp;
+ dmu_write_policy(os, DB_DNODE(db), db->db_level,
+ WP_DMU_SYNC | WP_DIRECT_WR, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * Dirty this dbuf with DB_NOFILL since we will not have any data
+ * associated with the dbuf.
+ */
+ dmu_buf_will_clone_or_dio(&db->db, tx);
+
+ mutex_enter(&db->db_mtx);
+
+ uint64_t txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
+ ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
+
+ dr_head = list_head(&db->db_dirty_records);
+ ASSERT3U(dr_head->dr_txg, ==, txg);
+ dr_head->dt.dl.dr_diowrite = B_TRUE;
+ dr_head->dr_accounted = db->db.db_size;
+
+ blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ if (db->db_blkptr != NULL) {
+ /*
+ * Fill in bp with the current block pointer so that
+ * the nopwrite code can check if we're writing the same
+ * data that's already on disk.
+ */
+ *bp = *db->db_blkptr;
+ } else {
+ memset(bp, 0, sizeof (blkptr_t));
+ }
+
+ /*
+ * Disable nopwrite if the current block pointer could change
+ * before this TXG syncs.
+ */
+ if (list_next(&db->db_dirty_records, dr_head) != NULL)
+ zp.zp_nopwrite = B_FALSE;
+
+ ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
+ dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+
+ mutex_exit(&db->db_mtx);
+
+ dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
+
+ dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr_head;
+ dsa->dsa_tx = tx;
+
+ zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
+ db->db.db_size, db->db.db_size, &zp,
+ dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
+
+ if (pio == NULL)
+ return (zio_wait(zio));
+
+ zio_nowait(zio);
+
+ return (0);
+}
+
+int
+dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+ abd_t *data, uint32_t flags, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ spa_t *spa = dn->dn_objset->os_spa;
+ int numbufs, err;
+
+ ASSERT(flags & DMU_DIRECTIO);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset,
+ size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ return (err);
+
+ zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ for (int i = 0; i < numbufs && err == 0; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+
+ abd_t *abd = abd_get_offset_size(data,
+ db->db.db_offset - offset, dn->dn_datablksz);
+
+ zfs_racct_write(spa, db->db.db_size, 1, flags);
+ err = dmu_write_direct(pio, db, abd, tx);
+ ASSERT0(err);
+ }
+
+ err = zio_wait(pio);
+
+ /*
+ * The dbuf must be held until the Direct I/O write has completed in
+ * the event there was any errors and dbuf_undirty() was called.
+ */
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+int
+dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+ abd_t *data, uint32_t flags)
+{
+ objset_t *os = dn->dn_objset;
+ spa_t *spa = os->os_spa;
+ dmu_buf_t **dbp;
+ int numbufs, err;
+
+ ASSERT(flags & DMU_DIRECTIO);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset,
+ size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ return (err);
+
+ zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ for (int i = 0; i < numbufs; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ abd_t *mbuf;
+ zbookmark_phys_t zb;
+ blkptr_t *bp;
+
+ mutex_enter(&db->db_mtx);
+
+ SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ /*
+ * If there is another read for this dbuf, we will wait for
+ * that to complete first before checking the db_state below.
+ */
+ while (db->db_state == DB_READ)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ err = dmu_buf_get_bp_from_dbuf(db, &bp);
+ if (err) {
+ mutex_exit(&db->db_mtx);
+ goto error;
+ }
+
+ /*
+ * There is no need to read if this is a hole or the data is
+ * cached. This will not be considered a direct read for IO
+ * accounting in the same way that an ARC hit is not counted.
+ */
+ if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
+ size_t aoff = offset < db->db.db_offset ?
+ db->db.db_offset - offset : 0;
+ size_t boff = offset > db->db.db_offset ?
+ offset - db->db.db_offset : 0;
+ size_t len = MIN(size - aoff, db->db.db_size - boff);
+
+ if (db->db_state == DB_CACHED) {
+ /*
+ * We need to untransform the ARC buf data
+ * before we copy it over.
+ */
+ err = dmu_buf_untransform_direct(db, spa);
+ ASSERT0(err);
+ abd_copy_from_buf_off(data,
+ (char *)db->db.db_data + boff, aoff, len);
+ } else {
+ abd_zero_off(data, aoff, len);
+ }
+
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+
+ mbuf = make_abd_for_dbuf(db, data, offset, size);
+ ASSERT3P(mbuf, !=, NULL);
+
+ /*
+ * The dbuf mutex (db_mtx) must be held when creating the ZIO
+ * for the read. The BP returned from
+ * dmu_buf_get_bp_from_dbuf() could be from a pending block
+ * clone or a yet to be synced Direct I/O write that is in the
+ * dbuf's dirty record. When zio_read() is called, zio_create()
+ * will make a copy of the BP. However, if zio_read() is called
+ * without the mutex being held then the dirty record from the
+ * dbuf could be freed in dbuf_write_done() resulting in garbage
+ * being set for the zio BP.
+ */
+ zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
+ dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL, &zb);
+ mutex_exit(&db->db_mtx);
+
+ zfs_racct_read(spa, db->db.db_size, 1, flags);
+ zio_nowait(cio);
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (zio_wait(rio));
+
+error:
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ (void) zio_wait(rio);
+ return (err);
+}
+
+#ifdef _KERNEL
+int
+dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+{
+ offset_t offset = zfs_uio_offset(uio);
+ offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
+ int err;
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3U(page_index, <, uio->uio_dio.npages);
+
+ abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
+ offset & (PAGESIZE - 1), size);
+ err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
+ abd_free(data);
+
+ if (err == 0)
+ zfs_uioskip(uio, size);
+
+ return (err);
+}
+
+int
+dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+ offset_t offset = zfs_uio_offset(uio);
+ offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
+ int err;
+
+ ASSERT(uio->uio_extflg & UIO_DIRECT);
+ ASSERT3U(page_index, <, uio->uio_dio.npages);
+
+ abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
+ offset & (PAGESIZE - 1), size);
+ err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+ abd_free(data);
+
+ if (err == 0)
+ zfs_uioskip(uio, size);
+
+ return (err);
+}
+#endif /* _KERNEL */
+
+EXPORT_SYMBOL(dmu_read_uio_direct);
+EXPORT_SYMBOL(dmu_write_uio_direct);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 8f4fefa4f..f030fba22 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -351,6 +351,20 @@ smallblk_changed_cb(void *arg, uint64_t newval)
}
static void
+direct_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||
+ newval == ZFS_DIRECT_ALWAYS);
+
+ os->os_direct = newval;
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_SPECIAL_SMALL_BLOCKS),
smallblk_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DIRECT),
+ direct_changed_cb, os);
+ }
}
if (err != 0) {
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index 17ed2a620..45a2f0626 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = {
{ "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
{ "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
{ "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "arc_read_count", KSTAT_DATA_UINT64 },
+ { "arc_read_bytes", KSTAT_DATA_UINT64 },
+ { "arc_write_count", KSTAT_DATA_UINT64 },
+ { "arc_write_bytes", KSTAT_DATA_UINT64 },
+ { "direct_read_count", KSTAT_DATA_UINT64 },
+ { "direct_read_bytes", KSTAT_DATA_UINT64 },
+ { "direct_write_count", KSTAT_DATA_UINT64 },
+ { "direct_write_bytes", KSTAT_DATA_UINT64 },
};
#define SPA_IOSTATS_ADD(stat, val) \
@@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
}
}
+void
+spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+
+ if (ksp == NULL)
+ return;
+
+ spa_iostats_t *iostats = ksp->ks_data;
+ if (flags & DMU_DIRECTIO) {
+ SPA_IOSTATS_ADD(direct_read_count, iops);
+ SPA_IOSTATS_ADD(direct_read_bytes, size);
+ } else {
+ SPA_IOSTATS_ADD(arc_read_count, iops);
+ SPA_IOSTATS_ADD(arc_read_bytes, size);
+ }
+}
+
+void
+spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+
+ if (ksp == NULL)
+ return;
+
+ spa_iostats_t *iostats = ksp->ks_data;
+ if (flags & DMU_DIRECTIO) {
+ SPA_IOSTATS_ADD(direct_write_count, iops);
+ SPA_IOSTATS_ADD(direct_write_bytes, size);
+ } else {
+ SPA_IOSTATS_ADD(arc_write_count, iops);
+ SPA_IOSTATS_ADD(arc_write_bytes, size);
+ }
+}
+
static int
spa_iostats_update(kstat_t *ksp, int rw)
{
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 6ae0a1412..9305bd894 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -118,6 +118,11 @@ static unsigned int zfs_slow_io_events_per_second = 20;
static unsigned int zfs_deadman_events_per_second = 1;
/*
+ * Rate limit direct write IO verify failures to this many per second.
+ */
+static unsigned int zfs_dio_write_verify_events_per_second = 20;
+
+/*
* Rate limit checksum events after this many checksum errors per second.
*/
static unsigned int zfs_checksum_events_per_second = 20;
@@ -153,6 +158,17 @@ int zfs_nocacheflush = 0;
uint_t zfs_vdev_max_auto_ashift = 14;
uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+/*
+ * VDEV checksum verification for Direct I/O writes. This is necessary for
+ * Linux, because anonymous pages can not be placed under write protection
+ * during Direct I/O writes.
+ */
+#if !defined(__FreeBSD__)
+uint_t zfs_vdev_direct_write_verify = 1;
+#else
+uint_t zfs_vdev_direct_write_verify = 0;
+#endif
+
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{
@@ -673,6 +689,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
1);
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
1);
+ zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
+ &zfs_dio_write_verify_events_per_second, 1);
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);
@@ -1182,6 +1200,7 @@ vdev_free(vdev_t *vd)
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_deadman_rl);
+ zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
if (vd == spa->spa_root_vdev)
@@ -4475,6 +4494,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++)
@@ -6503,7 +6523,14 @@ ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
"Rate limit hung IO (deadman) events to this many per second");
+ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
+ "Rate Direct I/O write verify events to this many per second");
+
/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
+ "Direct I/O writes will perform for checksum verification before "
+ "commiting write");
+
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
"Rate limit checksum events to this many checksum errors per second "
"(do not set below ZED threshold).");
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 47346dd5a..9d12bc2eb 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
/* IO delays */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+ /* Direct I/O write verify errors */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS,
+ vs->vs_dio_verify_errors);
+
/* Add extended stats nvlist to main nvlist */
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index f7cecc9af..25b05abd3 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, vs->vs_checksum_errors,
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
DATA_TYPE_UINT64, vs->vs_slow_ios,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
NULL);
}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 53366ad49..e69b98896 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -160,7 +160,6 @@
#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
-#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 399f5a011..8d0aebbec 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768;
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
- zil_callback_t callback, void *callback_data)
+ boolean_t o_direct, zil_callback_t callback, void *callback_data)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
uint32_t blocksize = zp->z_blksz;
@@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
return;
}
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
write_state = WR_INDIRECT;
else if (!spa_has_slogs(zilog->zl_spa) &&
resid >= zfs_immediate_write_sz)
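For context, a heavily simplified sketch of the decision this hunk modifies (the trailing branches of zfs_log_write() are an assumption based on the usual WR_COPIED/WR_NEED_COPY split, not quoted from this patch): an O_DIRECT write is forced to WR_INDIRECT because its data block has already been written in place, so the ZIL record only needs to point at it rather than copy the data.

    /* Sketch only: mirrors the visible condition above plus an assumed tail. */
    static itx_wr_state_t
    pick_write_state(zilog_t *zilog, boolean_t o_direct, ssize_t resid,
        boolean_t commit)
    {
    	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
    		return (WR_INDIRECT);
    	if (!spa_has_slogs(zilog->zl_spa) &&
    	    resid >= zfs_immediate_write_sz)
    		return (WR_INDIRECT);
    	return (commit ? WR_COPIED : WR_NEED_COPY);	/* assumption */
    }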
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index f3db953ea..f9cc5b010 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -35,7 +35,6 @@
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
-#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
@@ -75,6 +74,14 @@ int zfs_bclone_enabled = 1;
static int zfs_bclone_wait_dirty = 0;
/*
+ * Enable Direct I/O. If this setting is 0, then all I/O requests will be
+ * directed through the ARC acting as though the dataset property direct was
+ * set to disabled.
+ */
+static int zfs_dio_enabled = 1;
+
+/*
* Maximum bytes to read per chunk in zfs_read().
*/
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
@@ -203,6 +210,77 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
}
/*
+ * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
+ * the "direct" dataset property). When inherited by the property only apply
+ * the O_DIRECT flag to correctly aligned IO requests. The rational for this
+ * is it allows the property to be safely set on a dataset without forcing
+ * all of the applications to be aware of the alignment restrictions. When
+ * O_DIRECT is explicitly requested by an application return EINVAL if the
+ * request is unaligned. In all cases, if the range for this request has
+ * been mmap'ed then we will perform buffered I/O to keep the mapped region
+ * synhronized with the ARC.
+ *
+ * It is possible that a file's pages could be mmap'ed after it is checked
+ * here. If so, that is handled coorarding in zfs_write(). See comments in the
+ * following area for how this is handled:
+ * zfs_write() -> update_pages()
+ */
+static int
+zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
+ int *ioflagp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ int ioflag = *ioflagp;
+ int error = 0;
+
+ if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
+ zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
+ /*
+ * Direct I/O is disabled or the region is mmap'ed. In either
+ * case the I/O request will just be directed through the ARC.
+ */
+ ioflag &= ~O_DIRECT;
+ goto out;
+ } else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
+ zfs_uio_page_aligned(uio) &&
+ zfs_uio_aligned(uio, PAGE_SIZE)) {
+ if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
+ (rw == UIO_READ)) {
+ ioflag |= O_DIRECT;
+ }
+ } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
+ /*
+ * Direct I/O was requested through direct=always, but it
+ * is not properly PAGE_SIZE aligned. The request will be
+ * directed through the ARC.
+ */
+ ioflag &= ~O_DIRECT;
+ }
+
+ if (ioflag & O_DIRECT) {
+ if (!zfs_uio_page_aligned(uio) ||
+ !zfs_uio_aligned(uio, PAGE_SIZE)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ error = zfs_uio_get_dio_pages_alloc(uio, rw);
+ if (error) {
+ goto out;
+ }
+ }
+
+ IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
+ ASSERT0(error);
+
+out:
+ *ioflagp = ioflag;
+ return (error);
+}
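To make the alignment rules above concrete, a minimal, hypothetical application-side sketch follows (the file path is an assumption): the buffer, offset, and length are all PAGE_SIZE aligned, so an explicit O_DIRECT read passes zfs_setup_direct(); under direct=standard the same read with an unaligned offset or length would instead fail with EINVAL, and a write would additionally need to be recordsize-aligned to take the direct path.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <unistd.h>

    int
    main(void)
    {
    	void *buf;
    	long pgsz = sysconf(_SC_PAGESIZE);
    	int fd = open("/tank/fs/file", O_RDONLY | O_DIRECT);

    	if (fd < 0 || posix_memalign(&buf, pgsz, pgsz) != 0)
    		return (1);

    	/* Offset (0) and length (one page) are both page aligned. */
    	ssize_t rc = pread(fd, buf, pgsz, 0);

    	free(buf);
    	close(fd);
    	return (rc < 0 ? 1 : 0);
    }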
+
+/*
* Read bytes from specified file into supplied buffer.
*
* IN: zp - inode of file to be read from.
@@ -286,24 +364,58 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = 0;
goto out;
}
-
ASSERT(zfs_uio_offset(uio) < zp->z_size);
+
+ /*
+ * Setting up Direct I/O if requested.
+ */
+ error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
+ if (error) {
+ goto out;
+ }
+
#if defined(__linux__)
ssize_t start_offset = zfs_uio_offset(uio);
#endif
+ ssize_t chunk_size = zfs_vnops_read_chunk_size;
ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
ssize_t start_resid = n;
+ ssize_t dio_remaining_resid = 0;
+
+ if (uio->uio_extflg & UIO_DIRECT) {
+ /*
+ * All pages for an O_DIRECT request have already been mapped
+ * so there's no compelling reason to handle this uio in
+ * smaller chunks.
+ */
+ chunk_size = DMU_MAX_ACCESS;
+
+ /*
+ * In the event that the O_DIRECT request is reading the entire
+ * file, it is possible the file's length is not page-size
+ * aligned. However, lower layers expect that the Direct I/O
+ * request is page-aligned. In this case, as much of the file
+ * that can be read using Direct I/O happens and the remaining
+ * amount will be read through the ARC.
+ *
+ * This is still consistent with the semantics of Direct I/O in
+ * ZFS as at a minimum the I/O request must be page-aligned.
+ */
+ dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
+ if (dio_remaining_resid != 0)
+ n -= dio_remaining_resid;
+ }
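As a worked example of the split above (assuming 4 KiB pages), a single O_DIRECT read of a 1,000,000-byte file breaks down as:

    n                                  = 1000000
    P2ALIGN_TYPED(n, 4096, ssize_t)    =  999424   (244 full pages, read via Direct I/O)
    dio_remaining_resid                =     576   (tail, read through the ARC afterwards)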
while (n > 0) {
- ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
- P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
+ ssize_t nbytes = MIN(n, chunk_size -
+ P2PHASE(zfs_uio_offset(uio), chunk_size));
#ifdef UIO_NOCOPY
if (zfs_uio_segflg(uio) == UIO_NOCOPY)
error = mappedread_sf(zp, nbytes, uio);
else
#endif
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
- zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
+ zfs_uio_offset(uio) + nbytes - 1)) {
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -332,12 +444,40 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
n -= nbytes;
}
+ if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
+ dio_remaining_resid != 0) {
+ /*
+ * Temporarily remove the UIO_DIRECT flag from the UIO so the
+ * remainder of the file can be read using the ARC.
+ */
+ uio->uio_extflg &= ~UIO_DIRECT;
+
+ if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
+ error = mappedread(zp, dio_remaining_resid, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
+ dio_remaining_resid);
+ }
+ uio->uio_extflg |= UIO_DIRECT;
+
+ if (error != 0)
+ n += dio_remaining_resid;
+ } else if (error && (uio->uio_extflg & UIO_DIRECT)) {
+ n += dio_remaining_resid;
+ }
int64_t nread = start_resid - n;
+
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
- task_io_account_read(nread);
out:
zfs_rangelock_exit(lr);
+ /*
+ * Cleanup for Direct I/O if requested.
+ */
+ if (uio->uio_extflg & UIO_DIRECT)
+ zfs_uio_free_dio_pages(uio, UIO_READ);
+
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -422,6 +562,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
int error = 0, error1;
ssize_t start_resid = zfs_uio_resid(uio);
uint64_t clear_setid_bits_txg = 0;
+ boolean_t o_direct_defer = B_FALSE;
/*
* Fasttrack empty write
@@ -475,6 +616,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}
/*
+ * Setting up Direct I/O if requested.
+ */
+ error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
+ if (error) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(error));
+ }
+
+ /*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
*/
@@ -504,6 +654,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
woff = zp->z_size;
}
zfs_uio_setoffset(uio, woff);
+ /*
+ * We need to update the starting offset as well because it is
+ * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
+ * layers.
+ */
+ zfs_uio_setsoffset(uio, woff);
} else {
/*
* Note that if the file block size will change as a result of
@@ -540,6 +696,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
const uint64_t projid = zp->z_projid;
/*
+ * In the event we are increasing the file block size
+ * (lr_length == UINT64_MAX), we will direct the write to the ARC.
+ * Because zfs_grow_blocksize() will read from the ARC in order to
+ * grow the dbuf, we avoid doing Direct I/O here as that would cause
+ * data written to disk to be overwritten by data in the ARC during
+ * the sync phase. Besides writing data twice to disk, we also
+ * want to avoid consistency concerns between data in the ARC and
+ * on disk while growing the file's blocksize.
+ *
+ * We will only temporarily remove Direct I/O and put it back after
+ * we have grown the blocksize. We do this in the event a request
+ * is larger than max_blksz, so further requests to
+ * dmu_write_uio_dbuf() will still issue the requests using Direct
+ * IO.
+ *
+ * As an example:
+ * The first block of the file is being written as a 4k request with
+ * a recordsize of 1K. The first 1K issued in the loop below will go
+ * through the ARC; however, the following 3 1K requests will
+ * use Direct I/O.
+ */
+ if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
+ uio->uio_extflg &= ~UIO_DIRECT;
+ o_direct_defer = B_TRUE;
+ }
+
+ /*
* Write the file in reasonable size chunks. Each chunk is written
* in a separate transaction; this keeps the intent log records small
* and allows us to do more fine-grained space accounting.
@@ -580,6 +763,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
ssize_t nbytes = n;
if (n >= blksz && woff >= zp->z_size &&
P2PHASE(woff, blksz) == 0 &&
+ !(uio->uio_extflg & UIO_DIRECT) &&
(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/*
* This write covers a full block. "Borrow" a buffer
@@ -705,9 +889,30 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_uioskip(uio, nbytes);
tx_bytes = nbytes;
}
+ /*
+ * There is a window where a file's pages can be mmap'ed after
+ * zfs_setup_direct() is called. This is due to the fact that
+ * the rangelock in this function is acquired after calling
+ * zfs_setup_direct(). This is done so that
+ * zfs_uio_prefaultpages() does not attempt to fault in pages
+ * on Linux for Direct I/O requests. This is not necessary as
+ * the pages are pinned in memory and can not be faulted out.
+ * Ideally, the rangelock would be held before calling
+ * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
+ * this can lead to a deadlock as zfs_getpage() also acquires
+ * the rangelock as a RL_WRITER and prefaulting the pages can
+ * lead to zfs_getpage() being called.
+ *
+ * In the case of the pages being mapped after
+ * zfs_setup_direct() is called, the call to update_pages()
+ * will still be made to make sure there is consistency between
+ * the ARC and the Linux page cache. This is an unfortunate
+ * situation as the data will be read back into the ARC after
+ * the Direct I/O write has completed, but this is the penalty
+ * for writing to a mmap'ed region of a file using Direct I/O.
+ */
if (tx_bytes &&
- zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
- !(ioflag & O_DIRECT)) {
+ zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
}
@@ -756,10 +961,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* the TX_WRITE records logged here.
*/
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
- NULL, NULL);
+ uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
+ NULL);
dmu_tx_commit(tx);
+ /*
+ * Direct I/O was deferred in order to grow the first block.
+ * At this point it can be re-enabled for subsequent writes.
+ */
+ if (o_direct_defer) {
+ ASSERT(ioflag & O_DIRECT);
+ uio->uio_extflg |= UIO_DIRECT;
+ o_direct_defer = B_FALSE;
+ }
+
if (error != 0)
break;
ASSERT3S(tx_bytes, ==, nbytes);
@@ -767,10 +983,22 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
pfbytes -= nbytes;
}
+ if (o_direct_defer) {
+ ASSERT(ioflag & O_DIRECT);
+ uio->uio_extflg |= UIO_DIRECT;
+ o_direct_defer = B_FALSE;
+ }
+
zfs_znode_update_vfs(zp);
zfs_rangelock_exit(lr);
/*
+ * Cleanup for Direct I/O if requested.
+ */
+ if (uio->uio_extflg & UIO_DIRECT)
+ zfs_uio_free_dio_pages(uio, UIO_WRITE);
+
+ /*
* If we're in replay mode, or we made no progress, or the
* uio data is inaccessible return an error. Otherwise, it's
* at least a partial write, so it's successful.
@@ -784,9 +1012,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
if (commit)
zil_commit(zilog, zp->z_id);
- const int64_t nwritten = start_resid - zfs_uio_resid(uio);
+ int64_t nwritten = start_resid - zfs_uio_resid(uio);
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
- task_io_account_write(nwritten);
zfs_exit(zfsvfs, FTAG);
return (0);
@@ -846,7 +1073,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
- dmu_buf_t *db;
zgd_t *zgd;
int error = 0;
uint64_t zp_gen;
@@ -890,8 +1116,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+ size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
@@ -929,18 +1155,44 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
zil_fault_io = 0;
}
#endif
+
+ dmu_buf_t *dbp;
if (error == 0)
error = dmu_buf_hold_noread(os, object, offset, zgd,
- &db);
+ &dbp);
if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
+ zgd->zgd_db = dbp;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
+ boolean_t direct_write = B_FALSE;
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr =
+ dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
+ if (dr != NULL && dr->dt.dl.dr_diowrite)
+ direct_write = B_TRUE;
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * All Direct I/O writes will have already completed and
+ * the block pointer can be immediately stored in the
+ * log record.
+ */
+ if (direct_write) {
+ /*
+ * A Direct I/O write always covers an entire
+ * block.
+ */
+ ASSERT3U(dbp->db_size, ==, zp->z_blksz);
+ lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
+ zfs_get_done(zgd, 0);
+ return (0);
+ }
- zgd->zgd_db = db;
+ blkptr_t *bp = &lr->lr_blkptr;
zgd->zgd_bp = bp;
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
+ ASSERT3U(dbp->db_offset, ==, offset);
+ ASSERT3U(dbp->db_size, ==, size);
error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
@@ -975,7 +1227,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
return (error);
}
-
static void
zfs_get_done(zgd_t *zgd, int error)
{
@@ -1559,3 +1810,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
"Wait for dirty blocks when cloning");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
+ "Enable Direct I/O");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 53992931e..66a8a9fef 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(*errorp, ==, EIO);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ }
+
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
+ enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ?
+ ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;
- ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
- zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
- zp->zp_compress >= ZIO_COMPRESS_OFF &&
- zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
- DMU_OT_IS_VALID(zp->zp_type) &&
- zp->zp_level < 32 &&
- zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, pipeline);
zio->io_ready = ready;
zio->io_children_ready = children_ready;
@@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ } else if (type == ZIO_TYPE_WRITE &&
+ pio->io_prop.zp_direct_write == B_TRUE) {
+ /*
+ * By default we will only verify checksums for Direct I/O
+ * writes on Linux. FreeBSD is able to place user pages under
+ * write protection before issuing them to the ZIO pipeline.
+ *
+ * Checksum validation errors will only be reported through
+ * the top-level VDEV, which is set by this child ZIO.
+ */
+ ASSERT3P(bp, !=, NULL);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
}
if (vd->vdev_ops->vdev_op_leaf) {
@@ -3104,6 +3117,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_nopwrite = B_FALSE;
zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_byteorder = gio->io_prop.zp_byteorder;
+ zp.zp_direct_write = B_FALSE;
memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -3577,6 +3591,13 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+ /*
+ * Deduplication will not take place for Direct I/O writes. The
+ * ddt_tree will be emptied in syncing context. Direct I/O writes take
+ * place in open context, so a Direct I/O write can not attempt to
+ * modify the ddt_tree while issuing a write.
+ */
+ ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp);
@@ -4509,6 +4530,19 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_vsd = NULL;
}
+ /*
+ * If a Direct I/O write checksum verify error has occurred then this
+ * I/O should not attempt to be issued again. Instead the EIO will
+ * be returned.
+ */
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ ASSERT3U(zio->io_error, ==, EIO);
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ return (zio);
+ }
+
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
@@ -4822,6 +4856,49 @@ zio_checksum_verify(zio_t *zio)
return (zio);
}
+static zio_t *
+zio_dio_checksum_verify(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ int error;
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+
+ if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0)
+ goto out;
+
+ if ((error = zio_checksum_error(zio, NULL)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_dio_verify_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ zio->io_error = SET_ERROR(EIO);
+ zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+
+ /*
+ * The EIO error must be propagated up to the logical
+ * parent ZIO in zio_notify_parent() so it can be
+ * returned to dmu_write_abd().
+ */
+ zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0);
+ }
+ }
+
+out:
+ return (zio);
+}
+
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
@@ -5152,7 +5229,8 @@ zio_done(zio_t *zio)
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
- !vdev_is_dead(zio->io_vd)) {
+ !vdev_is_dead(zio->io_vd) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
@@ -5167,6 +5245,7 @@ zio_done(zio_t *zio)
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
@@ -5188,7 +5267,8 @@ zio_done(zio_t *zio)
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (IO_IS_ALLOCATING(zio) &&
- !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
@@ -5239,6 +5319,14 @@ zio_done(zio_t *zio)
if (zio->io_reexecute) {
/*
+ * A Direct I/O write that has a checksum verify error should
+ * not attempt to reexecute. Instead, EAGAIN should just be
+ * propagated back up so the write can be retried
+ * through the ARC.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+
+ /*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
@@ -5398,6 +5486,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_vdev_io_done,
zio_vdev_io_assess,
zio_checksum_verify,
+ zio_dio_checksum_verify,
zio_done
};