aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/abd.c94
-rw-r--r--module/zfs/arc.c2
-rw-r--r--module/zfs/dataset_kstats.c6
-rw-r--r--module/zfs/dbuf.c317
-rw-r--r--module/zfs/dmu.c154
-rw-r--r--module/zfs/dmu_direct.c395
-rw-r--r--module/zfs/dmu_objset.c19
-rw-r--r--module/zfs/spa_stats.c46
-rw-r--r--module/zfs/vdev.c27
-rw-r--r--module/zfs/vdev_label.c4
-rw-r--r--module/zfs/zfs_fm.c2
-rw-r--r--module/zfs/zfs_ioctl.c1
-rw-r--r--module/zfs/zfs_log.c4
-rw-r--r--module/zfs/zfs_vnops.c294
-rw-r--r--module/zfs/zio.c113
15 files changed, 1250 insertions, 228 deletions
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index c8c4d2270..529deeecf 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -89,8 +89,8 @@
* functions.
*
* As an additional feature, linear and scatter ABD's can be stitched together
- * by using the gang ABD type (abd_alloc_gang_abd()). This allows for
- * multiple ABDs to be viewed as a singular ABD.
+ * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs
+ * to be viewed as a singular ABD.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
* B_FALSE.
@@ -109,11 +109,15 @@ void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
- ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ if (abd_is_from_pages(abd)) {
+ ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
+ } else {
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ }
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
- ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
+ ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@@ -136,7 +140,7 @@ abd_verify(abd_t *abd)
#endif
}
-static void
+void
abd_init_struct(abd_t *abd)
{
list_link_init(&abd->abd_gang_link);
@@ -238,6 +242,7 @@ abd_free_linear(abd_t *abd)
abd_free_linear_page(abd);
return;
}
+
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
} else {
@@ -520,6 +525,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
*/
abd->abd_flags |= ABD_FLAG_LINEAR;
+ /*
+ * User pages from Direct I/O requests may be in a single page
+ * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
+ * that here for abd. This is required because we have to be
+ * careful when borrowing the buffer from the ABD because we
+ * can not place user pages under write protection on Linux.
+ * See the comments in abd_os.c for abd_borrow_buf(),
+ * abd_borrow_buf_copy(), abd_return_buf() and
+ * abd_return_buf_copy().
+ */
+ if (abd_is_from_pages(sabd)) {
+ abd->abd_flags |= ABD_FLAG_FROM_PAGES |
+ ABD_FLAG_LINEAR_PAGE;
+ }
+
ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
} else if (abd_is_gang(sabd)) {
size_t left = size;
@@ -648,70 +668,6 @@ abd_to_buf(abd_t *abd)
return (ABD_LINEAR_BUF(abd));
}
-/*
- * Borrow a raw buffer from an ABD without copying the contents of the ABD
- * into the buffer. If the ABD is scattered, this will allocate a raw buffer
- * whose contents are undefined. To copy over the existing data in the ABD, use
- * abd_borrow_buf_copy() instead.
- */
-void *
-abd_borrow_buf(abd_t *abd, size_t n)
-{
- void *buf;
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- buf = abd_to_buf(abd);
- } else {
- buf = zio_buf_alloc(n);
- }
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-#endif
- return (buf);
-}
-
-void *
-abd_borrow_buf_copy(abd_t *abd, size_t n)
-{
- void *buf = abd_borrow_buf(abd, n);
- if (!abd_is_linear(abd)) {
- abd_copy_to_buf(buf, abd, n);
- }
- return (buf);
-}
-
-/*
- * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * not change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
- */
-void
-abd_return_buf(abd_t *abd, void *buf, size_t n)
-{
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-#endif
- if (abd_is_linear(abd)) {
- ASSERT3P(buf, ==, abd_to_buf(abd));
- } else {
- ASSERT0(abd_cmp_buf(abd, buf, n));
- zio_buf_free(buf, n);
- }
-}
-
-void
-abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
-{
- if (!abd_is_linear(abd)) {
- abd_copy_from_buf(abd, buf, n);
- }
- abd_return_buf(abd, buf, n);
-}
-
void
abd_release_ownership_of_buf(abd_t *abd)
{
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 714a30e86..b5bcd367b 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5961,7 +5961,7 @@ top:
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
metadata, misses);
- zfs_racct_read(size, 1);
+ zfs_racct_read(spa, size, 1, 0);
}
/* Check if the spa even has l2 configured */
diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c
index 914260e74..27a04c2af 100644
--- a/module/zfs/dataset_kstats.c
+++ b/module/zfs/dataset_kstats.c
@@ -217,8 +217,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
}
void
-dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
- int64_t nwritten)
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten)
{
ASSERT3S(nwritten, >=, 0);
@@ -230,8 +229,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
}
void
-dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
- int64_t nread)
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread)
{
ASSERT3S(nread, >=, 0);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 099883ba2..df9368fc8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -628,7 +628,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
* L2ARC.
*/
boolean_t
-dbuf_is_l2cacheable(dmu_buf_impl_t *db)
+dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
{
if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
(db->db_objset->os_secondary_cache ==
@@ -636,10 +636,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db)
if (l2arc_exclude_special == 0)
return (B_TRUE);
- blkptr_t *bp = db->db_blkptr;
- if (bp == NULL || BP_IS_HOLE(bp))
+ /*
+ * bp must be checked in the event it was passed from
+ * dbuf_read_impl() as the result of the BP being set from
+ * a Direct I/O write in dbuf_read(). See comments in
+ * dbuf_read().
+ */
+ blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;
+
+ if (db_bp == NULL || BP_IS_HOLE(db_bp))
return (B_FALSE);
- uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+ uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
vdev_t *vd = NULL;
@@ -1380,6 +1387,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
mutex_enter(&db->db_mtx);
ASSERT3U(db->db_state, ==, DB_READ);
+
/*
* All reads are synchronous, so we must have a hold on the dbuf
*/
@@ -1570,12 +1578,11 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
*/
static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
- db_lock_type_t dblt, const void *tag)
+ db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
- blkptr_t bp, *bpp = NULL;
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1589,43 +1596,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
goto early_unlock;
}
- /*
- * If we have a pending block clone, we don't want to read the
- * underlying block, but the content of the block being cloned,
- * pointed by the dirty record, so we have the most recent data.
- * If there is no dirty record, then we hit a race in a sync
- * process when the dirty record is already removed, while the
- * dbuf is not yet destroyed. Such case is equivalent to uncached.
- */
- if (db->db_state == DB_NOFILL) {
- dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
- if (dr != NULL) {
- if (!dr->dt.dl.dr_brtwrite) {
- err = EIO;
- goto early_unlock;
- }
- bp = dr->dt.dl.dr_overridden_by;
- bpp = &bp;
- }
- }
-
- if (bpp == NULL && db->db_blkptr != NULL) {
- bp = *db->db_blkptr;
- bpp = &bp;
- }
-
- err = dbuf_read_hole(db, dn, bpp);
+ err = dbuf_read_hole(db, dn, bp);
if (err == 0)
goto early_unlock;
- ASSERT(bpp != NULL);
+ ASSERT(bp != NULL);
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(bpp)) {
+ if (BP_IS_REDACTED(bp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
@@ -1640,9 +1622,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
spa_log_error(db->db_objset->os_spa, &zb,
- BP_GET_LOGICAL_BIRTH(bpp));
+ BP_GET_LOGICAL_BIRTH(bp));
err = SET_ERROR(EIO);
goto early_unlock;
}
@@ -1653,7 +1635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
if (!DBUF_IS_CACHEABLE(db))
aflags |= ARC_FLAG_UNCACHED;
- else if (dbuf_is_l2cacheable(db))
+ else if (dbuf_is_l2cacheable(db, bp))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
@@ -1661,17 +1643,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
- if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
+
/*
- * The zio layer will copy the provided blkptr later, but we have our
- * own copy so that we can release the parent's rwlock. We have to
- * do that so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we need to
+ * do this now so that we can release the parent's rwlock. We have to
+ * do that now so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
+ blkptr_t copy = *bp;
dmu_buf_unlock_parent(db, dblt, tag);
- return (arc_read(zio, db->db_objset->os_spa, bpp,
+ return (arc_read(zio, db->db_objset->os_spa, &copy,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb));
@@ -1844,13 +1828,30 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
- if (pio == NULL && (db->db_state == DB_NOFILL ||
- (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
- spa_t *spa = dn->dn_objset->os_spa;
- pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- need_wait = B_TRUE;
+ blkptr_t *bp;
+
+ /*
+ * If a block clone or Direct I/O write has occurred we will
+ * get the dirty record's overridden BP so we get the most
+ * recent data.
+ */
+ err = dmu_buf_get_bp_from_dbuf(db, &bp);
+
+ if (!err) {
+ if (pio == NULL && (db->db_state == DB_NOFILL ||
+ (bp != NULL && !BP_IS_HOLE(bp)))) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ pio =
+ zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ need_wait = B_TRUE;
+ }
+
+ err =
+ dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
}
- err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
miss = (db->db_state != DB_CACHED);
}
@@ -1918,6 +1919,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
+
/*
* This assert is valid because dmu_sync() expects to be called by
* a zilog's get_data while holding a range lock. This call only
@@ -1936,16 +1938,20 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
- if (dr->dt.dl.dr_brtwrite) {
+ if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
ASSERT0P(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf;
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_brtwrite = B_FALSE;
+ dr->dt.dl.dr_diowrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;
/*
+ * In the event that Direct I/O was used, we do not
+ * need to release the buffer from the ARC.
+ *
* Release the already-written buffer, so we leave it in
* a consistent dirty state. Note that all callers are
* modifying the buffer, so they will immediately do
@@ -2084,6 +2090,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
*/
dmu_buf_will_dirty(&db->db, tx);
+ VERIFY3P(db->db_buf, !=, NULL);
+
/* create the data buffer for the new block */
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
@@ -2532,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
boolean_t brtwrite;
+ boolean_t diowrite;
ASSERT(txg != 0);
@@ -2557,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_dbuf == db);
brtwrite = dr->dt.dl.dr_brtwrite;
+ diowrite = dr->dt.dl.dr_diowrite;
if (brtwrite) {
+ ASSERT3B(diowrite, ==, B_FALSE);
/*
* We are freeing a block that we cloned in the same
* transaction group.
@@ -2598,10 +2609,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
+ if (dr->dt.dl.dr_data != db->db_buf) {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -2610,7 +2622,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
@@ -2670,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
* want to make sure dbuf_read() will read the pending cloned block and
* not the uderlying block that is being replaced. dbuf_undirty() will
- * do dbuf_unoverride(), so we will end up with cloned block content,
- * without overridden BP.
+ * do brt_pending_remove() before removing the dirty record.
*/
(void) dbuf_read(db, NULL, flags);
if (undirty) {
@@ -2701,23 +2712,126 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
return (dr != NULL);
}
+/*
+ * Normally the db_blkptr points to the most recent on-disk content for the
+ * dbuf (and anything newer will be cached in the dbuf). However, a pending
+ * block clone or not yet synced Direct I/O write will have a dirty record BP
+ * pointing to the most recent data.
+ */
+int
+dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ int error = 0;
+
+ if (db->db_level != 0) {
+ *bp = db->db_blkptr;
+ return (0);
+ }
+
+ *bp = db->db_blkptr;
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+ if (dr && db->db_state == DB_NOFILL) {
+ /* Block clone */
+ if (!dr->dt.dl.dr_brtwrite)
+ error = EIO;
+ else
+ *bp = &dr->dt.dl.dr_overridden_by;
+ } else if (dr && db->db_state == DB_UNCACHED) {
+ /* Direct I/O write */
+ if (dr->dt.dl.dr_diowrite)
+ *bp = &dr->dt.dl.dr_overridden_by;
+ }
+
+ return (error);
+}
+
+/*
+ * Direct I/O reads can read directly from the ARC, but the data has
+ * to be untransformed in order to copy it over into user pages.
+ */
+int
+dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
+{
+ int err = 0;
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+
+ ASSERT3S(db->db_state, ==, DB_CACHED);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ /*
+ * Ensure that this block's dnode has been decrypted if
+ * the caller has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, dn, 0);
+
+ /*
+ * If the arc buf is compressed or encrypted and the caller
+ * requested uncompressed data, we need to untransform it
+ * before returning. We also call arc_untransform() on any
+ * unauthenticated blocks, which will verify their MAC if
+ * the key is now available.
+ */
+ if (err == 0 && db->db_buf != NULL &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_is_unauthenticated(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
+ dbuf_set_data(db, db->db_buf);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_hits);
+
+ return (err);
+}
+
void
-dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
+ /*
+ * Block clones and Direct I/O writes always happen in open-context.
+ */
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT0(db->db_level);
+ ASSERT(!dmu_tx_is_syncing(tx));
+ ASSERT0(db->db_level);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
- /*
- * Block cloning: We are going to clone into this block, so undirty
- * modifications done to this block so far in this txg. This includes
- * writes and clones into this block.
- */
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
- VERIFY(!dbuf_undirty(db, tx));
+
+ /*
+ * We are going to clone or issue a Direct I/O write on this block, so
+ * undirty modifications done to this block so far in this txg. This
+ * includes writes and clones into this block.
+ *
+ * If there is a dirty record associated with this txg from a previous
+ * Direct I/O write then space accounting cleanup takes place. It is
+ * important to go ahead and free up the space accounting through
+ * dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting
+ * for determining if a write can occur in zfs_write() happens through
+ * dmu_tx_assign(). This can cause an issue with Direct I/O writes in
+ * the case of overwriting the same block, because all DVA allocations
+ * are being done in open-context. Constantly allowing Direct I/O
+ * overwrites to the same block can exhaust the pool's available space
+ * leading to ENOSPC errors at the DVA allocation part of the ZIO
+ * pipeline, which will eventually suspend the pool. By cleaning up
+ * space accounting now, the ENOSPC error can be avoided.
+ *
+ * Since we are undirtying the record in open-context, we must have a
+ * hold on the db, so it should never be evicted after calling
+ * dbuf_undirty().
+ */
+ VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+
if (db->db_buf != NULL) {
/*
* If there is an associated ARC buffer with this dbuf we can
@@ -2728,6 +2842,11 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
arc_buf_destroy(db->db_buf, db);
+ /*
+ * Setting the dbuf's data pointers to NULL will force all
+ * future reads down to the devices to get the most up to date
+ * version of the data after a Direct I/O write has completed.
+ */
db->db_buf = NULL;
dbuf_clear_data(db);
}
@@ -2736,7 +2855,8 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
ASSERT3P(db->db.db_data, ==, NULL);
db->db_state = DB_NOFILL;
- DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+ DTRACE_SET_STATE(db,
+ "allocating NOFILL buffer for clone or direct I/O write");
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
@@ -2773,21 +2893,28 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_tx_private_ok(tx));
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_NOFILL) {
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ if (db->db_state == DB_NOFILL ||
+ (db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
/*
- * Block cloning: We will be completely overwriting a block
- * cloned in this transaction group, so let's undirty the
- * pending clone and mark the block as uncached. This will be
- * as if the clone was never done. But if the fill can fail
- * we should have a way to return back to the cloned data.
+ * If the fill can fail we should have a way to return back to
+ * the cloned or Direct I/O write data.
*/
- if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ if (canfail && dr) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
return;
}
- VERIFY(!dbuf_undirty(db, tx));
- db->db_state = DB_UNCACHED;
+ /*
+ * Block cloning: We will be completely overwriting a block
+ * cloned in this transaction group, so let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done.
+ */
+ if (dr && dr->dt.dl.dr_brtwrite) {
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
+ }
}
mutex_exit(&db->db_mtx);
@@ -4080,7 +4207,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
} else {
mutex_exit(&db->db_mtx);
}
-
}
#pragma weak dmu_buf_refcount = dbuf_refcount
@@ -4540,24 +4666,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
/*
- * To be synced, we must be dirtied. But we
- * might have been freed after the dirty.
+ * To be synced, we must be dirtied. But we might have been freed
+ * after the dirty.
*/
if (db->db_state == DB_UNCACHED) {
/* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else if (db->db_state == DB_READ) {
/*
- * This buffer has a clone we need to write, and an in-flight
- * read on the BP we're about to clone. Its safe to issue the
- * write here because the read has already been issued and the
- * contents won't change.
+ * This buffer was either cloned or had a Direct I/O write
+ * occur and has an in-flight read on the BP. It is safe to
+ * issue the write here, because the read has already been
+ * issued and the contents won't change.
+ *
+ * We can verify the case of both the clone and Direct I/O
+ * write by making sure the first dirty record for the dbuf
+ * has no ARC buffer associated with it.
*/
- ASSERT(dr->dt.dl.dr_brtwrite &&
- dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
+ dbuf_dirty_record_t *dr_head =
+ list_head(&db->db_dirty_records);
+ ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT3P(db->db.db_data, ==, NULL);
+ ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
+ ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
@@ -4608,8 +4742,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immediate write,
- * wait for the synchronous IO to complete.
+ * If this buffer is in the middle of an immediate write, wait for the
+ * synchronous IO to complete.
+ *
+ * This is also valid even with Direct I/O writes setting a dirty
+ * record's override state to DR_IN_DMU_SYNC, because all
+ * Direct I/O writes happen in open-context.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -4913,8 +5051,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_level == 0) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ /* no dr_data if this is a NO_FILL or Direct I/O */
if (dr->dt.dl.dr_data != NULL &&
dr->dt.dl.dr_data != db->db_buf) {
+ ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
+ ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
@@ -5180,7 +5322,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* The BP for this block has been provided by open context
- * (by dmu_sync() or dmu_buf_write_embedded()).
+ * (by dmu_sync(), dmu_write_direct(),
+ * or dmu_buf_write_embedded()).
*/
abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
@@ -5219,7 +5362,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
- dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+ dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
children_ready_cb, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
@@ -5239,7 +5382,7 @@ EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
-EXPORT_SYMBOL(dmu_buf_will_clone);
+EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b3eda8ea5..3f87cfe6b 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -609,8 +609,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp[i] = &db->db;
}
- if (!read)
- zfs_racct_write(length, nblks);
+ /*
+ * If we are doing O_DIRECT we still hold the dbufs, even for reads,
+ * but we do not issue any reads here. We do not want to account for
+ * writes in this case.
+ *
+ * O_DIRECT write/read accounting takes place in
+ * dmu_{write/read}_abd().
+ */
+ if (!read && ((flags & DMU_DIRECTIO) == 0))
+ zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
if (zs)
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
@@ -897,7 +905,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
/*
* Get the next "chunk" of file data to free. We traverse the file from
- * the end so that the file gets shorter over time (if we crashes in the
+ * the end so that the file gets shorter over time (if we crash in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
*
@@ -1168,7 +1176,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
/*
* Deal with odd block sizes, where there can't be data past the first
- * block. If we ever do the tail block optimization, we will need to
+ * block. If we ever do the tail block optimization, we will need to
* handle that here as well.
*/
if (dn->dn_maxblkid == 0) {
@@ -1178,6 +1186,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
size = newsz;
}
+ if (size == 0)
+ return (0);
+
+ /* Allow Direct I/O when requested and properly aligned */
+ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
+ zfs_dio_aligned(offset, size, PAGESIZE)) {
+ abd_t *data = abd_get_from_buf(buf, size);
+ err = dmu_read_abd(dn, offset, size, data, flags);
+ abd_free(data);
+ return (err);
+ }
+
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
int i;
@@ -1286,22 +1306,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
}
/*
- * Note: Lustre is an external consumer of this interface.
+ * This interface is not used internally by ZFS but is provided for
+ * use by Lustre which is built on the DMU interfaces.
*/
-void
-dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
+int
+dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx, uint32_t flags)
{
dmu_buf_t **dbp;
int numbufs;
+ int error;
if (size == 0)
- return;
+ return (0);
+
+ /* Allow Direct I/O when requested and properly aligned */
+ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
+ zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
+ abd_t *data = abd_get_from_buf((void *)buf, size);
+ error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
+ abd_free(data);
+ return (error);
+ }
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
+
+int
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
}
void
@@ -1365,6 +1404,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
dmu_buf_t **dbp;
int numbufs, i, err;
+ if (uio->uio_extflg & UIO_DIRECT)
+ return (dmu_read_uio_direct(dn, uio, size));
+
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
@@ -1453,23 +1495,53 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
dmu_buf_t **dbp;
int numbufs;
int err = 0;
- int i;
+ uint64_t write_size;
- err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+top:
+ write_size = size;
+
+ /*
+ * We only allow Direct I/O writes to happen if we are block-size
+ * aligned. Otherwise, we pass the write off to the ARC.
+ */
+ if ((uio->uio_extflg & UIO_DIRECT) &&
+ (write_size >= dn->dn_datablksz)) {
+ if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
+ dn->dn_datablksz)) {
+ return (dmu_write_uio_direct(dn, uio, size, tx));
+ } else if (write_size > dn->dn_datablksz &&
+ zfs_dio_offset_aligned(zfs_uio_offset(uio),
+ dn->dn_datablksz)) {
+ write_size =
+ dn->dn_datablksz * (write_size / dn->dn_datablksz);
+ err = dmu_write_uio_direct(dn, uio, write_size, tx);
+ if (err == 0) {
+ size -= write_size;
+ goto top;
+ } else {
+ return (err);
+ }
+ } else {
+ write_size =
+ P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
+ }
+ }
+
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);
- for (i = 0; i < numbufs; i++) {
+ for (int i = 0; i < numbufs; i++) {
uint64_t tocpy;
int64_t bufoff;
dmu_buf_t *db = dbp[i];
- ASSERT(size > 0);
+ ASSERT(write_size > 0);
offset_t off = zfs_uio_offset(uio);
bufoff = off - db->db_offset;
- tocpy = MIN(db->db_size - bufoff, size);
+ tocpy = MIN(db->db_size - bufoff, write_size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
@@ -1489,10 +1561,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
if (err)
break;
+ write_size -= tocpy;
size -= tocpy;
}
+ IMPLY(err == 0, write_size == 0);
+
dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
+ goto top;
+ }
+
return (err);
}
@@ -1731,7 +1811,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
* same size as the dbuf.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
- zfs_racct_write(blksz, 1);
+ zfs_racct_write(os->os_spa, blksz, 1, 0);
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
@@ -1761,23 +1841,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
return (err);
}
-typedef struct {
- dbuf_dirty_record_t *dsa_dr;
- dmu_sync_cb_t *dsa_done;
- zgd_t *dsa_zgd;
- dmu_tx_t *dsa_tx;
-} dmu_sync_arg_t;
-
-static void
+void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
dmu_sync_arg_t *dsa = varg;
- dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
- blkptr_t *bp = zio->io_bp;
if (zio->io_error == 0) {
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ blkptr_t *bp = zio->io_bp;
+
if (BP_IS_HOLE(bp)) {
+ dmu_buf_t *db = NULL;
+ if (dr)
+ db = &(dr->dr_dbuf->db);
+ else
+ db = dsa->dsa_zgd->zgd_db;
/*
* A block of zeros may compress to a hole, but the
* block size still needs to be known for replay.
@@ -1796,7 +1875,7 @@ dmu_sync_late_arrival_ready(zio_t *zio)
dmu_sync_ready(zio, NULL, zio->io_private);
}
-static void
+void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
(void) buf;
@@ -1809,7 +1888,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
*/
- if (zio->io_error == 0) {
+ if (zgd && zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
@@ -1848,10 +1927,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
}
+
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
- dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+ if (dsa->dsa_done)
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
kmem_free(dsa, sizeof (*dsa));
}
@@ -2120,9 +2201,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dsa->dsa_tx = NULL;
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
- dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
- &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+ dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
+ dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
+ dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
+ &zb));
return (0);
}
@@ -2385,6 +2467,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_nopwrite = nopwrite;
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -2594,7 +2677,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
ASSERT(db->db_blkid != DMU_SPILL_BLKID);
ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
- dmu_buf_will_clone(dbuf, tx);
+ dmu_buf_will_clone_or_dio(dbuf, tx);
mutex_enter(&db->db_mtx);
@@ -2817,8 +2900,15 @@ EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_long_object);
EXPORT_SYMBOL(dmu_read);
EXPORT_SYMBOL(dmu_read_by_dnode);
+EXPORT_SYMBOL(dmu_read_uio);
+EXPORT_SYMBOL(dmu_read_uio_dbuf);
+EXPORT_SYMBOL(dmu_read_uio_dnode);
EXPORT_SYMBOL(dmu_write);
EXPORT_SYMBOL(dmu_write_by_dnode);
+EXPORT_SYMBOL(dmu_write_by_dnode_flags);
+EXPORT_SYMBOL(dmu_write_uio);
+EXPORT_SYMBOL(dmu_write_uio_dbuf);
+EXPORT_SYMBOL(dmu_write_uio_dnode);
EXPORT_SYMBOL(dmu_prealloc);
EXPORT_SYMBOL(dmu_object_info);
EXPORT_SYMBOL(dmu_object_info_from_dnode);
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
new file mode 100644
index 000000000..91a7fd8df
--- /dev/null
+++ b/module/zfs/dmu_direct.c
@@ -0,0 +1,395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_racct.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dmu_objset.h>
+
+/*
+ * Build the ABD handed to zio_read() for dbuf `db` when the caller's range
+ * [offset, offset + size) may cover only part of that dbuf.
+ *
+ * The portion of `data` that overlaps the dbuf is referenced with
+ * abd_get_offset_size(). Any part of the dbuf that lies before or after the
+ * caller's range is backed by a padding ABD from abd_alloc_for_io(), and the
+ * pieces are stitched together, in block order, into a gang ABD. When no
+ * padding is needed the offset ABD is returned on its own.
+ *
+ * The caller must hold db->db_mtx.
+ */
+static abd_t *
+make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
+    uint64_t size)
+{
+	abd_t *head_pad = NULL, *tail_pad = NULL;
+	size_t span = db->db.db_size;
+	size_t data_off = 0;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (offset <= db->db.db_offset) {
+		/*
+		 * The request starts at or before this dbuf: skip the part of
+		 * `data` that belongs to earlier dbufs.
+		 */
+		data_off = db->db.db_offset - offset;
+		size -= data_off;
+	} else {
+		/* The request starts inside this dbuf: pad the front. */
+		size_t head_len = offset - db->db.db_offset;
+		head_pad = abd_alloc_for_io(head_len, B_TRUE);
+		span -= head_len;
+	}
+
+	if (size < span) {
+		/* The request ends inside this dbuf: pad the tail. */
+		size_t tail_len = span - size;
+		tail_pad = abd_alloc_for_io(tail_len, B_TRUE);
+		span -= tail_len;
+	}
+
+	ASSERT3U(span, >, 0);
+	abd_t *payload = abd_get_offset_size(data, data_off, span);
+
+	if (head_pad == NULL && tail_pad == NULL)
+		return (payload);
+
+	abd_t *gang = abd_alloc_gang();
+	if (head_pad != NULL)
+		abd_gang_add(gang, head_pad, B_TRUE);
+	abd_gang_add(gang, payload, B_TRUE);
+	if (tail_pad != NULL)
+		abd_gang_add(gang, tail_pad, B_TRUE);
+
+	return (gang);
+}
+
+/*
+ * Done callback passed to zio_read() by dmu_read_abd(). Frees the ABD that
+ * was attached to the read zio (the one built by make_abd_for_dbuf()).
+ */
+static void
+dmu_read_abd_done(zio_t *zio)
+{
+ abd_free(zio->io_abd);
+}
+
+/*
+ * Ready callback passed to zio_write() by dmu_write_direct(). Forwards to
+ * dmu_sync_ready() with no ARC buf; zio->io_private is the dmu_sync_arg_t
+ * allocated in dmu_write_direct().
+ */
+static void
+dmu_write_direct_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+/*
+ * Done callback passed to zio_write() by dmu_write_direct().
+ *
+ * Frees the ABD attached to the zio, marks the dbuf DB_UNCACHED (a Direct I/O
+ * write leaves no data associated with the dbuf), and hands the zio to
+ * dmu_sync_done() to settle the dirty record's override state. On I/O error
+ * the dirty record is undone with dbuf_undirty(). Finally the block pointer
+ * allocated in dmu_write_direct() is freed.
+ */
+static void
+dmu_write_direct_done(zio_t *zio)
+{
+	dmu_sync_arg_t *dsa = zio->io_private;
+	dbuf_dirty_record_t *dr = dsa->dsa_dr;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	/*
+	 * dmu_sync_done() frees dsa before returning, so everything needed
+	 * from it afterwards must be captured here. Reading dsa->dsa_tx on
+	 * the error path below would otherwise be a use-after-free.
+	 */
+	dmu_tx_t *tx = dsa->dsa_tx;
+
+	abd_free(zio->io_abd);
+
+	mutex_enter(&db->db_mtx);
+	ASSERT3P(db->db_buf, ==, NULL);
+	ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
+	ASSERT3P(db->db.db_data, ==, NULL);
+	db->db_state = DB_UNCACHED;
+	mutex_exit(&db->db_mtx);
+
+	/* dsa is freed by this call and must not be touched after it. */
+	dmu_sync_done(zio, NULL, zio->io_private);
+
+	if (zio->io_error != 0) {
+		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+			ASSERT3U(zio->io_error, ==, EIO);
+
+		/*
+		 * In the event of an I/O error this block has been freed in
+		 * zio_done() through zio_dva_unallocate(). Calling
+		 * dmu_sync_done() above set dr_override_state to
+		 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
+		 * dbuf_unoverride(), it will skip doing zio_free() to free
+		 * this block as that was already taken care of.
+		 *
+		 * Since we are undirtying the record in open-context, we must
+		 * have a hold on the db, so it should never be evicted after
+		 * calling dbuf_undirty().
+		 */
+		mutex_enter(&db->db_mtx);
+		VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
+		mutex_exit(&db->db_mtx);
+	}
+
+	kmem_free(zio->io_bp, sizeof (blkptr_t));
+	zio->io_bp = NULL;
+}
+
+/*
+ * Issue a Direct I/O write of `data` for the single dbuf `db` in the txg of
+ * `tx`.
+ *
+ * The dbuf is dirtied via dmu_buf_will_clone_or_dio(), its head dirty record
+ * is flagged as a Direct I/O write and moved to DR_IN_DMU_SYNC, and a
+ * zio_write() is created with dmu_write_direct_ready() and
+ * dmu_write_direct_done() as callbacks. The block pointer allocated here is
+ * freed in dmu_write_direct_done().
+ *
+ * If `pio` is NULL the zio is waited on and its result returned; otherwise
+ * the zio is issued as a child of `pio` with zio_nowait() and 0 is returned.
+ */
+int
+dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
+{
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ zbookmark_phys_t zb;
+ dbuf_dirty_record_t *dr_head;
+
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ zio_prop_t zp;
+ dmu_write_policy(os, DB_DNODE(db), db->db_level,
+ WP_DMU_SYNC | WP_DIRECT_WR, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * Dirty this dbuf with DB_NOFILL since we will not have any data
+ * associated with the dbuf.
+ */
+ dmu_buf_will_clone_or_dio(&db->db, tx);
+
+ mutex_enter(&db->db_mtx);
+
+ uint64_t txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
+ ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
+
+ /* The dirty record just created for this txg is at the list head. */
+ dr_head = list_head(&db->db_dirty_records);
+ ASSERT3U(dr_head->dr_txg, ==, txg);
+ dr_head->dt.dl.dr_diowrite = B_TRUE;
+ dr_head->dr_accounted = db->db.db_size;
+
+ blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ if (db->db_blkptr != NULL) {
+ /*
+ * Fill in bp with the current block pointer so that
+ * the nopwrite code can check if we're writing the same
+ * data that's already on disk.
+ */
+ *bp = *db->db_blkptr;
+ } else {
+ memset(bp, 0, sizeof (blkptr_t));
+ }
+
+ /*
+ * Disable nopwrite if the current block pointer could change
+ * before this TXG syncs.
+ */
+ if (list_next(&db->db_dirty_records, dr_head) != NULL)
+ zp.zp_nopwrite = B_FALSE;
+
+ ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
+ dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+
+ mutex_exit(&db->db_mtx);
+
+ dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
+
+ /*
+ * Zero-allocated, so dsa_done and dsa_zgd stay NULL; only the dirty
+ * record and tx are passed to the callbacks.
+ */
+ dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr_head;
+ dsa->dsa_tx = tx;
+
+ zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
+ db->db.db_size, db->db.db_size, &zp,
+ dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
+
+ if (pio == NULL)
+ return (zio_wait(zio));
+
+ zio_nowait(zio);
+
+ return (0);
+}
+
+/*
+ * Write `size` bytes from `data` to `dn` at `offset` using Direct I/O.
+ *
+ * Holds every dbuf covering the range, issues one dmu_write_direct() per
+ * dbuf as a child of a root zio, waits for the root zio, then releases the
+ * holds. `flags` must include DMU_DIRECTIO.
+ *
+ * Returns the error from dmu_buf_hold_array_by_dnode() if the hold fails,
+ * otherwise the result of zio_wait() on the root zio.
+ */
+int
+dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+ abd_t *data, uint32_t flags, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ spa_t *spa = dn->dn_objset->os_spa;
+ int numbufs, err;
+
+ ASSERT(flags & DMU_DIRECTIO);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset,
+ size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ return (err);
+
+ zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ for (int i = 0; i < numbufs && err == 0; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+
+ /* Slice of the caller's ABD that corresponds to this dbuf. */
+ abd_t *abd = abd_get_offset_size(data,
+ db->db.db_offset - offset, dn->dn_datablksz);
+
+ zfs_racct_write(spa, db->db.db_size, 1, flags);
+ /*
+ * With a non-NULL pio, dmu_write_direct() issues the zio with
+ * zio_nowait() and returns 0; write errors surface through
+ * zio_wait(pio) below.
+ */
+ err = dmu_write_direct(pio, db, abd, tx);
+ ASSERT0(err);
+ }
+
+ err = zio_wait(pio);
+
+ /*
+ * The dbuf must be held until the Direct I/O write has completed in
+ * the event there was any errors and dbuf_undirty() was called.
+ */
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+/*
+ * Read `size` bytes from `dn` at `offset` into `data` using Direct I/O.
+ *
+ * Holds every dbuf covering the range and, per dbuf, either:
+ * - zero-fills the matching part of `data` when the block pointer is NULL
+ * or a hole,
+ * - copies from the dbuf when it is already DB_CACHED, or
+ * - issues a zio_read() as a child of a root zio, using an ABD built by
+ * make_abd_for_dbuf().
+ * `flags` must include DMU_DIRECTIO.
+ *
+ * Returns the error from dmu_buf_hold_array_by_dnode() or
+ * dmu_buf_get_bp_from_dbuf() if either fails, otherwise the result of
+ * zio_wait() on the root zio.
+ */
+int
+dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
+ abd_t *data, uint32_t flags)
+{
+ objset_t *os = dn->dn_objset;
+ spa_t *spa = os->os_spa;
+ dmu_buf_t **dbp;
+ int numbufs, err;
+
+ ASSERT(flags & DMU_DIRECTIO);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset,
+ size, B_FALSE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ return (err);
+
+ zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ for (int i = 0; i < numbufs; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ abd_t *mbuf;
+ zbookmark_phys_t zb;
+ blkptr_t *bp;
+
+ mutex_enter(&db->db_mtx);
+
+ SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ /*
+ * If there is another read for this dbuf, we will wait for
+ * that to complete first before checking the db_state below.
+ */
+ while (db->db_state == DB_READ)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ err = dmu_buf_get_bp_from_dbuf(db, &bp);
+ if (err) {
+ mutex_exit(&db->db_mtx);
+ goto error;
+ }
+
+ /*
+ * There is no need to read if this is a hole or the data is
+ * cached. This will not be considered a direct read for IO
+ * accounting in the same way that an ARC hit is not counted.
+ */
+ if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
+ /*
+ * aoff: offset into the caller's ABD for this dbuf.
+ * boff: offset into the dbuf where the request begins.
+ * len: bytes of this dbuf that fall inside the request.
+ */
+ size_t aoff = offset < db->db.db_offset ?
+ db->db.db_offset - offset : 0;
+ size_t boff = offset > db->db.db_offset ?
+ offset - db->db.db_offset : 0;
+ size_t len = MIN(size - aoff, db->db.db_size - boff);
+
+ if (db->db_state == DB_CACHED) {
+ /*
+ * We need to untransform the ARC buf data
+ * before we copy it over.
+ */
+ err = dmu_buf_untransform_direct(db, spa);
+ ASSERT0(err);
+ abd_copy_from_buf_off(data,
+ (char *)db->db.db_data + boff, aoff, len);
+ } else {
+ abd_zero_off(data, aoff, len);
+ }
+
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+
+ mbuf = make_abd_for_dbuf(db, data, offset, size);
+ ASSERT3P(mbuf, !=, NULL);
+
+ /*
+ * The dbuf mutex (db_mtx) must be held when creating the ZIO
+ * for the read. The BP returned from
+ * dmu_buf_get_bp_from_dbuf() could be from a pending block
+ * clone or a yet to be synced Direct I/O write that is in the
+ * dbuf's dirty record. When zio_read() is called, zio_create()
+ * will make a copy of the BP. However, if zio_read() is called
+ * without the mutex being held then the dirty record from the
+ * dbuf could be freed in dbuf_write_done() resulting in garbage
+ * being set for the zio BP.
+ */
+ zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
+ dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL, &zb);
+ mutex_exit(&db->db_mtx);
+
+ zfs_racct_read(spa, db->db.db_size, 1, flags);
+ zio_nowait(cio);
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (zio_wait(rio));
+
+error:
+ /*
+ * Child reads already issued under rio are still waited for; their
+ * result is discarded in favor of the error that aborted the loop.
+ */
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ (void) zio_wait(rio);
+ return (err);
+}
+
+#ifdef _KERNEL
+/*
+ * Direct I/O read of `size` bytes at the uio's current offset.
+ *
+ * Wraps the uio's Direct I/O pages, starting at the page that corresponds to
+ * the current offset, in an ABD and reads into it with dmu_read_abd(). The
+ * uio is advanced by `size` only when the read succeeds.
+ *
+ * Returns 0 on success or the error from dmu_read_abd().
+ */
+int
+dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+{
+	offset_t pos = zfs_uio_offset(uio);
+	/* Index of the first page, relative to where the uio started. */
+	offset_t first_page = (pos - zfs_uio_soffset(uio)) >> PAGESHIFT;
+
+	ASSERT(uio->uio_extflg & UIO_DIRECT);
+	ASSERT3U(first_page, <, uio->uio_dio.npages);
+
+	abd_t *pages_abd = abd_alloc_from_pages(
+	    &uio->uio_dio.pages[first_page], pos & (PAGESIZE - 1), size);
+	int rc = dmu_read_abd(dn, pos, size, pages_abd, DMU_DIRECTIO);
+	abd_free(pages_abd);
+
+	if (rc != 0)
+		return (rc);
+
+	zfs_uioskip(uio, size);
+	return (0);
+}
+
+/*
+ * Direct I/O write of `size` bytes at the uio's current offset, in `tx`.
+ *
+ * Wraps the uio's Direct I/O pages, starting at the page that corresponds to
+ * the current offset, in an ABD and writes it with dmu_write_abd(). The uio
+ * is advanced by `size` only when the write succeeds.
+ *
+ * Returns 0 on success or the error from dmu_write_abd().
+ */
+int
+dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+	offset_t pos = zfs_uio_offset(uio);
+	/* Index of the first page, relative to where the uio started. */
+	offset_t first_page = (pos - zfs_uio_soffset(uio)) >> PAGESHIFT;
+
+	ASSERT(uio->uio_extflg & UIO_DIRECT);
+	ASSERT3U(first_page, <, uio->uio_dio.npages);
+
+	abd_t *pages_abd = abd_alloc_from_pages(
+	    &uio->uio_dio.pages[first_page], pos & (PAGESIZE - 1), size);
+	int rc = dmu_write_abd(dn, pos, size, pages_abd, DMU_DIRECTIO, tx);
+	abd_free(pages_abd);
+
+	if (rc != 0)
+		return (rc);
+
+	zfs_uioskip(uio, size);
+	return (0);
+}
+#endif /* _KERNEL */
+
+EXPORT_SYMBOL(dmu_read_uio_direct);
+EXPORT_SYMBOL(dmu_write_uio_direct);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 8f4fefa4f..f030fba22 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -351,6 +351,20 @@ smallblk_changed_cb(void *arg, uint64_t newval)
}
+/*
+ * Property-change callback for the "direct" dataset property (registered
+ * with dsl_prop_register() for ZFS_PROP_DIRECT). Stores the new value in the
+ * objset's os_direct field.
+ */
static void
+direct_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||
+ newval == ZFS_DIRECT_ALWAYS);
+
+ os->os_direct = newval;
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_SPECIAL_SMALL_BLOCKS),
smallblk_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DIRECT),
+ direct_changed_cb, os);
+ }
}
if (err != 0) {
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index 17ed2a620..45a2f0626 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = {
{ "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
{ "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
{ "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "arc_read_count", KSTAT_DATA_UINT64 },
+ { "arc_read_bytes", KSTAT_DATA_UINT64 },
+ { "arc_write_count", KSTAT_DATA_UINT64 },
+ { "arc_write_bytes", KSTAT_DATA_UINT64 },
+ { "direct_read_count", KSTAT_DATA_UINT64 },
+ { "direct_read_bytes", KSTAT_DATA_UINT64 },
+ { "direct_write_count", KSTAT_DATA_UINT64 },
+ { "direct_write_bytes", KSTAT_DATA_UINT64 },
};
#define SPA_IOSTATS_ADD(stat, val) \
@@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
}
}
+/*
+ * Account a read of `size` bytes and `iops` operations in the pool's iostats
+ * kstat. Reads flagged DMU_DIRECTIO are added to the direct_read_* counters;
+ * all others go to the arc_read_* counters. Does nothing if the kstat has
+ * not been created.
+ */
+void
+spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+
+ if (ksp == NULL)
+ return;
+
+ /*
+ * NOTE(review): SPA_IOSTATS_ADD presumably refers to this local by
+ * name, since it is otherwise unused here - confirm against the macro.
+ */
+ spa_iostats_t *iostats = ksp->ks_data;
+ if (flags & DMU_DIRECTIO) {
+ SPA_IOSTATS_ADD(direct_read_count, iops);
+ SPA_IOSTATS_ADD(direct_read_bytes, size);
+ } else {
+ SPA_IOSTATS_ADD(arc_read_count, iops);
+ SPA_IOSTATS_ADD(arc_read_bytes, size);
+ }
+}
+
+/*
+ * Account a write of `size` bytes and `iops` operations in the pool's
+ * iostats kstat. Writes flagged DMU_DIRECTIO are added to the direct_write_*
+ * counters; all others go to the arc_write_* counters. Does nothing if the
+ * kstat has not been created.
+ */
+void
+spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+
+ if (ksp == NULL)
+ return;
+
+ /*
+ * NOTE(review): SPA_IOSTATS_ADD presumably refers to this local by
+ * name, since it is otherwise unused here - confirm against the macro.
+ */
+ spa_iostats_t *iostats = ksp->ks_data;
+ if (flags & DMU_DIRECTIO) {
+ SPA_IOSTATS_ADD(direct_write_count, iops);
+ SPA_IOSTATS_ADD(direct_write_bytes, size);
+ } else {
+ SPA_IOSTATS_ADD(arc_write_count, iops);
+ SPA_IOSTATS_ADD(arc_write_bytes, size);
+ }
+}
+
static int
spa_iostats_update(kstat_t *ksp, int rw)
{
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 6ae0a1412..9305bd894 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -118,6 +118,11 @@ static unsigned int zfs_slow_io_events_per_second = 20;
static unsigned int zfs_deadman_events_per_second = 1;
/*
+ * Rate limit direct write IO verify failures to this many per second.
+ */
+static unsigned int zfs_dio_write_verify_events_per_second = 20;
+
+/*
* Rate limit checksum events after this many checksum errors per second.
*/
static unsigned int zfs_checksum_events_per_second = 20;
@@ -153,6 +158,17 @@ int zfs_nocacheflush = 0;
uint_t zfs_vdev_max_auto_ashift = 14;
uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+/*
+ * VDEV checksum verification for Direct I/O writes. This is necessary for
+ * Linux, because anonymous pages can not be placed under write protection
+ * during Direct I/O writes.
+ */
+#if !defined(__FreeBSD__)
+uint_t zfs_vdev_direct_write_verify = 1;
+#else
+uint_t zfs_vdev_direct_write_verify = 0;
+#endif
+
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{
@@ -673,6 +689,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
1);
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
1);
+ zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
+ &zfs_dio_write_verify_events_per_second, 1);
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);
@@ -1182,6 +1200,7 @@ vdev_free(vdev_t *vd)
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_deadman_rl);
+ zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
if (vd == spa->spa_root_vdev)
@@ -4475,6 +4494,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++)
@@ -6503,7 +6523,14 @@ ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
"Rate limit hung IO (deadman) events to this many per second");
+ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
+ "Rate Direct I/O write verify events to this many per second");
+
/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
+ "Direct I/O writes will perform for checksum verification before "
+ "commiting write");
+
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
"Rate limit checksum events to this many checksum errors per second "
"(do not set below ZED threshold).");
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 47346dd5a..9d12bc2eb 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
/* IO delays */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+ /* Direct I/O write verify errors */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS,
+ vs->vs_dio_verify_errors);
+
/* Add extended stats nvlist to main nvlist */
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index f7cecc9af..25b05abd3 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, vs->vs_checksum_errors,
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
DATA_TYPE_UINT64, vs->vs_slow_ios,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
NULL);
}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 53366ad49..e69b98896 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -160,7 +160,6 @@
#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
-#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 399f5a011..8d0aebbec 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768;
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
- zil_callback_t callback, void *callback_data)
+ boolean_t o_direct, zil_callback_t callback, void *callback_data)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
uint32_t blocksize = zp->z_blksz;
@@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
return;
}
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
write_state = WR_INDIRECT;
else if (!spa_has_slogs(zilog->zl_spa) &&
resid >= zfs_immediate_write_sz)
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index f3db953ea..f9cc5b010 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -35,7 +35,6 @@
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
-#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
@@ -75,6 +74,14 @@ int zfs_bclone_enabled = 1;
static int zfs_bclone_wait_dirty = 0;
/*
+ * Enable Direct I/O. If this setting is 0, then all I/O requests will be
+ * directed through the ARC acting as though the dataset property direct was
+ * set to disabled.
+ */
+static int zfs_dio_enabled = 1;
+
+
+/*
* Maximum bytes to read per chunk in zfs_read().
*/
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
@@ -203,6 +210,77 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
}
/*
+ * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
+ * the "direct" dataset property). When inherited by the property only apply
+ * the O_DIRECT flag to correctly aligned IO requests. The rationale for this
+ * is it allows the property to be safely set on a dataset without forcing
+ * all of the applications to be aware of the alignment restrictions. When
+ * O_DIRECT is explicitly requested by an application return EINVAL if the
+ * request is unaligned. In all cases, if the range for this request has
+ * been mmap'ed then we will perform buffered I/O to keep the mapped region
+ * synchronized with the ARC.
+ *
+ * It is possible that a file's pages could be mmap'ed after it is checked
+ * here. If so, that is handled accordingly in zfs_write(). See comments in the
+ * following area for how this is handled:
+ * zfs_write() -> update_pages()
+ */
+static int
+zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
+ int *ioflagp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ int ioflag = *ioflagp;
+ int error = 0;
+
+ if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
+ zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
+ /*
+ * Direct I/O is disabled or the region is mmap'ed. In either
+ * case the I/O request will just be directed through the ARC.
+ */
+ ioflag &= ~O_DIRECT;
+ goto out;
+ } else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
+ zfs_uio_page_aligned(uio) &&
+ zfs_uio_aligned(uio, PAGE_SIZE)) {
+ /*
+ * direct=always with an aligned request: promote it to
+ * O_DIRECT for any read, and for writes that are at least one
+ * file block (z_blksz) long.
+ */
+ if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
+ (rw == UIO_READ)) {
+ ioflag |= O_DIRECT;
+ }
+ } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
+ /*
+ * Direct I/O was requested through the direct=always, but it
+ * is not properly PAGE_SIZE aligned. The request will be
+ * directed through the ARC.
+ */
+ ioflag &= ~O_DIRECT;
+ }
+
+ if (ioflag & O_DIRECT) {
+ /*
+ * O_DIRECT is still set at this point, so a misaligned request
+ * is rejected with EINVAL rather than silently buffered.
+ */
+ if (!zfs_uio_page_aligned(uio) ||
+ !zfs_uio_aligned(uio, PAGE_SIZE)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * The IMPLY below expects UIO_DIRECT to be set in
+ * uio->uio_extflg once this call has succeeded.
+ */
+ error = zfs_uio_get_dio_pages_alloc(uio, rw);
+ if (error) {
+ goto out;
+ }
+ }
+
+ IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
+ ASSERT0(error);
+
+out:
+ /* The possibly-modified flags are written back on every path. */
+ *ioflagp = ioflag;
+ return (error);
+}
+
+/*
* Read bytes from specified file into supplied buffer.
*
* IN: zp - inode of file to be read from.
@@ -286,24 +364,58 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = 0;
goto out;
}
-
ASSERT(zfs_uio_offset(uio) < zp->z_size);
+
+ /*
+ * Setting up Direct I/O if requested.
+ */
+ error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
+ if (error) {
+ goto out;
+ }
+
#if defined(__linux__)
ssize_t start_offset = zfs_uio_offset(uio);
#endif
+ ssize_t chunk_size = zfs_vnops_read_chunk_size;
ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
ssize_t start_resid = n;
+ ssize_t dio_remaining_resid = 0;
+
+ if (uio->uio_extflg & UIO_DIRECT) {
+ /*
+ * All pages for an O_DIRECT request have already been mapped
+ * so there's no compelling reason to handle this uio in
+ * smaller chunks.
+ */
+ chunk_size = DMU_MAX_ACCESS;
+
+ /*
+ * In the event that the O_DIRECT request is reading the entire
+ * file, it is possible that the file's length is not page-size
+ * aligned. However, lower layers expect that the Direct I/O
+ * request is page-aligned. In this case, as much of the file
+ * that can be read using Direct I/O happens and the remaining
+ * amount will be read through the ARC.
+ *
+ * This is still consistent with the semantics of Direct I/O in
+ * ZFS as at a minimum the I/O request must be page-aligned.
+ */
+ dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
+ if (dio_remaining_resid != 0)
+ n -= dio_remaining_resid;
+ }
while (n > 0) {
- ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
- P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
+ ssize_t nbytes = MIN(n, chunk_size -
+ P2PHASE(zfs_uio_offset(uio), chunk_size));
#ifdef UIO_NOCOPY
if (zfs_uio_segflg(uio) == UIO_NOCOPY)
error = mappedread_sf(zp, nbytes, uio);
else
#endif
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
- zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
+ zfs_uio_offset(uio) + nbytes - 1)) {
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -332,12 +444,40 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
n -= nbytes;
}
+ if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
+ dio_remaining_resid != 0) {
+ /*
+ * Temporarily remove the UIO_DIRECT flag from the UIO so the
+ * remainder of the file can be read using the ARC.
+ */
+ uio->uio_extflg &= ~UIO_DIRECT;
+
+ if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
+ error = mappedread(zp, dio_remaining_resid, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
+ dio_remaining_resid);
+ }
+ uio->uio_extflg |= UIO_DIRECT;
+
+ if (error != 0)
+ n += dio_remaining_resid;
+ } else if (error && (uio->uio_extflg & UIO_DIRECT)) {
+ n += dio_remaining_resid;
+ }
int64_t nread = start_resid - n;
+
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
- task_io_account_read(nread);
out:
zfs_rangelock_exit(lr);
+ /*
+ * Cleanup for Direct I/O if requested.
+ */
+ if (uio->uio_extflg & UIO_DIRECT)
+ zfs_uio_free_dio_pages(uio, UIO_READ);
+
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -422,6 +562,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
int error = 0, error1;
ssize_t start_resid = zfs_uio_resid(uio);
uint64_t clear_setid_bits_txg = 0;
+ boolean_t o_direct_defer = B_FALSE;
/*
* Fasttrack empty write
@@ -475,6 +616,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}
/*
+ * Setting up Direct I/O if requested.
+ */
+ error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
+ if (error) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(error));
+ }
+
+ /*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
*/
@@ -504,6 +654,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
woff = zp->z_size;
}
zfs_uio_setoffset(uio, woff);
+ /*
+ * We need to update the starting offset as well because it is
+ * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
+ * layers.
+ */
+ zfs_uio_setsoffset(uio, woff);
} else {
/*
* Note that if the file block size will change as a result of
@@ -540,6 +696,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
const uint64_t projid = zp->z_projid;
/*
+ * In the event we are increasing the file block size
+ * (lr_length == UINT64_MAX), we will direct the write to the ARC.
+ * Because zfs_grow_blocksize() will read from the ARC in order to
+ * grow the dbuf, we avoid doing Direct I/O here as that would cause
+ * data written to disk to be overwritten by data in the ARC during
+ * the sync phase. Besides writing data twice to disk, we also
+ * want to avoid consistency concerns between data in the ARC and
+ * on disk while growing the file's blocksize.
+ *
+ * We will only temporarily remove Direct I/O and put it back after
+ * we have grown the blocksize. We do this in the event a request
+ * is larger than max_blksz, so further requests to
+ * dmu_write_uio_dbuf() will still issue the requests using Direct
+ * IO.
+ *
+ * As an example:
+ * The first block of the file is being written as a 4K request with
+ * a recordsize of 1K. The first 1K issued in the loop below will go
+ * through the ARC; however, the following 3 1K requests will
+ * use Direct I/O.
+ */
+ if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
+ uio->uio_extflg &= ~UIO_DIRECT;
+ o_direct_defer = B_TRUE;
+ }
+
+ /*
* Write the file in reasonable size chunks. Each chunk is written
* in a separate transaction; this keeps the intent log records small
* and allows us to do more fine-grained space accounting.
@@ -580,6 +763,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
ssize_t nbytes = n;
if (n >= blksz && woff >= zp->z_size &&
P2PHASE(woff, blksz) == 0 &&
+ !(uio->uio_extflg & UIO_DIRECT) &&
(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/*
* This write covers a full block. "Borrow" a buffer
@@ -705,9 +889,30 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_uioskip(uio, nbytes);
tx_bytes = nbytes;
}
+ /*
+ * There is a window where a file's pages can be mmap'ed after
+ * zfs_setup_direct() is called. This is due to the fact that
+ * the rangelock in this function is acquired after calling
+ * zfs_setup_direct(). This is done so that
+ * zfs_uio_prefaultpages() does not attempt to fault in pages
+ * on Linux for Direct I/O requests. This is not necessary as
+ * the pages are pinned in memory and can not be faulted out.
+ * Ideally, the rangelock would be held before calling
+ * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
+ * this can lead to a deadlock as zfs_getpage() also acquires
+ * the rangelock as a RL_WRITER and prefaulting the pages can
+ * lead to zfs_getpage() being called.
+ *
+ * In the case of the pages being mapped after
+ * zfs_setup_direct() is called, the call to update_pages()
+ * will still be made to make sure there is consistency between
+ * the ARC and the Linux page cache. This is an unfortunate
+ * situation as the data will be read back into the ARC after
+ * the Direct I/O write has completed, but this is the penalty
+ * for writing to a mmap'ed region of a file using Direct I/O.
+ */
if (tx_bytes &&
- zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
- !(ioflag & O_DIRECT)) {
+ zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
}
@@ -756,10 +961,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* the TX_WRITE records logged here.
*/
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
- NULL, NULL);
+ uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
+ NULL);
dmu_tx_commit(tx);
+ /*
+ * Direct I/O was deferred in order to grow the first block.
+ * At this point it can be re-enabled for subsequent writes.
+ */
+ if (o_direct_defer) {
+ ASSERT(ioflag & O_DIRECT);
+ uio->uio_extflg |= UIO_DIRECT;
+ o_direct_defer = B_FALSE;
+ }
+
if (error != 0)
break;
ASSERT3S(tx_bytes, ==, nbytes);
@@ -767,10 +983,22 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
pfbytes -= nbytes;
}
+ if (o_direct_defer) {
+ ASSERT(ioflag & O_DIRECT);
+ uio->uio_extflg |= UIO_DIRECT;
+ o_direct_defer = B_FALSE;
+ }
+
zfs_znode_update_vfs(zp);
zfs_rangelock_exit(lr);
/*
+ * Cleanup for Direct I/O if requested.
+ */
+ if (uio->uio_extflg & UIO_DIRECT)
+ zfs_uio_free_dio_pages(uio, UIO_WRITE);
+
+ /*
* If we're in replay mode, or we made no progress, or the
* uio data is inaccessible return an error. Otherwise, it's
* at least a partial write, so it's successful.
@@ -784,9 +1012,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
if (commit)
zil_commit(zilog, zp->z_id);
- const int64_t nwritten = start_resid - zfs_uio_resid(uio);
+ int64_t nwritten = start_resid - zfs_uio_resid(uio);
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
- task_io_account_write(nwritten);
zfs_exit(zfsvfs, FTAG);
return (0);
@@ -846,7 +1073,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
- dmu_buf_t *db;
zgd_t *zgd;
int error = 0;
uint64_t zp_gen;
@@ -890,8 +1116,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+ size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
@@ -929,18 +1155,44 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
zil_fault_io = 0;
}
#endif
+
+ dmu_buf_t *dbp;
if (error == 0)
error = dmu_buf_hold_noread(os, object, offset, zgd,
- &db);
+ &dbp);
if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
+ zgd->zgd_db = dbp;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
+ boolean_t direct_write = B_FALSE;
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr =
+ dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
+ if (dr != NULL && dr->dt.dl.dr_diowrite)
+ direct_write = B_TRUE;
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * All Direct I/O writes will have already completed and
+ * the block pointer can be immediately stored in the
+ * log record.
+ */
+ if (direct_write) {
+ /*
+ * A Direct I/O write always covers an entire
+ * block.
+ */
+ ASSERT3U(dbp->db_size, ==, zp->z_blksz);
+ lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
+ zfs_get_done(zgd, 0);
+ return (0);
+ }
- zgd->zgd_db = db;
+ blkptr_t *bp = &lr->lr_blkptr;
zgd->zgd_bp = bp;
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
+ ASSERT3U(dbp->db_offset, ==, offset);
+ ASSERT3U(dbp->db_size, ==, size);
error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
@@ -975,7 +1227,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
return (error);
}
-
static void
zfs_get_done(zgd_t *zgd, int error)
{
@@ -1559,3 +1810,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
"Wait for dirty blocks when cloning");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
+ "Enable Direct I/O");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 53992931e..66a8a9fef 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(*errorp, ==, EIO);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ }
+
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
+ enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ?
+ ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;
- ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
- zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
- zp->zp_compress >= ZIO_COMPRESS_OFF &&
- zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
- DMU_OT_IS_VALID(zp->zp_type) &&
- zp->zp_level < 32 &&
- zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, pipeline);
zio->io_ready = ready;
zio->io_children_ready = children_ready;
@@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ } else if (type == ZIO_TYPE_WRITE &&
+ pio->io_prop.zp_direct_write == B_TRUE) {
+ /*
+ * By default we will only verify checksums for Direct I/O
+ * writes on Linux. FreeBSD is able to place user pages under
+ * write protection before issuing them to the ZIO pipeline.
+ *
+ * Checksum validation errors will only be reported through
+ * the top-level VDEV, which is set by this child ZIO.
+ */
+ ASSERT3P(bp, !=, NULL);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
}
if (vd->vdev_ops->vdev_op_leaf) {
@@ -3104,6 +3117,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_nopwrite = B_FALSE;
zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_byteorder = gio->io_prop.zp_byteorder;
+ zp.zp_direct_write = B_FALSE;
memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -3577,6 +3591,13 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+ /*
+ * Deduplication will not take place for Direct I/O writes. The
+ * ddt_tree will be emptied in syncing context. Direct I/O writes take
+ * place in open context. A Direct I/O write can not attempt to
+ * modify the ddt_tree while issuing out a write.
+ */
+ ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp);
@@ -4509,6 +4530,19 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_vsd = NULL;
}
+ /*
+ * If a Direct I/O write checksum verify error has occurred then this
+ * I/O should not attempt to be issued again. Instead the EIO will
+ * be returned.
+ */
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ ASSERT3U(zio->io_error, ==, EIO);
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ return (zio);
+ }
+
+
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
@@ -4822,6 +4856,49 @@ zio_checksum_verify(zio_t *zio)
return (zio);
}
+static zio_t *
+zio_dio_checksum_verify(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ int error;
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+
+ if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0)
+ goto out;
+
+ if ((error = zio_checksum_error(zio, NULL)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_dio_verify_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ zio->io_error = SET_ERROR(EIO);
+ zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+
+ /*
+ * The EIO error must be propagated up to the logical
+ * parent ZIO in zio_notify_parent() so it can be
+ * returned to dmu_write_abd().
+ */
+ zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0);
+ }
+ }
+
+out:
+ return (zio);
+}
+
+
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
@@ -5152,7 +5229,8 @@ zio_done(zio_t *zio)
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
- !vdev_is_dead(zio->io_vd)) {
+ !vdev_is_dead(zio->io_vd) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
@@ -5167,6 +5245,7 @@ zio_done(zio_t *zio)
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
@@ -5188,7 +5267,8 @@ zio_done(zio_t *zio)
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (IO_IS_ALLOCATING(zio) &&
- !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
@@ -5239,6 +5319,14 @@ zio_done(zio_t *zio)
if (zio->io_reexecute) {
/*
+ * A Direct I/O write that has a checksum verify error should
+ * not attempt to reexecute. Instead, EAGAIN should just be
+ * propagated back up so that an attempt can be made to issue
+ * the write through the ARC.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+
+ /*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
@@ -5398,6 +5486,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_vdev_io_done,
zio_vdev_io_assess,
zio_checksum_verify,
+ zio_dio_checksum_verify,
zio_done
};