aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/zio.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r--module/zfs/zio.c113
1 files changed, 101 insertions, 12 deletions
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 53992931e..66a8a9fef 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(*errorp, ==, EIO);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ }
+
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
+ enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ?
+ ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;
- ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
- zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
- zp->zp_compress >= ZIO_COMPRESS_OFF &&
- zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
- DMU_OT_IS_VALID(zp->zp_type) &&
- zp->zp_level < 32 &&
- zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, pipeline);
zio->io_ready = ready;
zio->io_children_ready = children_ready;
@@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ } else if (type == ZIO_TYPE_WRITE &&
+ pio->io_prop.zp_direct_write == B_TRUE) {
+ /*
+ * By default we will only verify checksums for Direct I/O
+ * writes on Linux. FreeBSD is able to place user pages under
+ * write protection before issuing them to the ZIO pipeline.
+ *
+ * Checksum validation errors will only be reported through
+ * the top-level VDEV, which is set by this child ZIO.
+ */
+ ASSERT3P(bp, !=, NULL);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
}
if (vd->vdev_ops->vdev_op_leaf) {
@@ -3104,6 +3117,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_nopwrite = B_FALSE;
zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_byteorder = gio->io_prop.zp_byteorder;
+ zp.zp_direct_write = B_FALSE;
memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -3577,6 +3591,13 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+ /*
+ * Deduplication will not take place for Direct I/O writes. The
+ * ddt_tree will be emptied in syncing context. Direct I/O writes take
+ * place in open context. A Direct I/O write cannot attempt to
+ * modify the ddt_tree while issuing a write.
+ */
+ ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp);
@@ -4509,6 +4530,19 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_vsd = NULL;
}
+ /*
+ * If a Direct I/O write checksum verify error has occurred, this
+ * I/O should not be issued again. Instead, the EIO will be
+ * returned.
+ */
+ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+ ASSERT3U(zio->io_error, ==, EIO);
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ return (zio);
+ }
+
+
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
@@ -4822,6 +4856,49 @@ zio_checksum_verify(zio_t *zio)
return (zio);
}
+static zio_t *
+zio_dio_checksum_verify(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ int error;
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
+ ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+
+ if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0)
+ goto out;
+
+ if ((error = zio_checksum_error(zio, NULL)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_dio_verify_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ zio->io_error = SET_ERROR(EIO);
+ zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+
+ /*
+ * Clear ZIO_FLAG_DONT_PROPAGATE so that the EIO error
+ * is propagated up to the logical parent ZIO in
+ * zio_notify_parent() and can be returned to dmu_write_abd().
+ */
+ zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0);
+ }
+ }
+
+out:
+ return (zio);
+}
+
+
/*
* Called by RAID-Z to ensure we don't compute the checksum twice.
*/
@@ -5152,7 +5229,8 @@ zio_done(zio_t *zio)
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
- !vdev_is_dead(zio->io_vd)) {
+ !vdev_is_dead(zio->io_vd) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
@@ -5167,6 +5245,7 @@ zio_done(zio_t *zio)
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
@@ -5188,7 +5267,8 @@ zio_done(zio_t *zio)
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (IO_IS_ALLOCATING(zio) &&
- !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+ !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
@@ -5239,6 +5319,14 @@ zio_done(zio_t *zio)
if (zio->io_reexecute) {
/*
+ * A Direct I/O write that has a checksum verify error should
+ * not attempt to reexecute. Instead, EAGAIN should just be
+ * propagated back up so the write can be reattempted
+ * through the ARC.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+
+ /*
* This is a logical I/O that wants to reexecute.
*
* Reexecute is top-down. When an i/o fails, if it's not
@@ -5398,6 +5486,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_vdev_io_done,
zio_vdev_io_assess,
zio_checksum_verify,
+ zio_dio_checksum_verify,
zio_done
};