diff options
Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r-- | module/zfs/zio.c | 355 |
1 files changed, 330 insertions, 25 deletions
diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1d69d8d8d..959b9a5a8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -43,6 +43,7 @@ #include <sys/time.h> #include <sys/trace_zio.h> #include <sys/abd.h> +#include <sys/dsl_crypt.h> /* * ========================================================================== @@ -368,7 +369,7 @@ zio_pop_transforms(zio_t *zio) /* * ========================================================================== - * I/O transform callbacks for subblocks and decompression + * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void @@ -394,6 +395,126 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size) } } +static void +zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) +{ + int ret; + void *tmp; + blkptr_t *bp = zio->io_bp; + uint64_t lsize = BP_GET_LSIZE(bp); + dmu_object_type_t ot = BP_GET_TYPE(bp); + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(size, !=, 0); + + if (zio->io_error != 0) + return; + + /* + * Verify the cksum of MACs stored in an indirect bp. It will always + * be possible to verify this since it does not require an encryption + * key. + */ + if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { + zio_crypt_decode_mac_bp(bp, mac); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* + * We haven't decompressed the data yet, but + * zio_crypt_do_indirect_mac_checksum() requires + * decompressed data to be able to parse out the MACs + * from the indirect block. We decompress it now and + * throw away the result after we are finished. + */ + tmp = zio_buf_alloc(lsize); + ret = zio_decompress_data(BP_GET_COMPRESS(bp), + zio->io_abd, tmp, zio->io_size, lsize); + if (ret != 0) { + ret = SET_ERROR(EIO); + goto error; + } + ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, + tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); + zio_buf_free(tmp, lsize); + } else { + ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, + zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); + } + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + } + + /* + * If this is an authenticated block, just check the MAC. It would be + * nice to separate this out into its own flag, but for the moment + * enum zio_flag is out of bits. + */ + if (BP_IS_AUTHENTICATED(bp)) { + if (ot == DMU_OT_OBJSET) { + ret = spa_do_crypt_objset_mac_abd(B_FALSE, zio->io_spa, + zio->io_bookmark.zb_objset, zio->io_abd, size, + BP_SHOULD_BYTESWAP(bp)); + } else { + zio_crypt_decode_mac_bp(bp, mac); + ret = spa_do_crypt_mac_abd(B_FALSE, zio->io_spa, + zio->io_bookmark.zb_objset, zio->io_abd, size, mac); + } + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + } + + zio_crypt_decode_params_bp(bp, salt, iv); + + if (ot == DMU_OT_INTENT_LOG) { + tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmp, mac); + abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, mac); + } + + ret = spa_do_crypt_abd(B_FALSE, zio->io_spa, zio->io_bookmark.zb_objset, + bp, bp->blk_birth, size, data, zio->io_abd, iv, mac, salt, + &no_crypt); + if (no_crypt) + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + +error: + /* assert that the key was found unless this was speculative */ + ASSERT(ret != ENOENT || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); + + /* + * If there was a decryption / authentication error return EIO as + * the io_error. If this was not a speculative zio, create an ereport. + */ + if (ret == ECKSUM) { + ret = SET_ERROR(EIO); + if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0); + } + } else { + zio->io_error = ret; + } +} + /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks @@ -606,7 +727,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); - IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); + IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); @@ -844,9 +965,12 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to - * dedup (just take the already-allocated BP verbatim). + * dedup (just take the already-allocated BP verbatim). Encrypted + * dedup blocks need data as well so we also disable dedup in this + * case. */ - if (data == NULL && zio->io_prop.zp_dedup_verify) { + if (data == NULL && + (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } @@ -1186,16 +1310,23 @@ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; + uint64_t psize = + BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && - !(zio->io_flags & ZIO_FLAG_RAW)) { - uint64_t psize = - BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); + !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } + if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || + BP_HAS_INDIRECT_MAC_CKSUM(bp)) && + zio->io_child_type == ZIO_CHILD_LOGICAL) { + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decrypt); + } + if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); @@ -1222,7 +1353,6 @@ zio_read_bp_init(zio_t *zio) static int zio_write_bp_init(zio_t *zio) { - if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); @@ -1261,7 +1391,8 @@ zio_write_bp_init(zio_t *zio) ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); - if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && + !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); @@ -1290,8 +1421,6 @@ zio_write_compress(zio_t *zio) uint64_t psize = zio->io_size; int pass = 1; - EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); - /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. @@ -1341,13 +1470,15 @@ zio_write_compress(zio_t *zio) } /* If it's a compressed write that is not raw, compress the buffer. */ - if (compress != ZIO_COMPRESS_OFF && psize == lsize) { + if (compress != ZIO_COMPRESS_OFF && + !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); - } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && + } else if (!zp->zp_dedup && !zp->zp_encrypt && + psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, @@ -1445,6 +1576,8 @@ zio_write_compress(zio_t *zio) if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!zp->zp_encrypt || + DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { @@ -1868,7 +2001,8 @@ zio_suspend(spa_t *spa, zio_t *zio) cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, + NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); @@ -2298,11 +2432,19 @@ zio_write_gang_block(zio_t *pio) uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; - int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); + int gbh_copies; zio_prop_t zp; int g, error; - int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + + /* + * encrypted blocks need DVA[2] free so encrypted gang headers can't + * have a third copy. + */ + gbh_copies = MIN(copies + 1, spa_max_replication(spa)); + if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) + gbh_copies = SPA_DVAS_PER_BP - 1; + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); @@ -2376,12 +2518,16 @@ zio_write_gang_block(zio_t *pio) zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; + bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); + bzero(zp.zp_iv, ZIO_DATA_IV_LEN); + bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, @@ -2460,6 +2606,7 @@ zio_nop_write(zio_t *zio) if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || + BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || @@ -2609,7 +2756,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these - * cases we can compare the io_data directly. This is useful because + * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). @@ -3097,8 +3244,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, - boolean_t *slog) +zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, + uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; @@ -3130,6 +3277,23 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + + /* + * encrypted blocks will require an IV and salt. We generate + * these now since we will not be rewriting the bp at + * rewrite time. + */ + if (os->os_encrypted) { + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t salt[ZIO_DATA_SALT_LEN]; + + BP_SET_CRYPT(new_bp, B_TRUE); + VERIFY0(spa_crypt_get_salt(spa, + dmu_objset_id(os), salt)); + VERIFY0(zio_crypt_generate_iv(iv)); + + zio_crypt_encode_params_bp(new_bp, salt, iv); + } } return (error); @@ -3464,6 +3628,146 @@ zio_vdev_io_bypass(zio_t *zio) /* * ========================================================================== + * Encrypt and store encryption parameters + * ========================================================================== + */ + + +/* + * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for + * managing the storage of encryption parameters and passing them to the + * lower-level encryption functions. + */ +static int +zio_encrypt(zio_t *zio) +{ + zio_prop_t *zp = &zio->io_prop; + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t psize = BP_GET_PSIZE(bp); + dmu_object_type_t ot = BP_GET_TYPE(bp); + void *enc_buf = NULL; + abd_t *eabd = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + /* the root zio already encrypted the data */ + if (zio->io_child_type == ZIO_CHILD_GANG) + return (ZIO_PIPELINE_CONTINUE); + + /* only ZIL blocks are re-encrypted on rewrite */ + if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) + return (ZIO_PIPELINE_CONTINUE); + + if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { + BP_SET_CRYPT(bp, B_FALSE); + return (ZIO_PIPELINE_CONTINUE); + } + + /* if we are doing raw encryption set the provided encryption params */ + if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { + BP_SET_CRYPT(bp, B_TRUE); + BP_SET_BYTEORDER(bp, zp->zp_byteorder); + if (ot != DMU_OT_OBJSET) + zio_crypt_encode_mac_bp(bp, zp->zp_mac); + if (DMU_OT_IS_ENCRYPTED(ot)) + zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); + return (ZIO_PIPELINE_CONTINUE); + } + + /* indirect blocks only maintain a cksum of the lower level MACs */ + if (BP_GET_LEVEL(bp) > 0) { + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, + zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), + mac)); + zio_crypt_encode_mac_bp(bp, mac); + return (ZIO_PIPELINE_CONTINUE); + } + + /* + * Objset blocks are a special case since they have 2 256-bit MACs + * embedded within them. + */ + if (ot == DMU_OT_OBJSET) { + ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, + zio->io_bookmark.zb_objset, zio->io_abd, psize, + BP_SHOULD_BYTESWAP(bp))); + return (ZIO_PIPELINE_CONTINUE); + } + + /* unencrypted object types are only authenticated with a MAC */ + if (!DMU_OT_IS_ENCRYPTED(ot)) { + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, + zio->io_bookmark.zb_objset, zio->io_abd, psize, mac)); + zio_crypt_encode_mac_bp(bp, mac); + return (ZIO_PIPELINE_CONTINUE); + } + + /* + * Later passes of sync-to-convergence may decide to rewrite data + * in place to avoid more disk reallocations. This presents a problem + * for encryption because this consitutes rewriting the new data with + * the same encryption key and IV. However, this only applies to blocks + * in the MOS (particularly the spacemaps) and we do not encrypt the + * MOS. We assert that the zio is allocating or an intent log write + * to enforce this. + */ + ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); + ASSERT3U(psize, !=, 0); + + enc_buf = zio_buf_alloc(psize); + eabd = abd_get_from_buf(enc_buf, psize); + abd_take_ownership_of_buf(eabd, B_TRUE); + + /* + * For an explanation of what encryption parameters are stored + * where, see the block comment in zio_crypt.c. + */ + if (ot == DMU_OT_INTENT_LOG) { + zio_crypt_decode_params_bp(bp, salt, iv); + } else { + BP_SET_CRYPT(bp, B_TRUE); + } + + /* Perform the encryption. This should not fail */ + VERIFY0(spa_do_crypt_abd(B_TRUE, spa, zio->io_bookmark.zb_objset, bp, + zio->io_txg, psize, zio->io_abd, eabd, iv, mac, salt, &no_crypt)); + + /* encode encryption metadata into the bp */ + if (ot == DMU_OT_INTENT_LOG) { + /* + * ZIL blocks store the MAC in the embedded checksum, so the + * transform must always be applied. + */ + zio_crypt_encode_mac_zil(enc_buf, mac); + zio_push_transform(zio, eabd, psize, psize, NULL); + } else { + BP_SET_CRYPT(bp, B_TRUE); + zio_crypt_encode_params_bp(bp, salt, iv); + zio_crypt_encode_mac_bp(bp, mac); + + if (no_crypt) { + ASSERT3U(ot, ==, DMU_OT_DNODE); + abd_free(eabd); + } else { + zio_push_transform(zio, eabd, psize, psize, NULL); + } + } + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * ========================================================================== * Generate and verify checksums * ========================================================================== */ @@ -3523,8 +3827,8 @@ zio_checksum_verify(zio_t *zio) if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, zio, zio->io_offset, - zio->io_size, NULL, &info); + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, NULL, &info); } } @@ -3824,7 +4128,7 @@ zio_done(zio_t *zio) if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, - zio->io_vd, zio, 0, 0); + zio->io_vd, &zio->io_bookmark, zio, 0, 0); } if (zio->io_error) { @@ -3837,7 +4141,7 @@ zio_done(zio_t *zio) if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, zio, 0, 0); + zio->io_vd, &zio->io_bookmark, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && @@ -3846,9 +4150,9 @@ zio_done(zio_t *zio) * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ - spa_log_error(zio->io_spa, zio); + spa_log_error(zio->io_spa, &zio->io_bookmark); zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, - NULL, zio, 0, 0); + NULL, &zio->io_bookmark, zio, 0, 0); } } @@ -4046,6 +4350,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_free_bp_init, zio_issue_async, zio_write_compress, + zio_encrypt, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, |