diff options
author | Tom Caputi <[email protected]> | 2017-08-14 13:36:48 -0400 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2017-08-14 10:36:48 -0700 |
commit | b52563034230b35f0562b6f40ad1a00f02bd9a05 (patch) | |
tree | 794ccc5160e997e280cb6e36c7778ce9f7a96548 /module/zfs/arc.c | |
parent | 376994828fd3753aba75d492859727ca76f6a293 (diff) |
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Jorgen Lundman <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #494
Closes #5769
Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r-- | module/zfs/arc.c | 1611 |
1 files changed, 1352 insertions, 259 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 157a28d4b..d7ad101c3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -258,6 +258,21 @@ * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. + * + * The L1ARC has a slightly different system for storing encrypted data. + * Raw (encrypted + possibly compressed) data has a few subtle differences from + * data that is just compressed. The biggest difference is that it is not + * possible to decrypt encrypted data (or visa versa) if the keys aren't loaded. + * The other difference is that encryption cannot be treated as a suggestion. + * If a caller would prefer compressed data, but they actually wind up with + * uncompressed data the worst thing that could happen is there might be a + * performance hit. If the caller requests encrypted data, however, we must be + * sure they actually get it or else secret information could be leaked. Raw + * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, + * may have both an encrypted version and a decrypted version of its data at + * once. When a caller needs a raw arc_buf_t, it is allocated and the data is + * copied out of this header. To avoid complications with b_pabd, raw buffers + * cannot be shared. */ #include <sys/spa.h> @@ -274,6 +289,8 @@ #include <sys/zio_checksum.h> #include <sys/multilist.h> #include <sys/abd.h> +#include <sys/zil.h> +#include <sys/fm/fs/zfs.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -645,6 +662,7 @@ typedef struct arc_stats { kstat_named_t arcstat_demand_hit_predictive_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; + kstat_named_t arcstat_raw_size; } arc_stats_t; static arc_stats_t arc_stats = { @@ -739,7 +757,8 @@ static arc_stats_t arc_stats = { { "sync_wait_for_async", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, - { "arc_sys_free", KSTAT_DATA_UINT64 } + { "arc_sys_free", KSTAT_DATA_UINT64 }, + { "arc_raw_size", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -815,6 +834,8 @@ static arc_state_t *arc_l2c_only; #define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ +/* size of all b_rabd's in entire arc */ +#define arc_raw_size ARCSTAT(arcstat_raw_size) /* compressed size of entire arc */ #define arc_compressed_size ARCSTAT(arcstat_compressed_size) /* uncompressed size of entire arc */ @@ -844,6 +865,8 @@ static taskq_t *arc_prune_taskq; #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) +#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ @@ -852,6 +875,13 @@ static taskq_t *arc_prune_taskq; #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +#define HDR_HAS_RABD(hdr) \ + (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ + (hdr)->b_crypt_hdr.b_rabd != NULL) +#define HDR_ENCRYPTED(hdr) \ + (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) +#define HDR_AUTHENTICATED(hdr) \ + (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) @@ -864,12 +894,14 @@ static taskq_t *arc_prune_taskq; #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) +#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ -#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* @@ -967,6 +999,14 @@ typedef struct l2arc_data_free { list_node_t l2df_list_node; } l2arc_data_free_t; +typedef enum arc_fill_flags { + ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ + ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ + ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ + ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ + ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ +} arc_fill_flags_t; + static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; @@ -977,8 +1017,8 @@ static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); -static void arc_hdr_free_pabd(arc_buf_hdr_t *); -static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); +static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); +static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); @@ -1130,7 +1170,9 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. */ + static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; @@ -1153,6 +1195,7 @@ buf_fini(void) for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1181,6 +1224,19 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) /* ARGSUSED */ static int +hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + hdr_full_cons(vbuf, unused, kmflag); + bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); + arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; @@ -1224,6 +1280,16 @@ hdr_full_dest(void *vbuf, void *unused) /* ARGSUSED */ static void +hdr_full_crypt_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + hdr_full_dest(vbuf, unused); + arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void hdr_l2only_dest(void *vbuf, void *unused) { ASSERTV(arc_buf_hdr_t *hdr = vbuf); @@ -1294,6 +1360,9 @@ retry: hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", + HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, + hdr_recl, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, NULL, NULL, 0); @@ -1330,6 +1399,46 @@ arc_buf_lsize(arc_buf_t *buf) return (HDR_GET_LSIZE(buf->b_hdr)); } +/* + * This function will return B_TRUE if the buffer is encrypted in memory. + * This buffer can be decrypted by calling arc_untransform(). + */ +boolean_t +arc_is_encrypted(arc_buf_t *buf) +{ + return (ARC_BUF_ENCRYPTED(buf) != 0); +} + +/* + * Returns B_TRUE if the buffer represents data that has not had its MAC + * verified yet. + */ +boolean_t +arc_is_unauthenticated(arc_buf_t *buf) +{ + return (HDR_NOAUTH(buf->b_hdr) != 0); +} + +void +arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, + uint8_t *iv, uint8_t *mac) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(HDR_PROTECTED(hdr)); + + bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); + bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); + bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); + *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? + ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; +} + +/* + * Indicates how this buffer is compressed in memory. If it is not compressed + * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with + * arc_untransform() as long as it is also unencrypted. + */ enum zio_compress arc_get_compression(arc_buf_t *buf) { @@ -1337,6 +1446,18 @@ arc_get_compression(arc_buf_t *buf) HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } +/* + * Return the compression algorithm used to store this data in the ARC. If ARC + * compression is enabled or this is an encrypted block, this will be the same + * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. + */ +static inline enum zio_compress +arc_hdr_get_compress(arc_buf_hdr_t *hdr) +{ + return (HDR_COMPRESSION_ENABLED(hdr) ? + HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); +} + static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -1364,6 +1485,7 @@ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); @@ -1374,6 +1496,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) /* * Return true iff at least one of the bufs on hdr is not compressed. + * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) @@ -1421,58 +1544,18 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } +/* + * This function makes the assumption that data stored in the L2ARC + * will be transformed exactly as it is in the main pool. Because of + * this we can verify the checksum against the reading process's bp. + */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); - boolean_t valid_cksum; - ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* - * We rely on the blkptr's checksum to determine if the block - * is valid or not. When compressed arc is enabled, the l2arc - * writes the block to the l2arc just as it appears in the pool. - * This allows us to use the blkptr's checksum to validate the - * data that we just read off of the l2arc without having to store - * a separate checksum in the arc_buf_hdr_t. However, if compressed - * arc is disabled, then the data written to the l2arc is always - * uncompressed and won't match the block as it exists in the main - * pool. When this is the case, we must first compress it if it is - * compressed on the main pool before we can validate the checksum. - */ - if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { - uint64_t lsize; - uint64_t csize; - void *cbuf; - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - - cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); - lsize = HDR_GET_LSIZE(hdr); - csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); - - ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); - if (csize < HDR_GET_PSIZE(hdr)) { - /* - * Compressed blocks are always a multiple of the - * smallest ashift in the pool. Ideally, we would - * like to round up the csize to the next - * spa_min_ashift but that value may have changed - * since the block was last written. Instead, - * we rely on the fact that the hdr's psize - * was set to the psize of the block when it was - * last written. We set the csize to that value - * and zero out any part that should not contain - * data. - */ - bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); - csize = HDR_GET_PSIZE(hdr); - } - zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); - } - - /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an @@ -1485,11 +1568,9 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ - valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); - zio_pop_transforms(zio); - return (valid_cksum); } /* @@ -1518,6 +1599,7 @@ arc_cksum_compute(arc_buf_t *buf) return; } + ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); @@ -1684,15 +1766,14 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, cmp); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } + + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* @@ -1733,6 +1814,254 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) } /* + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. + */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +static int +arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) +{ + int ret; + uint64_t csize; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + void *tmpbuf = NULL; + abd_t *abd = hdr->b_l1hdr.b_pabd; + + ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT(HDR_AUTHENTICATED(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + /* + * The MAC is calculated on the compressed data that is stored on disk. + * However, if compressed arc is disabled we will only have the + * decompressed data available to us now. Compress it into a temporary + * abd so we can verify the MAC. The performance overhead of this will + * be relatively low, since most objects in an encrypted objset will + * be encrypted (instead of authenticated) anyway. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + tmpbuf = zio_buf_alloc(lsize); + abd = abd_get_from_buf(tmpbuf, lsize); + abd_take_ownership_of_buf(abd, B_TRUE); + + csize = zio_compress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmpbuf, lsize); + ASSERT3U(csize, <=, psize); + abd_zero_off(abd, csize, psize - csize); + } + + /* + * Authentication is best effort. We authenticate whenever the key is + * available. If we succeed we clear ARC_FLAG_NOAUTH. + */ + if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, psize); + ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, + psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + } else { + ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, + hdr->b_crypt_hdr.b_mac); + } + + if (ret == 0) + arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); + else if (ret != ENOENT) + goto error; + + if (tmpbuf != NULL) + abd_free(abd); + + return (0); + +error: + if (tmpbuf != NULL) + abd_free(abd); + + return (ret); +} + +/* + * This function will take a header that only has raw encrypted data in + * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in + * b_l1hdr.b_pabd. If designated in the header flags, this function will + * also decompress the data. + */ +static int +arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + abd_t *cabd = NULL; + void *tmp = NULL; + boolean_t no_crypt = B_FALSE; + boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + + ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT(HDR_ENCRYPTED(hdr)); + + arc_hdr_alloc_abd(hdr, B_FALSE); + + /* + * We must be careful to use the passed-in dsobj value here and + * not the value in b_dsobj. b_dsobj is meant to be a best guess for + * the L2ARC, which has the luxury of being able to fail without real + * consequences (the data simply won't make it to the L2ARC). In + * reality, the dsobj stored in the header may belong to a dataset + * that has been unmounted or otherwise disowned, meaning the key + * won't be accessible via that dsobj anymore. + */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) { + ret = SET_ERROR(EACCES); + goto error; + } + + ret = zio_do_crypt_abd(B_FALSE, &dck->dck_key, + hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_ot, + hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, + HDR_GET_PSIZE(hdr), bswap, hdr->b_l1hdr.b_pabd, + hdr->b_crypt_hdr.b_rabd, &no_crypt); + if (ret != 0) + goto error; + + if (no_crypt) { + abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, + HDR_GET_PSIZE(hdr)); + } + + /* + * If this header has disabled arc compression but the b_pabd is + * compressed after decrypting it, we need to decompress the newly + * decrypted data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + /* + * We want to make sure that we are correctly honoring the + * zfs_abd_scatter_enabled setting, so we allocate an abd here + * and then loan a buffer from it, rather than allocating a + * linear buffer and wrapping it in an abd later. + */ + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); + + ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (ret != 0) { + abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); + goto error; + } + + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = cabd; + } + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + return (0); + +error: + arc_hdr_free_abd(hdr, B_FALSE); + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + if (cabd != NULL) + arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); + + return (ret); +} + +/* + * This function is called during arc_buf_fill() to prepare the header's + * abd plaintext pointer for use. This involves authenticated protected + * data and decrypting encrypted data into the plaintext abd. + */ +static int +arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, + uint64_t dsobj, boolean_t noauth) +{ + int ret; + + ASSERT(HDR_PROTECTED(hdr)); + + if (hash_lock != NULL) + mutex_enter(hash_lock); + + if (HDR_NOAUTH(hdr) && !noauth) { + /* + * The caller requested authenticated data but our data has + * not been authenticated yet. Verify the MAC now if we can. + */ + ret = arc_hdr_authenticate(hdr, spa, dsobj); + if (ret != 0) + goto error; + } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { + /* + * If we only have the encrypted version of the data, but the + * unencrypted version was requested we take this opportunity + * to store the decrypted version in the header for future use. + */ + ret = arc_hdr_decrypt(hdr, spa, dsobj); + if (ret != 0) + goto error; + } + + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + return (0); + +error: + if (hash_lock != NULL) + mutex_exit(hash_lock); + + return (ret); +} + +/* + * This function is used by the dbuf code to decrypt bonus buffers in place. + * The dbuf code itself doesn't have any locking for decrypting a shared dnode + * block, so we use the hash lock here to protect against concurrent calls to + * arc_buf_fill(). + */ +static void +arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(HDR_ENCRYPTED(hdr)); + ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); + ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + hdr->b_crypt_hdr.b_ebufcnt -= 1; +} + +/* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr @@ -1746,15 +2075,79 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) * the correct-sized data buffer. */ static int -arc_buf_fill(arc_buf_t *buf, boolean_t compressed) +arc_buf_fill(arc_buf_t *buf, spa_t *spa, uint64_t dsobj, arc_fill_flags_t flags) { + int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + boolean_t hdr_compressed = + (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); + boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; + boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); - IMPLY(compressed, hdr_compressed); + IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); + IMPLY(encrypted, HDR_ENCRYPTED(hdr)); + IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); + IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); + IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + + /* + * If the caller wanted encrypted data we just need to copy it from + * b_rabd and potentially byteswap it. We won't be able to do any + * further transforms on it. + */ + if (encrypted) { + ASSERT(HDR_HAS_RABD(hdr)); + abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, + HDR_GET_PSIZE(hdr)); + goto byteswap; + } + + /* + * Adjust encrypted and authenticated headers to accomodate the + * request if needed. + */ + if (HDR_PROTECTED(hdr)) { + error = arc_fill_hdr_crypt(hdr, hash_lock, spa, + dsobj, !!(flags & ARC_FILL_NOAUTH)); + if (error != 0) + return (error); + } + + /* + * There is a special case here for dnode blocks which are + * decrypting their bonus buffers. These blocks may request to + * be decrypted in-place. This is necessary because there may + * be many dnodes pointing into this buffer and there is + * currently no method to synchronize replacing the backing + * b_data buffer and updating all of the pointers. Here we use + * the hash lock to ensure there are no races. If the need + * arises for other types to be decrypted in-place, they must + * add handling here as well. + */ + if ((flags & ARC_FILL_IN_PLACE) != 0) { + ASSERT(!hdr_compressed); + ASSERT(!compressed); + ASSERT(!encrypted); + + if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { + ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); + + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_buf_untransform_in_place(buf, hash_lock); + if (hash_lock != NULL) + mutex_exit(hash_lock); + + /* Compute the hdr's checksum if necessary */ + arc_cksum_compute(buf); + } + + return (0); + } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { @@ -1809,7 +2202,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); return (0); } else { - int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); @@ -1820,13 +2213,14 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) if (error != 0) { zfs_dbgmsg( "hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), + hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); return (SET_ERROR(EIO)); } } } +byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); @@ -1840,28 +2234,21 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) return (0); } -int -arc_decompress(arc_buf_t *buf) -{ - return (arc_buf_fill(buf, B_FALSE)); -} - /* - * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. + * If this function is being called to decrypt an encrypted buffer or verify an + * authenticated one, the key must be loaded and a mapping must be made + * available in the keystore via spa_keystore_create_mapping() or one of its + * callers. */ -static uint64_t -arc_hdr_size(arc_buf_hdr_t *hdr) +int +arc_untransform(arc_buf_t *buf, spa_t *spa, uint64_t dsobj, boolean_t in_place) { - uint64_t size; + arc_fill_flags_t flags = 0; - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - HDR_GET_PSIZE(hdr) > 0) { - size = HDR_GET_PSIZE(hdr); - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); - size = HDR_GET_LSIZE(hdr); - } - return (size); + if (in_place) + flags |= ARC_FILL_IN_PLACE; + + return (arc_buf_fill(buf, spa, dsobj, flags)); } /* @@ -1881,6 +2268,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; @@ -1891,6 +2279,11 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } + if (HDR_HAS_RABD(hdr)) { + (void) refcount_add_many(&state->arcs_esize[type], + HDR_GET_PSIZE(hdr), hdr); + } + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; @@ -1916,6 +2309,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; @@ -1926,6 +2320,11 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } + if (HDR_HAS_RABD(hdr)) { + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_PSIZE(hdr), hdr); + } + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; @@ -2069,7 +2468,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2139,6 +2539,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); } else { arc_buf_t *buf; uint32_t buffers = 0; @@ -2171,8 +2572,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); - } else { - ASSERT(GHOST_STATE(old_state)); + } + + if (HDR_HAS_RABD(hdr)) { + (void) refcount_add_many(&new_state->arcs_size, + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2182,6 +2586,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, @@ -2222,9 +2627,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, buf); } ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - (void) refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), hdr); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), + hdr); + } + + if (HDR_HAS_RABD(hdr)) { + (void) refcount_remove_many( + &old_state->arcs_size, HDR_GET_PSIZE(hdr), + hdr); + } } } @@ -2327,12 +2743,13 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: - * 1. the hdr's compression matches the buf's compression - * 2. the hdr doesn't need to be byteswapped - * 3. the hdr isn't already being shared - * 4. the buf is either compressed or it is the last buf in the hdr list + * 1. the buffer is not encrypted + * 2. the hdr's compression matches the buf's compression + * 3. the hdr doesn't need to be byteswapped + * 4. the hdr isn't already being shared + * 5. the buf is either compressed or it is the last buf in the hdr list * - * Criterion #4 maintains the invariant that shared uncompressed + * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create @@ -2347,9 +2764,11 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); - boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; + boolean_t hdr_compressed = + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; - return (buf_compressed == hdr_compressed && + return (!ARC_BUF_ENCRYPTED(buf) && + buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); @@ -2361,10 +2780,12 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) * copy was made successfully, or an error code otherwise. */ static int -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj, void *tag, + boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; + arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); @@ -2372,6 +2793,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); + IMPLY(encrypted, compressed); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; @@ -2395,10 +2817,23 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, /* * Only honor requests for compressed bufs if the hdr is actually - * compressed. + * compressed. This must be overriden if the buffer is encrypted since + * encrypted buffers cannot be decompressed. */ - if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + if (encrypted) { + buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; + flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; + } else if (compressed && + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + flags |= ARC_FILL_COMPRESSED; + } + + if (noauth) { + ASSERT0(encrypted); + flags |= ARC_FILL_NOAUTH; + } /* * If the hdr's data can be shared then we share the data buffer and @@ -2414,7 +2849,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, * need to be ABD-aware. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && - abd_is_linear(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { @@ -2430,13 +2865,15 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; + if (encrypted) + hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { - return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); + return (arc_buf_fill(buf, spa, dsobj, flags)); } return (0); @@ -2482,6 +2919,19 @@ arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, return (buf); } +arc_buf_t * +arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, + byteorder, salt, iv, mac, ot, psize, lsize, compression_type); + + atomic_add_64(&arc_loaned_bytes, psize); + return (buf); +} + /* * Return a loaned arc buffer to the arc. @@ -2527,11 +2977,11 @@ l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) } static void -arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t size = arc_hdr_size(hdr); + uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { @@ -2549,7 +2999,11 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) arc_space_return(size, ARC_SPACE_DATA); } - l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); + if (free_rdata) { + l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); + } else { + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); + } } /* @@ -2562,6 +3016,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2689,6 +3144,18 @@ arc_buf_destroy_impl(arc_buf_t *buf) ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; + + if (ARC_BUF_ENCRYPTED(buf)) + hdr->b_crypt_hdr.b_ebufcnt -= 1; + + /* + * if we have no more encrypted buffers and we've already + * gotten a copy of the decrypted data we can free b_rabd to + * save some space. + */ + if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) && + hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_TRUE); } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); @@ -2703,16 +3170,17 @@ arc_buf_destroy_impl(arc_buf_t *buf) * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be - * wasted temporarily. + * wasted temporarily. We must also be careful not to share + * encrypted buffers, since they cannot be shared. */ - if (lastbuf != NULL) { + if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - arc_hdr_free_pabd(hdr); + arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the @@ -2733,7 +3201,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* @@ -2750,26 +3218,43 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) +arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) { + uint64_t size; + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); + IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + if (alloc_rdata) { + size = HDR_GET_PSIZE(hdr); + ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); + hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr); + ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); + ARCSTAT_INCR(arcstat_raw_size, size); + } else { + size = arc_hdr_size(hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + } + + ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_hdr_free_pabd(arc_buf_hdr_t *hdr) +arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { + uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then @@ -2778,28 +3263,42 @@ arc_hdr_free_pabd(arc_buf_hdr_t *hdr) * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { - arc_hdr_free_on_write(hdr); + arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else if (free_rdata) { + arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { - arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, - arc_hdr_size(hdr), hdr); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } - hdr->b_l1hdr.b_pabd = NULL; - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + if (free_rdata) { + hdr->b_crypt_hdr.b_rabd = NULL; + ARCSTAT_INCR(arcstat_raw_size, -size); + } else { + hdr->b_l1hdr.b_pabd = NULL; + } + + if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compression_type, arc_buf_contents_t type) + boolean_t protected, enum zio_compress compression_type, + arc_buf_contents_t type, boolean_t alloc_rdata) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + if (protected) { + hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); + } else { + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + } - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); HDR_SET_PSIZE(hdr, psize); @@ -2809,6 +3308,8 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; @@ -2820,7 +3321,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ - arc_hdr_alloc_pabd(hdr); + arc_hdr_alloc_abd(hdr, alloc_rdata); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -2843,6 +3344,16 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); + /* + * if the caller wanted a new full header and the header is to be + * encrypted we will actually allocate the header from the full crypt + * cache instead. The same applies to freeing from the old cache. + */ + if (HDR_PROTECTED(hdr) && new == hdr_full_cache) + new = hdr_full_crypt_cache; + if (HDR_PROTECTED(hdr) && old == hdr_full_cache) + old = hdr_full_crypt_cache; + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); @@ -2850,7 +3361,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache) { + if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a @@ -2861,6 +3372,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); @@ -2883,6 +3395,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } @@ -2925,6 +3438,111 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) } /* + * This function allows an L1 header to be reallocated as a crypt + * header and vice versa. If we are going to a crypt header, the + * new fields will be zeroed out. + */ +static arc_buf_hdr_t * +arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) +{ + arc_buf_hdr_t *nhdr; + arc_buf_t *buf; + kmem_cache_t *ncache, *ocache; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + if (need_crypt) { + ncache = hdr_full_crypt_cache; + ocache = hdr_full_cache; + } else { + ncache = hdr_full_cache; + ocache = hdr_full_crypt_cache; + } + + nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; + nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; + nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; + nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; + nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; + nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; + nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits; + nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; + nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; + nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; + + /* + * This refcount_add() exists only to ensure that the individual + * arc buffers always point to a header that is referenced, avoiding + * a small race condition that could trigger ASSERTs. + */ + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); + + for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + mutex_enter(&buf->b_evict_lock); + buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); + } + + refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); + (void) refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); + + if (need_crypt) { + arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); + } else { + arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); + } + + buf_discard_identity(hdr); + kmem_cache_free(ocache, hdr); + + return (nhdr); +} + +/* + * This function is used by the send / receive code to convert a newly + * allocated arc_buf_t to one that is suitable for a raw encrypted write. It + * is also used to allow the root objset block to be uupdated without altering + * its embedded MACs. Both block types will always be uncompressed so we do not + * have to worry about compression type or psize. + */ +void +arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, + dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + + buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); + if (!HDR_PROTECTED(hdr)) + hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + hdr->b_crypt_hdr.b_dsobj = dsobj; + hdr->b_crypt_hdr.b_ot = ot; + hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? + DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); + if (!arc_hdr_has_uncompressed_buf(hdr)) + arc_cksum_free(hdr); + + if (salt != NULL) + bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + if (iv != NULL) + bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + if (mac != NULL) + bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); +} + +/* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ @@ -2932,11 +3550,12 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - ZIO_COMPRESS_OFF, type); + B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); + VERIFY0(arc_buf_alloc_impl(hdr, spa, 0, tag, B_FALSE, B_FALSE, + B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); @@ -2952,33 +3571,76 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); - ASSERT(compression_type > ZIO_COMPRESS_OFF); - ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); + ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); + ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - compression_type, ARC_BUFC_DATA); + B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); + VERIFY0(arc_buf_alloc_impl(hdr, spa, 0, tag, B_FALSE, + B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); if (!arc_buf_is_shared(buf)) { /* * To ensure that the hdr has the correct data in it if we call - * arc_decompress() on this buf before it's been written to + * arc_untransform() on this buf before it's been written to * disk, it's easiest if we just set up sharing between the * buf and the hdr. */ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); - arc_hdr_free_pabd(hdr); + arc_hdr_free_abd(hdr, B_FALSE); arc_share_buf(hdr, buf); } return (buf); } +arc_buf_t * +arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? + ARC_BUFC_METADATA : ARC_BUFC_DATA; + + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); + ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); + + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, + compression_type, type, B_TRUE); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + hdr->b_crypt_hdr.b_dsobj = dsobj; + hdr->b_crypt_hdr.b_ot = ot; + hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? + DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); + bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + + /* + * This buffer will be considered encrypted even if the ot is not an + * encrypted type. It will become authenticated instead in + * arc_write_ready(). + */ + buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, spa, dsobj, tag, B_TRUE, B_TRUE, + B_FALSE, B_FALSE, &buf)); + arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + return (buf); +} + static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { @@ -3044,15 +3706,25 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - if (hdr->b_l1hdr.b_pabd != NULL) - arc_hdr_free_pabd(hdr); + if (hdr->b_l1hdr.b_pabd != NULL) { + arc_hdr_free_abd(hdr, B_FALSE); + } + + if (HDR_HAS_RABD(hdr)) { + arc_hdr_free_abd(hdr, B_TRUE); + } } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - kmem_cache_free(hdr_full_cache, hdr); + + if (!HDR_PROTECTED(hdr)) { + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_full_crypt_cache, hdr); + } } else { kmem_cache_free(hdr_l2only_cache, hdr); } @@ -3129,6 +3801,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); + ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -3195,7 +3868,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ - arc_hdr_free_pabd(hdr); + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_FALSE); + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); @@ -4876,22 +5553,22 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void -arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg) { - if (zio == NULL || zio->io_error == 0) + if (error == 0) bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } -/* a generic arc_done_func_t */ +/* a generic arc_read_done_func_t */ void -arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; - if (zio && zio->io_error) { + if (error != 0) { arc_buf_destroy(buf, arg); *bufp = NULL; } else { @@ -4905,20 +5582,22 @@ arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { + blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; @@ -4951,6 +5630,26 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } + if (BP_IS_PROTECTED(bp)) { + hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); + hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; + zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv); + + if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { + void *tmpbuf; + + tmpbuf = abd_borrow_buf_copy(zio->io_abd, + sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmpbuf, + hdr->b_crypt_hdr.b_mac); + abd_return_buf(zio->io_abd, tmpbuf, + sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } + } + if (no_zio_error) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { @@ -4996,8 +5695,33 @@ arc_read_done(zio_t *zio) /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, no_zio_error, &acb->acb_buf); + int error = arc_buf_alloc_impl(hdr, zio->io_spa, + zio->io_bookmark.zb_objset, acb->acb_private, + acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, + no_zio_error, &acb->acb_buf); + + /* + * assert non-speculative zios didn't fail because an + * encryption key wasn't loaded + */ + ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || + error == 0 || error != ENOENT); + + /* + * If we failed to decrypt, report an error now (as the zio + * layer would have done if it had done the transforms). + */ + if (error == ECKSUM) { + ASSERT(BP_IS_PROTECTED(bp)); + error = SET_ERROR(EIO); + spa_log_error(zio->io_spa, &zio->io_bookmark); + if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + zio->io_spa, NULL, &zio->io_bookmark, zio, + 0, 0); + } + } + if (no_zio_error) { zio->io_error = error; } @@ -5005,9 +5729,8 @@ arc_read_done(zio_t *zio) hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) { - ASSERT(HDR_PREFETCH(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT(HDR_PREFETCH(hdr) || HDR_HAS_RABD(hdr)); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || @@ -5046,8 +5769,10 @@ arc_read_done(zio_t *zio) /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { - if (acb->acb_done) - acb->acb_done(zio, acb->acb_buf, acb->acb_private); + if (acb->acb_done) { + acb->acb_done(zio, zio->io_error, acb->acb_buf, + acb->acb_private); + } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; @@ -5081,15 +5806,19 @@ arc_read_done(zio_t *zio) * for readers of this block. */ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, - arc_flags_t *arc_flags, const zbookmark_phys_t *zb) +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + arc_read_done_func_t *done, void *private, zio_priority_t priority, + int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); - boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; + boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; + boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && + (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; + boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && + (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || @@ -5104,7 +5833,15 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { + /* + * Determine if we have an L1 cache hit or a cache miss. For simplicity + * we maintain encrypted data seperately from compressed / uncompressed + * data. If the user is requesting raw encrypted data and we don't have + * that in the header we will read from disk to guarantee that we can + * get it even if the encryption keys aren't loaded. + */ + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || + (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; @@ -5191,8 +5928,12 @@ top: ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ - VERIFY0(arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf)); + rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset, + private, encrypted_read, compressed_read, + noauth_read, B_TRUE, &buf); + + ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || + rc == 0 || rc != ENOENT); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); @@ -5208,7 +5949,7 @@ top: data, metadata, hits); if (done) - done(NULL, buf, private); + done(NULL, rc, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -5217,6 +5958,7 @@ top: uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; + void *hdr_abd; /* * Gracefully handle a damaged logical block size as a @@ -5232,7 +5974,8 @@ top: arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_GET_COMPRESS(bp), type); + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type, + encrypted_read); if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); @@ -5248,26 +5991,42 @@ top: } } else { /* - * This block is in the ghost cache. If it was L2-only - * (and thus didn't have an L1 hdr), we realloc the - * header to add an L1 hdr. + * This block is in the ghost cache or encrypted data + * was requested and we didn't have it. If it was + * L2-only (and thus didn't have an L1 hdr), + * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + if (GHOST_STATE(hdr->b_l1hdr.b_state)) { + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + /* + * If this header already had an IO in progress + * and we are performing another IO to fetch + * encrypted data we must wait until the first + * IO completes so as not to confuse + * arc_read_done(). This should be very rare + * and so the performance impact shouldn't + * matter. + */ + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } /* * This is a delicate dance that we play here. - * This hdr is in the ghost list so we access it - * to move it out of the ghost list before we + * This hdr might be in the ghost list so we access + * it to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We @@ -5275,25 +6034,41 @@ top: * avoid hitting an assert in remove_reference(). */ arc_access(hdr, hash_lock); - arc_hdr_alloc_pabd(hdr); + arc_hdr_alloc_abd(hdr, encrypted_read); } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - size = arc_hdr_size(hdr); - /* - * If compression is enabled on the hdr, then will do - * RAW I/O and will store the compressed data in the hdr's - * data block. Otherwise, the hdr's data block will contain - * the uncompressed data. - */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + if (encrypted_read) { + ASSERT(HDR_HAS_RABD(hdr)); + size = HDR_GET_PSIZE(hdr); + hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; + } else { + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + size = arc_hdr_size(hdr); + hdr_abd = hdr->b_l1hdr.b_pabd; + + if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } + + /* + * For authenticated bp's, we do not ask the ZIO layer + * to authenticate them since this will cause the entire + * IO to fail if the key isn't loaded. Instead, we + * defer authentication until arc_buf_fill(), which will + * verify the data when the key is available. + */ + if (BP_IS_AUTHENTICATED(bp)) + zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH) + if (*arc_flags & ARC_FLAG_PREFETCH && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_IS_AUTHENTICATED(bp)) + arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) @@ -5304,6 +6079,8 @@ top: acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; + acb->acb_encrypted = encrypted_read; + acb->acb_noauth = noauth_read; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; @@ -5376,7 +6153,7 @@ top: HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { - abd = hdr->b_l1hdr.b_pabd; + abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && @@ -5389,7 +6166,7 @@ top: * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ - ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, @@ -5402,7 +6179,8 @@ top: DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, size); + ARCSTAT_INCR(arcstat_l2_read_bytes, + HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -5432,7 +6210,7 @@ top: } } - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, + rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) { @@ -5626,7 +6404,8 @@ arc_release(arc_buf_t *buf, void *tag) uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); - enum zio_compress compress = HDR_GET_COMPRESS(hdr); + boolean_t protected = HDR_PROTECTED(hdr); + enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); @@ -5669,7 +6448,7 @@ arc_release(arc_buf_t *buf, void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pabd(hdr); + arc_hdr_alloc_abd(hdr, B_FALSE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -5684,10 +6463,11 @@ arc_release(arc_buf_t *buf, void *tag) * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, @@ -5700,6 +6480,9 @@ arc_release(arc_buf_t *buf, void *tag) } hdr->b_l1hdr.b_bufcnt -= 1; + if (ARC_BUF_ENCRYPTED(buf)) + hdr->b_crypt_hdr.b_ebufcnt -= 1; + arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -5713,7 +6496,8 @@ arc_release(arc_buf_t *buf, void *tag) * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ - nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + nhdr = arc_hdr_alloc(spa, psize, lsize, protected, + compress, type, HDR_HAS_RABD(hdr)); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -5722,6 +6506,8 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; + if (ARC_BUF_ENCRYPTED(buf)) + nhdr->b_crypt_hdr.b_ebufcnt = 1; nhdr->b_l1hdr.b_mru_hits = 0; nhdr->b_l1hdr.b_mru_ghost_hits = 0; nhdr->b_l1hdr.b_mfu_hits = 0; @@ -5746,8 +6532,8 @@ arc_release(arc_buf_t *buf, void *tag) hdr->b_l1hdr.b_l2_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; - mutex_exit(hash_lock); + mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } @@ -5784,7 +6570,8 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); + blkptr_t *bp = zio->io_bp; + uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); enum zio_compress compress; fstrans_cookie_t cookie = spl_fstrans_mark(); @@ -5804,11 +6591,15 @@ arc_write_ready(zio_t *zio) if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pabd(hdr); + arc_hdr_free_abd(hdr, B_FALSE); } } + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -5817,21 +6608,51 @@ arc_write_ready(zio_t *zio) if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - arc_cksum_compute(buf); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) + hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); + + if (BP_IS_PROTECTED(bp)) { + /* ZIL blocks are written through zio_rewrite */ + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); + ASSERT(HDR_PROTECTED(hdr)); + + hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); + hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; + zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv); + zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } + + /* + * If this block was written for raw encryption but the zio layer + * ended up only authenticating it, adjust the buffer flags now. + */ + if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { + arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); + buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; + if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + } + + /* this must be done after the buffer flags are adjusted */ + arc_cksum_compute(buf); + + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); - compress = BP_GET_COMPRESS(zio->io_bp); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); /* - * Fill the hdr with data. If the hdr is compressed, the data we want - * is available from the zio, otherwise we can take it from the buf. + * Fill the hdr with data. If the buffer is encrypted we have no choice + * but to copy the data into b_radb. If the hdr is compressed, the data + * we want is available from the zio, otherwise we can take it from + * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a @@ -5841,23 +6662,28 @@ arc_write_ready(zio_t *zio) * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ - if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { - arc_hdr_alloc_pabd(hdr); - + if (ARC_BUF_ENCRYPTED(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + arc_hdr_alloc_abd(hdr, B_TRUE); + abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); + } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, - ZIO_COMPRESS_OFF); + if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - + arc_hdr_alloc_abd(hdr, B_TRUE); + abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); + } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && + !ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(psize, >, 0); + arc_hdr_alloc_abd(hdr, B_FALSE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - + arc_hdr_alloc_abd(hdr, B_FALSE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -5869,7 +6695,7 @@ arc_write_ready(zio_t *zio) arc_share_buf(hdr, buf); } - arc_hdr_verify(hdr, zio->io_bp); + arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } @@ -5981,9 +6807,9 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, - const zio_prop_t *zp, arc_done_func_t *ready, - arc_done_func_t *children_ready, arc_done_func_t *physdone, - arc_done_func_t *done, void *private, zio_priority_t priority, + const zio_prop_t *zp, arc_write_done_func_t *ready, + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -5999,16 +6825,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (ARC_BUF_COMPRESSED(buf)) { - /* - * We're writing a pre-compressed buffer. Make the - * compression algorithm requested by the zio_prop_t match - * the pre-compressed buffer's compression algorithm. - */ - localprop.zp_compress = HDR_GET_COMPRESS(hdr); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + if (ARC_BUF_ENCRYPTED(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + localprop.zp_encrypt = B_TRUE; + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_byteorder = + (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? + ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; + bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, + ZIO_DATA_SALT_LEN); + bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { + localprop.zp_nopwrite = B_FALSE; + localprop.zp_copies = + MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); + } zio_flags |= ZIO_FLAG_RAW; + } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; @@ -6032,11 +6872,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pabd(hdr); + arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); - arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); + + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); @@ -7037,6 +7882,102 @@ top: kmem_free(cb, sizeof (l2arc_write_callback_t)); } +static int +l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) +{ + int ret; + spa_t *spa = zio->io_spa; + arc_buf_hdr_t *hdr = cb->l2rcb_hdr; + blkptr_t *bp = zio->io_bp; + dsl_crypto_key_t *dck = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + /* + * ZIL data is never be written to the L2ARC, so we don't need + * special handling for its unique MAC storage. + */ + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + + /* If the data was encrypted, decrypt it now */ + if (HDR_ENCRYPTED(hdr)) { + abd_t *eabd = arc_get_data_abd(hdr, + arc_hdr_size(hdr), hdr); + + zio_crypt_decode_params_bp(bp, salt, iv); + zio_crypt_decode_mac_bp(bp, mac); + + ret = spa_keystore_lookup_key(spa, + cb->l2rcb_zb.zb_objset, FTAG, &dck); + if (ret != 0) { + arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); + goto error; + } + + ret = zio_do_crypt_abd(B_FALSE, &dck->dck_key, + salt, BP_GET_TYPE(bp), iv, mac, HDR_GET_PSIZE(hdr), + BP_SHOULD_BYTESWAP(bp), eabd, hdr->b_l1hdr.b_pabd, + &no_crypt); + if (ret != 0) { + arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); + spa_keystore_dsl_key_rele(spa, dck, FTAG); + goto error; + } + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + /* + * If we actually performed decryption, replace b_pabd + * with the decrypted data. Otherwise we can just throw + * our decryption buffer away. + */ + if (!no_crypt) { + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = eabd; + zio->io_abd = eabd; + } else { + arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); + } + } + + /* + * If the L2ARC block was compressed, but ARC compression + * is disabled we decompress the data into a new buffer and + * replace the existing data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); + + ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (ret != 0) { + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); + goto error; + } + + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = cabd; + zio->io_abd = cabd; + zio->io_size = HDR_GET_LSIZE(hdr); + } + + return (0); + +error: + return (ret); +} + + /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. @@ -7044,10 +7985,11 @@ top: static void l2arc_read_done(zio_t *zio) { + int tfm_error = 0; l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - boolean_t valid_cksum; + boolean_t valid_cksum, using_rdata; ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); @@ -7095,12 +8037,25 @@ l2arc_read_done(zio_t *zio) /* * Check this survived the L2ARC journey. */ - ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); + ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || + (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ valid_cksum = arc_cksum_is_equal(hdr, zio); - if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + using_rdata = (HDR_HAS_RABD(hdr) && + zio->io_abd == hdr->b_crypt_hdr.b_rabd); + + /* + * b_rabd will always match the data as it exists on disk if it is + * being used. Therefore if we are reading into b_rabd we do not + * attempt to untransform the data. + */ + if (valid_cksum && !using_rdata) + tfm_error = l2arc_untransform(zio, cb); + + if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && + !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); @@ -7115,7 +8070,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!valid_cksum) + if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -7125,11 +8080,13 @@ l2arc_read_done(zio_t *zio) */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); + void *abd = (using_rdata) ? + hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, + abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } @@ -7291,6 +8248,117 @@ top: } /* + * Handle any abd transforms that might be required for writing to the L2ARC. + * If successful, this function will always return an abd with the data + * transformed as it is on disk in a new abd of asize bytes. + */ +static int +l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, + abd_t **abd_out) +{ + int ret; + void *tmp = NULL; + abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; + enum zio_compress compress = HDR_GET_COMPRESS(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t size = arc_hdr_size(hdr); + boolean_t ismd = HDR_ISTYPE_METADATA(hdr); + boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + dsl_crypto_key_t *dck = NULL; + uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; + boolean_t no_crypt; + + ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) || + HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); + ASSERT3U(psize, <=, asize); + + /* + * If this data simply needs its own buffer, we simply allocate it + * and copy the data. This may be done to elimiate a depedency on a + * shared buffer or to reallocate the buffer to match asize. + */ + if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && + !HDR_ENCRYPTED(hdr)) { + ASSERT3U(size, ==, psize); + to_write = abd_alloc_for_io(asize, ismd); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + if (size != asize) + abd_zero_off(to_write, size, asize - size); + goto out; + } + + if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { + cabd = abd_alloc_for_io(asize, ismd); + tmp = abd_borrow_buf(cabd, asize); + + psize = zio_compress_data(compress, to_write, tmp, size); + ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); + if (psize < asize) + bzero((char *)tmp + psize, asize - psize); + psize = HDR_GET_PSIZE(hdr); + abd_return_buf_copy(cabd, tmp, asize); + to_write = cabd; + } + + if (HDR_ENCRYPTED(hdr)) { + eabd = abd_alloc_for_io(asize, ismd); + + /* + * If the dataset was disowned before the buffer + * made it to this point, the key to re-encrypt + * it won't be available. In this case we simply + * won't write the buffer to the L2ARC. + */ + ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, + FTAG, &dck); + if (ret != 0) + goto error; + + ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_ot, + hdr->b_crypt_hdr.b_iv, mac, psize, bswap, to_write, + eabd, &no_crypt); + if (ret != 0) + goto error; + + if (no_crypt) { + spa_keystore_dsl_key_rele(spa, dck, FTAG); + abd_free(eabd); + goto out; + } + + if (psize != asize) + abd_zero_off(eabd, psize, asize - psize); + + /* assert that the MAC we got here matches the one we saved */ + ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + if (to_write == cabd) + abd_free(cabd); + + to_write = eabd; + } + +out: + ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); + *abd_out = to_write; + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + if (cabd != NULL) + abd_free(cabd); + if (eabd != NULL) + abd_free(eabd); + + *abd_out = NULL; + return (ret); +} + +/* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid @@ -7346,6 +8414,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; + abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -7383,9 +8452,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); - uint64_t psize = arc_hdr_size(hdr); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); @@ -7395,6 +8465,57 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. + */ + arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + ASSERT3U(arc_hdr_size(hdr), >, 0); + + /* + * If this header has b_rabd, we can use this since it + * must always match the data exactly as it exists on + * disk. Otherwise, the L2ARC can normally use the + * hdr's data, but if we're sharing data between the + * hdr and one of its bufs, L2ARC needs its own copy of + * the data so that the ZIO below can't race with the + * buf consumer. To ensure that this copy will be + * available for the lifetime of the ZIO and be cleaned + * up afterwards, we add it to the l2arc_free_on_write + * queue. If we need to apply any transforms to the + * data (compression, encryption) we will also need the + * extra buffer. + */ + if (HDR_HAS_RABD(hdr) && psize == asize) { + to_write = hdr->b_crypt_hdr.b_rabd; + } else if ((HDR_COMPRESSION_ENABLED(hdr) || + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && + !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && + psize == asize) { + to_write = hdr->b_l1hdr.b_pabd; + } else { + int ret; + arc_buf_contents_t type = arc_buf_type(hdr); + + ret = l2arc_apply_transforms(spa, hdr, asize, + &to_write); + if (ret != 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_L2_WRITING); + mutex_exit(hash_lock); + continue; + } + + l2arc_free_abd_on_write(to_write, asize, type); + } + if (pio == NULL) { /* * Insert a dummy header on the buflist so @@ -7417,43 +8538,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - arc_hdr_set_flags(hdr, - ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); - (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr); + (void) refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); - /* - * Normally the L2ARC can use the hdr's data, but if - * we're sharing data between the hdr and one of its - * bufs, L2ARC needs its own copy of the data so that - * the ZIO below can't race with the buf consumer. - * Another case where we need to create a copy of the - * data is when the buffer size is not device-aligned - * and we need to pad the block to make it such. - * That also keeps the clock hand suitably aligned. - * - * To ensure that the copy will be available for the - * lifetime of the ZIO and be cleaned up afterwards, we - * add it to the l2arc_free_on_write queue. - */ - abd_t *to_write; - if (!HDR_SHARED_DATA(hdr) && psize == asize) { - to_write = hdr->b_l1hdr.b_pabd; - } else { - to_write = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (asize != psize) { - abd_zero_off(to_write, psize, - asize - psize); - } - l2arc_free_abd_on_write(to_write, asize, - arc_buf_type(hdr)); - } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, |