Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/Makefile.in     |    2
-rw-r--r--  module/zfs/arc.c           | 1611
-rw-r--r--  module/zfs/bptree.c        |    3
-rw-r--r--  module/zfs/dbuf.c          |  209
-rw-r--r--  module/zfs/ddt.c           |   23
-rw-r--r--  module/zfs/dmu.c           |  268
-rw-r--r--  module/zfs/dmu_objset.c    |  296
-rw-r--r--  module/zfs/dmu_send.c      |  853
-rw-r--r--  module/zfs/dmu_traverse.c  |   43
-rw-r--r--  module/zfs/dnode.c         |  111
-rw-r--r--  module/zfs/dnode_sync.c    |   13
-rw-r--r--  module/zfs/dsl_crypt.c     | 2611
-rw-r--r--  module/zfs/dsl_dataset.c   |  123
-rw-r--r--  module/zfs/dsl_destroy.c   |   14
-rw-r--r--  module/zfs/dsl_dir.c       |   43
-rw-r--r--  module/zfs/dsl_pool.c      |   19
-rw-r--r--  module/zfs/dsl_prop.c      |    3
-rw-r--r--  module/zfs/dsl_scan.c      |   17
-rw-r--r--  module/zfs/spa.c           |   83
-rw-r--r--  module/zfs/spa_config.c    |    2
-rw-r--r--  module/zfs/spa_errlog.c    |    3
-rw-r--r--  module/zfs/spa_history.c   |    7
-rw-r--r--  module/zfs/spa_misc.c      |   12
-rw-r--r--  module/zfs/vdev.c          |    9
-rw-r--r--  module/zfs/vdev_raidz.c    |    9
-rw-r--r--  module/zfs/zfeature.c      |    4
-rw-r--r--  module/zfs/zfs_acl.c       |    2
-rw-r--r--  module/zfs/zfs_fm.c        |   54
-rw-r--r--  module/zfs/zfs_ioctl.c     |  268
-rw-r--r--  module/zfs/zfs_vfsops.c    |   12
-rw-r--r--  module/zfs/zil.c           |   66
-rw-r--r--  module/zfs/zio.c           |  355
-rw-r--r--  module/zfs/zio_checksum.c  |   65
-rw-r--r--  module/zfs/zio_crypt.c     | 2037
-rw-r--r--  module/zfs/zvol.c          |   20
35 files changed, 8410 insertions, 860 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index b849d9010..72f28a89d 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -33,6 +33,7 @@ $(MODULE)-objs += dsl_deadlist.o
$(MODULE)-objs += dsl_deleg.o
$(MODULE)-objs += dsl_bookmark.o
$(MODULE)-objs += dsl_dir.o
+$(MODULE)-objs += dsl_crypt.o
$(MODULE)-objs += dsl_pool.o
$(MODULE)-objs += dsl_prop.o
$(MODULE)-objs += dsl_scan.o
@@ -103,6 +104,7 @@ $(MODULE)-objs += zil.o
$(MODULE)-objs += zio.o
$(MODULE)-objs += zio_checksum.o
$(MODULE)-objs += zio_compress.o
+$(MODULE)-objs += zio_crypt.o
$(MODULE)-objs += zio_inject.o
$(MODULE)-objs += zle.o
$(MODULE)-objs += zpl_ctldir.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 157a28d4b..d7ad101c3 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -258,6 +258,21 @@
* ARC is disabled, then the L2ARC's block must be transformed to look
* like the physical block in the main data pool before comparing the
* checksum and determining its validity.
+ *
+ * The L1ARC has a slightly different system for storing encrypted data.
+ * Raw (encrypted + possibly compressed) data has a few subtle differences from
+ * data that is just compressed. The biggest difference is that it is not
+ * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
+ * The other difference is that encryption cannot be treated as a suggestion.
+ * If a caller would prefer compressed data, but they actually wind up with
+ * uncompressed data, the worst thing that could happen is a
+ * performance hit. If the caller requests encrypted data, however, we must be
+ * sure they actually get it or else secret information could be leaked. Raw
+ * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
+ * may have both an encrypted version and a decrypted version of its data at
+ * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
+ * copied out of this header. To avoid complications with b_pabd, raw buffers
+ * cannot be shared.
*/
#include <sys/spa.h>
@@ -274,6 +289,8 @@
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -645,6 +662,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_demand_hit_predictive_prefetch;
kstat_named_t arcstat_need_free;
kstat_named_t arcstat_sys_free;
+ kstat_named_t arcstat_raw_size;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -739,7 +757,8 @@ static arc_stats_t arc_stats = {
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
- { "arc_sys_free", KSTAT_DATA_UINT64 }
+ { "arc_sys_free", KSTAT_DATA_UINT64 },
+ { "arc_raw_size", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -815,6 +834,8 @@ static arc_state_t *arc_l2c_only;
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
+/* size of all b_rabd's in entire arc */
+#define arc_raw_size ARCSTAT(arcstat_raw_size)
/* compressed size of entire arc */
#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
@@ -844,6 +865,8 @@ static taskq_t *arc_prune_taskq;
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED)
+#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH)
#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
#define HDR_ISTYPE_METADATA(hdr) \
@@ -852,6 +875,13 @@ static taskq_t *arc_prune_taskq;
#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+#define HDR_HAS_RABD(hdr) \
+ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \
+ (hdr)->b_crypt_hdr.b_rabd != NULL)
+#define HDR_ENCRYPTED(hdr) \
+ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
+#define HDR_AUTHENTICATED(hdr) \
+ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
/* For storing compression mode in b_flags */
#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
@@ -864,12 +894,14 @@ static taskq_t *arc_prune_taskq;
#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
+#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
/*
* Other sizes
*/
-#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
@@ -967,6 +999,14 @@ typedef struct l2arc_data_free {
list_node_t l2df_list_node;
} l2arc_data_free_t;
+typedef enum arc_fill_flags {
+ ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
+ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
+ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */
+ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */
+ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
+} arc_fill_flags_t;
+
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
@@ -977,8 +1017,8 @@ static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
-static void arc_hdr_free_pabd(arc_buf_hdr_t *);
-static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
+static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
+static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
@@ -1130,7 +1170,9 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
/*
* Global data structures and functions for the buf kmem cache.
*/
+
static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_full_crypt_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
@@ -1153,6 +1195,7 @@ buf_fini(void)
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_full_crypt_cache);
kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@@ -1181,6 +1224,19 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
/* ARGSUSED */
static int
+hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ hdr_full_cons(vbuf, unused, kmflag);
+ bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
+ arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_hdr_t *hdr = vbuf;
@@ -1224,6 +1280,16 @@ hdr_full_dest(void *vbuf, void *unused)
/* ARGSUSED */
static void
+hdr_full_crypt_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ hdr_full_dest(vbuf, unused);
+ arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
hdr_l2only_dest(void *vbuf, void *unused)
{
ASSERTV(arc_buf_hdr_t *hdr = vbuf);
@@ -1294,6 +1360,9 @@ retry:
hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+ hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
+ HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
+ hdr_recl, NULL, NULL, 0);
hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
NULL, NULL, 0);
@@ -1330,6 +1399,46 @@ arc_buf_lsize(arc_buf_t *buf)
return (HDR_GET_LSIZE(buf->b_hdr));
}
+/*
+ * This function will return B_TRUE if the buffer is encrypted in memory.
+ * This buffer can be decrypted by calling arc_untransform().
+ */
+boolean_t
+arc_is_encrypted(arc_buf_t *buf)
+{
+ return (ARC_BUF_ENCRYPTED(buf) != 0);
+}
+
+/*
+ * Returns B_TRUE if the buffer represents data that has not had its MAC
+ * verified yet.
+ */
+boolean_t
+arc_is_unauthenticated(arc_buf_t *buf)
+{
+ return (HDR_NOAUTH(buf->b_hdr) != 0);
+}
+
+void
+arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
+ uint8_t *iv, uint8_t *mac)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(HDR_PROTECTED(hdr));
+
+ bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
+ bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
+ bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
+ *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
+ ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
+}
+
+/*
+ * Indicates how this buffer is compressed in memory. If it is not compressed
+ * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
+ * arc_untransform() as long as it is also unencrypted.
+ */
enum zio_compress
arc_get_compression(arc_buf_t *buf)
{
@@ -1337,6 +1446,18 @@ arc_get_compression(arc_buf_t *buf)
HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
}
+/*
+ * Return the compression algorithm used to store this data in the ARC. If ARC
+ * compression is enabled or this is an encrypted block, this will be the same
+ * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
+ */
+static inline enum zio_compress
+arc_hdr_get_compress(arc_buf_hdr_t *hdr)
+{
+ return (HDR_COMPRESSION_ENABLED(hdr) ?
+ HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
+}
+
static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
@@ -1364,6 +1485,7 @@ static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
ASSERT(HDR_HAS_L1HDR(hdr));
+
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
@@ -1374,6 +1496,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
/*
* Return true iff at least one of the bufs on hdr is not compressed.
+ * Encrypted buffers count as compressed.
*/
static boolean_t
arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
@@ -1421,58 +1544,18 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
+/*
+ * This function makes the assumption that data stored in the L2ARC
+ * will be transformed exactly as it is in the main pool. Because of
+ * this we can verify the checksum against the reading process's bp.
+ */
static boolean_t
arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
{
- enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
- boolean_t valid_cksum;
-
ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
/*
- * We rely on the blkptr's checksum to determine if the block
- * is valid or not. When compressed arc is enabled, the l2arc
- * writes the block to the l2arc just as it appears in the pool.
- * This allows us to use the blkptr's checksum to validate the
- * data that we just read off of the l2arc without having to store
- * a separate checksum in the arc_buf_hdr_t. However, if compressed
- * arc is disabled, then the data written to the l2arc is always
- * uncompressed and won't match the block as it exists in the main
- * pool. When this is the case, we must first compress it if it is
- * compressed on the main pool before we can validate the checksum.
- */
- if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
- uint64_t lsize;
- uint64_t csize;
- void *cbuf;
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
-
- cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
- lsize = HDR_GET_LSIZE(hdr);
- csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
-
- ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
- if (csize < HDR_GET_PSIZE(hdr)) {
- /*
- * Compressed blocks are always a multiple of the
- * smallest ashift in the pool. Ideally, we would
- * like to round up the csize to the next
- * spa_min_ashift but that value may have changed
- * since the block was last written. Instead,
- * we rely on the fact that the hdr's psize
- * was set to the psize of the block when it was
- * last written. We set the csize to that value
- * and zero out any part that should not contain
- * data.
- */
- bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
- csize = HDR_GET_PSIZE(hdr);
- }
- zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
- }
-
- /*
* Block pointers always store the checksum for the logical data.
* If the block pointer has the gang bit set, then the checksum
* it represents is for the reconstituted data and not for an
@@ -1485,11 +1568,9 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
* generated using the correct checksum algorithm and accounts for the
* logical I/O size and not just a gang fragment.
*/
- valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
zio->io_offset, NULL) == 0);
- zio_pop_transforms(zio);
- return (valid_cksum);
}
/*
@@ -1518,6 +1599,7 @@ arc_cksum_compute(arc_buf_t *buf)
return;
}
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
ASSERT(!ARC_BUF_COMPRESSED(buf));
hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
KM_SLEEP);
@@ -1684,15 +1766,14 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
*/
if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
- HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
- HDR_SET_COMPRESS(hdr, cmp);
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
ASSERT(HDR_COMPRESSION_ENABLED(hdr));
}
+
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
}
/*
@@ -1733,6 +1814,254 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
}
/*
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+ uint64_t size;
+
+ if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
+ HDR_GET_PSIZE(hdr) > 0) {
+ size = HDR_GET_PSIZE(hdr);
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+ size = HDR_GET_LSIZE(hdr);
+ }
+ return (size);
+}
+
+static int
+arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
+{
+ int ret;
+ uint64_t csize;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ void *tmpbuf = NULL;
+ abd_t *abd = hdr->b_l1hdr.b_pabd;
+
+ ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr)));
+ ASSERT(HDR_AUTHENTICATED(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ /*
+ * The MAC is calculated on the compressed data that is stored on disk.
+ * However, if compressed arc is disabled we will only have the
+ * decompressed data available to us now. Compress it into a temporary
+ * abd so we can verify the MAC. The performance overhead of this will
+ * be relatively low, since most objects in an encrypted objset will
+ * be encrypted (instead of authenticated) anyway.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ tmpbuf = zio_buf_alloc(lsize);
+ abd = abd_get_from_buf(tmpbuf, lsize);
+ abd_take_ownership_of_buf(abd, B_TRUE);
+
+ csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmpbuf, lsize);
+ ASSERT3U(csize, <=, psize);
+ abd_zero_off(abd, csize, psize - csize);
+ }
+
+ /*
+ * Authentication is best effort. We authenticate whenever the key is
+ * available. If we succeed we clear ARC_FLAG_NOAUTH.
+ */
+ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, psize);
+ ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
+ psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+ } else {
+ ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
+ hdr->b_crypt_hdr.b_mac);
+ }
+
+ if (ret == 0)
+ arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
+ else if (ret != ENOENT)
+ goto error;
+
+ if (tmpbuf != NULL)
+ abd_free(abd);
+
+ return (0);
+
+error:
+ if (tmpbuf != NULL)
+ abd_free(abd);
+
+ return (ret);
+}
+
+/*
+ * This function will take a header that only has raw encrypted data in
+ * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
+ * b_l1hdr.b_pabd. If designated in the header flags, this function will
+ * also decompress the data.
+ */
+static int
+arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ abd_t *cabd = NULL;
+ void *tmp = NULL;
+ boolean_t no_crypt = B_FALSE;
+ boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+
+ ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr)));
+ ASSERT(HDR_ENCRYPTED(hdr));
+
+ arc_hdr_alloc_abd(hdr, B_FALSE);
+
+ /*
+ * We must be careful to use the passed-in dsobj value here and
+ * not the value in b_dsobj. b_dsobj is meant to be a best guess for
+ * the L2ARC, which has the luxury of being able to fail without real
+ * consequences (the data simply won't make it to the L2ARC). In
+ * reality, the dsobj stored in the header may belong to a dataset
+ * that has been unmounted or otherwise disowned, meaning the key
+ * won't be accessible via that dsobj anymore.
+ */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+
+ ret = zio_do_crypt_abd(B_FALSE, &dck->dck_key,
+ hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_ot,
+ hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac,
+ HDR_GET_PSIZE(hdr), bswap, hdr->b_l1hdr.b_pabd,
+ hdr->b_crypt_hdr.b_rabd, &no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (no_crypt) {
+ abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
+ HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * If this header has disabled arc compression but the b_pabd is
+ * compressed after decrypting it, we need to decompress the newly
+ * decrypted data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ /*
+ * We want to make sure that we are correctly honoring the
+ * zfs_abd_scatter_enabled setting, so we allocate an abd here
+ * and then loan a buffer from it, rather than allocating a
+ * linear buffer and wrapping it in an abd later.
+ */
+ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
+ tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
+
+ ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr));
+ if (ret != 0) {
+ abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
+ goto error;
+ }
+
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = cabd;
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (0);
+
+error:
+ arc_hdr_free_abd(hdr, B_FALSE);
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ if (cabd != NULL)
+ arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
+
+ return (ret);
+}
+
+/*
+ * This function is called during arc_buf_fill() to prepare the header's
+ * abd plaintext pointer for use. This involves authenticating protected
+ * data and decrypting encrypted data into the plaintext abd.
+ */
+static int
+arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
+ uint64_t dsobj, boolean_t noauth)
+{
+ int ret;
+
+ ASSERT(HDR_PROTECTED(hdr));
+
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+
+ if (HDR_NOAUTH(hdr) && !noauth) {
+ /*
+ * The caller requested authenticated data but our data has
+ * not been authenticated yet. Verify the MAC now if we can.
+ */
+ ret = arc_hdr_authenticate(hdr, spa, dsobj);
+ if (ret != 0)
+ goto error;
+ } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
+ /*
+ * If we only have the encrypted version of the data, but the
+ * unencrypted version was requested, we take this opportunity
+ * to store the decrypted version in the header for future use.
+ */
+ ret = arc_hdr_decrypt(hdr, spa, dsobj);
+ if (ret != 0)
+ goto error;
+ }
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ return (0);
+
+error:
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ return (ret);
+}
+
+/*
+ * This function is used by the dbuf code to decrypt bonus buffers in place.
+ * The dbuf code itself doesn't have any locking for decrypting a shared dnode
+ * block, so we use the hash lock here to protect against concurrent calls to
+ * arc_buf_fill().
+ */
+static void
+arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(HDR_ENCRYPTED(hdr));
+ ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
+ ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr)));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+}
+
+/*
* Given a buf that has a data buffer attached to it, this function will
* efficiently fill the buf with data of the specified compression setting from
* the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
@@ -1746,15 +2075,79 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
* the correct-sized data buffer.
*/
static int
-arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
+arc_buf_fill(arc_buf_t *buf, spa_t *spa, uint64_t dsobj, arc_fill_flags_t flags)
{
+ int error = 0;
arc_buf_hdr_t *hdr = buf->b_hdr;
- boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ boolean_t hdr_compressed =
+ (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
+ boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+ kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
ASSERT3P(buf->b_data, !=, NULL);
- IMPLY(compressed, hdr_compressed);
+ IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+ IMPLY(encrypted, HDR_ENCRYPTED(hdr));
+ IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
+ IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
+ IMPLY(encrypted, !ARC_BUF_SHARED(buf));
+
+ /*
+ * If the caller wanted encrypted data we just need to copy it from
+ * b_rabd and potentially byteswap it. We won't be able to do any
+ * further transforms on it.
+ */
+ if (encrypted) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
+ HDR_GET_PSIZE(hdr));
+ goto byteswap;
+ }
+
+ /*
+ * Adjust encrypted and authenticated headers to accommodate the
+ * request if needed.
+ */
+ if (HDR_PROTECTED(hdr)) {
+ error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
+ dsobj, !!(flags & ARC_FILL_NOAUTH));
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * There is a special case here for dnode blocks which are
+ * decrypting their bonus buffers. These blocks may request to
+ * be decrypted in-place. This is necessary because there may
+ * be many dnodes pointing into this buffer and there is
+ * currently no method to synchronize replacing the backing
+ * b_data buffer and updating all of the pointers. Here we use
+ * the hash lock to ensure there are no races. If the need
+ * arises for other types to be decrypted in-place, they must
+ * add handling here as well.
+ */
+ if ((flags & ARC_FILL_IN_PLACE) != 0) {
+ ASSERT(!hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT(!encrypted);
+
+ if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
+
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_buf_untransform_in_place(buf, hash_lock);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ /* Compute the hdr's checksum if necessary */
+ arc_cksum_compute(buf);
+ }
+
+ return (0);
+ }
if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) {
@@ -1809,7 +2202,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
return (0);
} else {
- int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, buf->b_data,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
@@ -1820,13 +2213,14 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
if (error != 0) {
zfs_dbgmsg(
"hdr %p, compress %d, psize %d, lsize %d",
- hdr, HDR_GET_COMPRESS(hdr),
+ hdr, arc_hdr_get_compress(hdr),
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
return (SET_ERROR(EIO));
}
}
}
+byteswap:
/* Byteswap the buf's data if necessary */
if (bswap != DMU_BSWAP_NUMFUNCS) {
ASSERT(!HDR_SHARED_DATA(hdr));
@@ -1840,28 +2234,21 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
return (0);
}
-int
-arc_decompress(arc_buf_t *buf)
-{
- return (arc_buf_fill(buf, B_FALSE));
-}
-
/*
- * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
+ * If this function is being called to decrypt an encrypted buffer or verify an
+ * authenticated one, the key must be loaded and a mapping must be made
+ * available in the keystore via spa_keystore_create_mapping() or one of its
+ * callers.
*/
-static uint64_t
-arc_hdr_size(arc_buf_hdr_t *hdr)
+int
+arc_untransform(arc_buf_t *buf, spa_t *spa, uint64_t dsobj, boolean_t in_place)
{
- uint64_t size;
+ arc_fill_flags_t flags = 0;
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
- HDR_GET_PSIZE(hdr) > 0) {
- size = HDR_GET_PSIZE(hdr);
- } else {
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
- size = HDR_GET_LSIZE(hdr);
- }
- return (size);
+ if (in_place)
+ flags |= ARC_FILL_IN_PLACE;
+
+ return (arc_buf_fill(buf, spa, dsobj, flags));
}
/*
@@ -1881,6 +2268,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
(void) refcount_add_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
@@ -1891,6 +2279,11 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
(void) refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
+ if (HDR_HAS_RABD(hdr)) {
+ (void) refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+
for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
if (arc_buf_is_shared(buf))
continue;
@@ -1916,6 +2309,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
(void) refcount_remove_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
@@ -1926,6 +2320,11 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
(void) refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
+ if (HDR_HAS_RABD(hdr)) {
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+
for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
if (arc_buf_is_shared(buf))
continue;
@@ -2069,7 +2468,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
old_state = hdr->b_l1hdr.b_state;
refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
- update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
} else {
old_state = arc_l2c_only;
refcnt = 0;
@@ -2139,6 +2539,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
(void) refcount_add_many(&new_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
} else {
arc_buf_t *buf;
uint32_t buffers = 0;
@@ -2171,8 +2572,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&new_state->arcs_size,
arc_hdr_size(hdr), hdr);
- } else {
- ASSERT(GHOST_STATE(old_state));
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ (void) refcount_add_many(&new_state->arcs_size,
+ HDR_GET_PSIZE(hdr), hdr);
}
}
}
@@ -2182,6 +2586,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
/*
* When moving a header off of a ghost state,
@@ -2222,9 +2627,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf);
}
ASSERT3U(bufcnt, ==, buffers);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- (void) refcount_remove_many(
- &old_state->arcs_size, arc_hdr_size(hdr), hdr);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) refcount_remove_many(
+ &old_state->arcs_size, arc_hdr_size(hdr),
+ hdr);
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ (void) refcount_remove_many(
+ &old_state->arcs_size, HDR_GET_PSIZE(hdr),
+ hdr);
+ }
}
}
@@ -2327,12 +2743,13 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
/*
* The criteria for sharing a hdr's data are:
- * 1. the hdr's compression matches the buf's compression
- * 2. the hdr doesn't need to be byteswapped
- * 3. the hdr isn't already being shared
- * 4. the buf is either compressed or it is the last buf in the hdr list
+ * 1. the buffer is not encrypted
+ * 2. the hdr's compression matches the buf's compression
+ * 3. the hdr doesn't need to be byteswapped
+ * 4. the hdr isn't already being shared
+ * 5. the buf is either compressed or it is the last buf in the hdr list
*
- * Criterion #4 maintains the invariant that shared uncompressed
+ * Criterion #5 maintains the invariant that shared uncompressed
* bufs must be the final buf in the hdr's b_buf list. Reading this, you
* might ask, "if a compressed buf is allocated first, won't that be the
* last thing in the list?", but in that case it's impossible to create
@@ -2347,9 +2764,11 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* sharing if the new buf isn't the first to be added.
*/
ASSERT3P(buf->b_hdr, ==, hdr);
- boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t hdr_compressed =
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
- return (buf_compressed == hdr_compressed &&
+ return (!ARC_BUF_ENCRYPTED(buf) &&
+ buf_compressed == hdr_compressed &&
hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
!HDR_SHARED_DATA(hdr) &&
(ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
@@ -2361,10 +2780,12 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* copy was made successfully, or an error code otherwise.
*/
static int
-arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj, void *tag,
+ boolean_t encrypted, boolean_t compressed, boolean_t noauth,
boolean_t fill, arc_buf_t **ret)
{
arc_buf_t *buf;
+ arc_fill_flags_t flags = ARC_FILL_LOCKED;
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
@@ -2372,6 +2793,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
hdr->b_type == ARC_BUFC_METADATA);
ASSERT3P(ret, !=, NULL);
ASSERT3P(*ret, ==, NULL);
+ IMPLY(encrypted, compressed);
hdr->b_l1hdr.b_mru_hits = 0;
hdr->b_l1hdr.b_mru_ghost_hits = 0;
@@ -2395,10 +2817,23 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
/*
* Only honor requests for compressed bufs if the hdr is actually
- * compressed.
- * compressed. This must be overridden if the buffer is encrypted since
+ * encrypted buffers cannot be decompressed.
*/
- if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ if (encrypted) {
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+ buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
+ flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
+ } else if (compressed &&
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+ flags |= ARC_FILL_COMPRESSED;
+ }
+
+ if (noauth) {
+ ASSERT0(encrypted);
+ flags |= ARC_FILL_NOAUTH;
+ }
/*
* If the hdr's data can be shared then we share the data buffer and
@@ -2414,7 +2849,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
* need to be ABD-aware.
*/
boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
- abd_is_linear(hdr->b_l1hdr.b_pabd);
+ hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */
if (can_share) {
@@ -2430,13 +2865,15 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
+ if (encrypted)
+ hdr->b_crypt_hdr.b_ebufcnt += 1;
/*
* If the user wants the data from the hdr, we need to either copy or
* decompress the data.
*/
if (fill) {
- return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
+ return (arc_buf_fill(buf, spa, dsobj, flags));
}
return (0);
@@ -2482,6 +2919,19 @@ arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
return (buf);
}
+arc_buf_t *
+arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
+ byteorder, salt, iv, mac, ot, psize, lsize, compression_type);
+
+ atomic_add_64(&arc_loaned_bytes, psize);
+ return (buf);
+}
+
/*
* Return a loaned arc buffer to the arc.
@@ -2527,11 +2977,11 @@ l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
}
static void
-arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t size = arc_hdr_size(hdr);
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
/* protected by hash lock, if in the hash table */
if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
@@ -2549,7 +2999,11 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
arc_space_return(size, ARC_SPACE_DATA);
}
- l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
+ if (free_rdata) {
+ l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
+ } else {
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
+ }
}
/*
@@ -2562,6 +3016,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@@ -2689,6 +3144,18 @@ arc_buf_destroy_impl(arc_buf_t *buf)
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
hdr->b_l1hdr.b_bufcnt -= 1;
+
+ if (ARC_BUF_ENCRYPTED(buf))
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
+ /*
+ * if we have no more encrypted buffers and we've already
+ * gotten a copy of the decrypted data we can free b_rabd to
+ * save some space.
+ */
+ if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) &&
+ hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_TRUE);
}
arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
@@ -2703,16 +3170,17 @@ arc_buf_destroy_impl(arc_buf_t *buf)
* There is an equivalent case for compressed bufs, but since
* they aren't guaranteed to be the last buf in the list and
* that is an exceedingly rare case, we just allow that space be
- * wasted temporarily.
+ * wasted temporarily. We must also be careful not to share
+ * encrypted buffers, since raw encrypted data cannot be shared.
*/
- if (lastbuf != NULL) {
+ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
/* Only one buf can be shared at once */
VERIFY(!arc_buf_is_shared(lastbuf));
/* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- arc_hdr_free_pabd(hdr);
+ arc_hdr_free_abd(hdr, B_FALSE);
/*
* We must setup a new shared block between the
@@ -2733,7 +3201,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
*/
ASSERT3P(lastbuf, !=, NULL);
ASSERT(arc_buf_is_shared(lastbuf) ||
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
}
/*
@@ -2750,26 +3218,43 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
static void
-arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata)
{
+ uint64_t size;
+
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
+ IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ if (alloc_rdata) {
+ size = HDR_GET_PSIZE(hdr);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
+ hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
+ ARCSTAT_INCR(arcstat_raw_size, size);
+ } else {
+ size = arc_hdr_size(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ }
+
+ ARCSTAT_INCR(arcstat_compressed_size, size);
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
static void
-arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
+arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
{
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ IMPLY(free_rdata, HDR_HAS_RABD(hdr));
/*
* If the hdr is currently being written to the l2arc then
@@ -2778,28 +3263,42 @@ arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
* writing it to the l2arc device.
*/
if (HDR_L2_WRITING(hdr)) {
- arc_hdr_free_on_write(hdr);
+ arc_hdr_free_on_write(hdr, free_rdata);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else if (free_rdata) {
+ arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
} else {
- arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
- arc_hdr_size(hdr), hdr);
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
}
- hdr->b_l1hdr.b_pabd = NULL;
- hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ if (free_rdata) {
+ hdr->b_crypt_hdr.b_rabd = NULL;
+ ARCSTAT_INCR(arcstat_raw_size, -size);
+ } else {
+ hdr->b_l1hdr.b_pabd = NULL;
+ }
+
+ if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+ ARCSTAT_INCR(arcstat_compressed_size, -size);
ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
}
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
- enum zio_compress compression_type, arc_buf_contents_t type)
+ boolean_t protected, enum zio_compress compression_type,
+ arc_buf_contents_t type, boolean_t alloc_rdata)
{
arc_buf_hdr_t *hdr;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+ if (protected) {
+ hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
+ } else {
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+ }
- hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(HDR_EMPTY(hdr));
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
HDR_SET_PSIZE(hdr, psize);
@@ -2809,6 +3308,8 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_flags = 0;
arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
arc_hdr_set_compress(hdr, compression_type);
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
@@ -2820,7 +3321,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
- arc_hdr_alloc_pabd(hdr);
+ arc_hdr_alloc_abd(hdr, alloc_rdata);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
@@ -2843,6 +3344,16 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
(old == hdr_l2only_cache && new == hdr_full_cache));
+ /*
+ * if the caller wanted a new full header and the header is to be
+ * encrypted we will actually allocate the header from the full crypt
+ * cache instead. The same applies to freeing from the old cache.
+ */
+ if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
+ new = hdr_full_crypt_cache;
+ if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
+ old = hdr_full_crypt_cache;
+
nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
@@ -2850,7 +3361,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
- if (new == hdr_full_cache) {
+ if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
/*
* arc_access and arc_change_state need to be aware that a
@@ -2861,6 +3372,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
/* Verify previous threads set to NULL before freeing */
ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
@@ -2883,6 +3395,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
*/
VERIFY(!HDR_L2_WRITING(hdr));
VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
}
@@ -2925,6 +3438,111 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
}
/*
+ * This function allows an L1 header to be reallocated as a crypt
+ * header and vice versa. If we are going to a crypt header, the
+ * new fields will be zeroed out.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
+{
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t *buf;
+ kmem_cache_t *ncache, *ocache;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ if (need_crypt) {
+ ncache = hdr_full_crypt_cache;
+ ocache = hdr_full_cache;
+ } else {
+ ncache = hdr_full_cache;
+ ocache = hdr_full_crypt_cache;
+ }
+
+ nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+ nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
+ nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
+ nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
+ nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
+ nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
+ nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
+ nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
+ nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
+ nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
+ nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits;
+ nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
+ nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
+ nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
+
+ /*
+ * This refcount_add() exists only to ensure that the individual
+ * arc buffers always point to a header that is referenced, avoiding
+ * a small race condition that could trigger ASSERTs.
+ */
+ (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
+
+ for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
+ mutex_enter(&buf->b_evict_lock);
+ buf->b_hdr = nhdr;
+ mutex_exit(&buf->b_evict_lock);
+ }
+
+ refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
+ (void) refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
+
+ if (need_crypt) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
+ } else {
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
+ }
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(ocache, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * This function is used by the send / receive code to convert a newly
+ * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
+ * is also used to allow the root objset block to be updated without altering
+ * its embedded MACs. Both block types will always be uncompressed so we do not
+ * have to worry about compression type or psize.
+ */
+void
+arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
+ dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
+ const uint8_t *mac)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+
+ buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
+ if (!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
+ hdr->b_crypt_hdr.b_dsobj = dsobj;
+ hdr->b_crypt_hdr.b_ot = ot;
+ hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
+ DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
+ if (!arc_hdr_has_uncompressed_buf(hdr))
+ arc_cksum_free(hdr);
+
+ if (salt != NULL)
+ bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ if (iv != NULL)
+ bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ if (mac != NULL)
+ bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+}
+
+/*
* Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
* The buf is returned thawed since we expect the consumer to modify it.
*/
@@ -2932,11 +3550,12 @@ arc_buf_t *
arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
- ZIO_COMPRESS_OFF, type);
+ B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE);
ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
- VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, 0, tag, B_FALSE, B_FALSE,
+ B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
return (buf);
@@ -2952,33 +3571,76 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
{
ASSERT3U(lsize, >, 0);
ASSERT3U(lsize, >=, psize);
- ASSERT(compression_type > ZIO_COMPRESS_OFF);
- ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
+ ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
- compression_type, ARC_BUFC_DATA);
+ B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE);
ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
- VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, 0, tag, B_FALSE,
+ B_TRUE, B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
if (!arc_buf_is_shared(buf)) {
/*
* To ensure that the hdr has the correct data in it if we call
- * arc_decompress() on this buf before it's been written to
+ * arc_untransform() on this buf before it's been written to
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
- arc_hdr_free_pabd(hdr);
+ arc_hdr_free_abd(hdr, B_FALSE);
arc_share_buf(hdr, buf);
}
return (buf);
}
+arc_buf_t *
+arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
+ ARC_BUFC_METADATA : ARC_BUFC_DATA;
+
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
+
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
+ compression_type, type, B_TRUE);
+ ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+
+ hdr->b_crypt_hdr.b_dsobj = dsobj;
+ hdr->b_crypt_hdr.b_ot = ot;
+ hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
+ DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
+ bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+
+ /*
+ * This buffer will be considered encrypted even if the ot is not an
+ * encrypted type. It will become authenticated instead in
+ * arc_write_ready().
+ */
+ buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, dsobj, tag, B_TRUE, B_TRUE,
+ B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ return (buf);
+}
+
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
@@ -3044,15 +3706,25 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
while (hdr->b_l1hdr.b_buf != NULL)
arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
- if (hdr->b_l1hdr.b_pabd != NULL)
- arc_hdr_free_pabd(hdr);
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ arc_hdr_free_abd(hdr, B_FALSE);
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
}
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (HDR_HAS_L1HDR(hdr)) {
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- kmem_cache_free(hdr_full_cache, hdr);
+
+ if (!HDR_PROTECTED(hdr)) {
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_full_crypt_cache, hdr);
+ }
} else {
kmem_cache_free(hdr_l2only_cache, hdr);
}
@@ -3129,6 +3801,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
if (HDR_HAS_L2HDR(hdr)) {
ASSERT(hdr->b_l1hdr.b_pabd == NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
@@ -3195,7 +3868,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* This ensures that the accounting is updated correctly
* in arc_free_data_impl().
*/
- arc_hdr_free_pabd(hdr);
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
@@ -4876,22 +5553,22 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
}
}
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_read_done_func_t which you can use */
/* ARGSUSED */
void
-arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
{
- if (zio == NULL || zio->io_error == 0)
+ if (error == 0)
bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
-/* a generic arc_done_func_t */
+/* a generic arc_read_done_func_t */
void
-arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
- if (zio && zio->io_error) {
+ if (error != 0) {
arc_buf_destroy(buf, arg);
*bufp = NULL;
} else {
@@ -4905,20 +5582,22 @@ arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
{
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
} else {
if (HDR_COMPRESSION_ENABLED(hdr)) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
+ ASSERT3U(arc_hdr_get_compress(hdr), ==,
BP_GET_COMPRESS(bp));
}
ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
+ ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
}
}
static void
arc_read_done(zio_t *zio)
{
+ blkptr_t *bp = zio->io_bp;
arc_buf_hdr_t *hdr = zio->io_private;
kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list;
@@ -4951,6 +5630,26 @@ arc_read_done(zio_t *zio)
ASSERT3P(hash_lock, !=, NULL);
}
+ if (BP_IS_PROTECTED(bp)) {
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+
+ if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
+ void *tmpbuf;
+
+ tmpbuf = abd_borrow_buf_copy(zio->io_abd,
+ sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmpbuf,
+ hdr->b_crypt_hdr.b_mac);
+ abd_return_buf(zio->io_abd, tmpbuf,
+ sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+ }
+
if (no_zio_error) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
@@ -4996,8 +5695,33 @@ arc_read_done(zio_t *zio)
/* This is a demand read since prefetches don't use callbacks */
callback_cnt++;
- int error = arc_buf_alloc_impl(hdr, acb->acb_private,
- acb->acb_compressed, no_zio_error, &acb->acb_buf);
+ int error = arc_buf_alloc_impl(hdr, zio->io_spa,
+ zio->io_bookmark.zb_objset, acb->acb_private,
+ acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth,
+ no_zio_error, &acb->acb_buf);
+
+ /*
+ * assert non-speculative zios didn't fail because an
+ * encryption key wasn't loaded
+ */
+ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
+ error == 0 || error != ENOENT);
+
+ /*
+ * If we failed to decrypt, report an error now (as the zio
+ * layer would have done if it had done the transforms).
+ */
+ if (error == ECKSUM) {
+ ASSERT(BP_IS_PROTECTED(bp));
+ error = SET_ERROR(EIO);
+ spa_log_error(zio->io_spa, &zio->io_bookmark);
+ if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ zio->io_spa, NULL, &zio->io_bookmark, zio,
+ 0, 0);
+ }
+ }
+
if (no_zio_error) {
zio->io_error = error;
}
@@ -5005,9 +5729,8 @@ arc_read_done(zio_t *zio)
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
if (callback_cnt == 0) {
- ASSERT(HDR_PREFETCH(hdr));
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT(HDR_PREFETCH(hdr) || HDR_HAS_RABD(hdr));
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
}
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
@@ -5046,8 +5769,10 @@ arc_read_done(zio_t *zio)
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
- if (acb->acb_done)
- acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+ if (acb->acb_done) {
+ acb->acb_done(zio, zio->io_error, acb->acb_buf,
+ acb->acb_private);
+ }
if (acb->acb_zio_dummy != NULL) {
acb->acb_zio_dummy->io_error = zio->io_error;
@@ -5081,15 +5806,19 @@ arc_read_done(zio_t *zio)
* for readers of this block.
*/
int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
- void *private, zio_priority_t priority, int zio_flags,
- arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_read_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = NULL;
kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
- boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
+ boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
int rc = 0;
ASSERT(!BP_IS_EMBEDDED(bp) ||
@@ -5104,7 +5833,15 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock);
}
- if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
+ /*
+ * Determine if we have an L1 cache hit or a cache miss. For simplicity
+ * we maintain encrypted data separately from compressed / uncompressed
+ * data. If the user is requesting raw encrypted data and we don't have
+ * that in the header we will read from disk to guarantee that we can
+ * get it even if the encryption keys aren't loaded.
+ */
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
+ (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
arc_buf_t *buf = NULL;
*arc_flags |= ARC_FLAG_CACHED;
@@ -5191,8 +5928,12 @@ top:
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
- VERIFY0(arc_buf_alloc_impl(hdr, private,
- compressed_read, B_TRUE, &buf));
+ rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
+ private, encrypted_read, compressed_read,
+ noauth_read, B_TRUE, &buf);
+
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc == 0 || rc != ENOENT);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
@@ -5208,7 +5949,7 @@ top:
data, metadata, hits);
if (done)
- done(NULL, buf, private);
+ done(NULL, rc, buf, private);
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
@@ -5217,6 +5958,7 @@ top:
uint64_t addr = 0;
boolean_t devw = B_FALSE;
uint64_t size;
+ void *hdr_abd;
/*
* Gracefully handle a damaged logical block size as a
@@ -5232,7 +5974,8 @@ top:
arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
- BP_GET_COMPRESS(bp), type);
+ BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type,
+ encrypted_read);
if (!BP_IS_EMBEDDED(bp)) {
hdr->b_dva = *BP_IDENTITY(bp);
@@ -5248,26 +5991,42 @@ top:
}
} else {
/*
- * This block is in the ghost cache. If it was L2-only
- * (and thus didn't have an L1 hdr), we realloc the
- * header to add an L1 hdr.
+ * This block is in the ghost cache or encrypted data
+ * was requested and we didn't have it. If it was
+ * L2-only (and thus didn't have an L1 hdr),
+ * we realloc the header to add an L1 hdr.
*/
if (!HDR_HAS_L1HDR(hdr)) {
hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
hdr_full_cache);
}
- ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ /*
+ * If this header already had an IO in progress
+ * and we are performing another IO to fetch
+ * encrypted data we must wait until the first
+ * IO completes so as not to confuse
+ * arc_read_done(). This should be very rare
+ * and so the performance impact shouldn't
+ * matter.
+ */
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
/*
* This is a delicate dance that we play here.
- * This hdr is in the ghost list so we access it
- * to move it out of the ghost list before we
+ * This hdr might be in the ghost list so we access
+ * it to move it out of the ghost list before we
* initiate the read. If it's a prefetch then
* it won't have a callback so we'll remove the
* reference that arc_buf_alloc_impl() created. We
@@ -5275,25 +6034,41 @@ top:
* avoid hitting an assert in remove_reference().
*/
arc_access(hdr, hash_lock);
- arc_hdr_alloc_pabd(hdr);
+ arc_hdr_alloc_abd(hdr, encrypted_read);
}
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- size = arc_hdr_size(hdr);
- /*
- * If compression is enabled on the hdr, then will do
- * RAW I/O and will store the compressed data in the hdr's
- * data block. Otherwise, the hdr's data block will contain
- * the uncompressed data.
- */
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ if (encrypted_read) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ size = HDR_GET_PSIZE(hdr);
+ hdr_abd = hdr->b_crypt_hdr.b_rabd;
zio_flags |= ZIO_FLAG_RAW;
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ size = arc_hdr_size(hdr);
+ hdr_abd = hdr->b_l1hdr.b_pabd;
+
+ if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ }
+
+ /*
+ * For authenticated bp's, we do not ask the ZIO layer
+ * to authenticate them since this will cause the entire
+ * IO to fail if the key isn't loaded. Instead, we
+ * defer authentication until arc_buf_fill(), which will
+ * verify the data when the key is available.
+ */
+ if (BP_IS_AUTHENTICATED(bp))
+ zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
}
- if (*arc_flags & ARC_FLAG_PREFETCH)
+ if (*arc_flags & ARC_FLAG_PREFETCH &&
+ refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (BP_IS_AUTHENTICATED(bp))
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
if (BP_GET_LEVEL(bp) > 0)
arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
@@ -5304,6 +6079,8 @@ top:
acb->acb_done = done;
acb->acb_private = private;
acb->acb_compressed = compressed_read;
+ acb->acb_encrypted = encrypted_read;
+ acb->acb_noauth = noauth_read;
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
hdr->b_l1hdr.b_acb = acb;
@@ -5376,7 +6153,7 @@ top:
HDR_ISTYPE_METADATA(hdr));
cb->l2rcb_abd = abd;
} else {
- abd = hdr->b_l1hdr.b_pabd;
+ abd = hdr_abd;
}
ASSERT(addr >= VDEV_LABEL_START_SIZE &&
@@ -5389,7 +6166,7 @@ top:
* Issue a null zio if the underlying buffer
* was squashed to zero size by compression.
*/
- ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
+ ASSERT3U(arc_hdr_get_compress(hdr), !=,
ZIO_COMPRESS_EMPTY);
rzio = zio_read_phys(pio, vd, addr,
asize, abd,
@@ -5402,7 +6179,8 @@ top:
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
- ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+ ARCSTAT_INCR(arcstat_l2_read_bytes,
+ HDR_GET_PSIZE(hdr));
if (*arc_flags & ARC_FLAG_NOWAIT) {
zio_nowait(rzio);
@@ -5432,7 +6210,7 @@ top:
}
}
- rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
+ rzio = zio_read(pio, spa, bp, hdr_abd, size,
arc_read_done, hdr, priority, zio_flags, zb);
if (*arc_flags & ARC_FLAG_WAIT) {
@@ -5626,7 +6404,8 @@ arc_release(arc_buf_t *buf, void *tag)
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
uint64_t lsize = HDR_GET_LSIZE(hdr);
- enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+ boolean_t protected = HDR_PROTECTED(hdr);
+ enum zio_compress compress = arc_hdr_get_compress(hdr);
arc_buf_contents_t type = arc_buf_type(hdr);
VERIFY3U(hdr->b_type, ==, type);
@@ -5669,7 +6448,7 @@ arc_release(arc_buf_t *buf, void *tag)
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
- arc_hdr_alloc_pabd(hdr);
+ arc_hdr_alloc_abd(hdr, B_FALSE);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
buf->b_data, psize);
}
@@ -5684,10 +6463,11 @@ arc_release(arc_buf_t *buf, void *tag)
* if we have a compressed, shared buffer.
*/
ASSERT(arc_buf_is_shared(lastbuf) ||
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf));
}
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
@@ -5700,6 +6480,9 @@ arc_release(arc_buf_t *buf, void *tag)
}
hdr->b_l1hdr.b_bufcnt -= 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@@ -5713,7 +6496,8 @@ arc_release(arc_buf_t *buf, void *tag)
* Allocate a new hdr. The new hdr will contain a b_pabd
* buffer which will be freed in arc_write().
*/
- nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
+ nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
+ compress, type, HDR_HAS_RABD(hdr));
ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(nhdr->b_l1hdr.b_bufcnt);
ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
@@ -5722,6 +6506,8 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_l1hdr.b_buf = buf;
nhdr->b_l1hdr.b_bufcnt = 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ nhdr->b_crypt_hdr.b_ebufcnt = 1;
nhdr->b_l1hdr.b_mru_hits = 0;
nhdr->b_l1hdr.b_mru_ghost_hits = 0;
nhdr->b_l1hdr.b_mfu_hits = 0;
@@ -5746,8 +6532,8 @@ arc_release(arc_buf_t *buf, void *tag)
hdr->b_l1hdr.b_l2_hits = 0;
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_l1hdr.b_arc_access = 0;
- mutex_exit(hash_lock);
+ mutex_exit(hash_lock);
buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
@@ -5784,7 +6570,8 @@ arc_write_ready(zio_t *zio)
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
enum zio_compress compress;
fstrans_cookie_t cookie = spl_fstrans_mark();
@@ -5804,11 +6591,15 @@ arc_write_ready(zio_t *zio)
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
- arc_hdr_free_pabd(hdr);
+ arc_hdr_free_abd(hdr, B_FALSE);
}
}
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
}
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT(!arc_buf_is_shared(buf));
@@ -5817,21 +6608,51 @@ arc_write_ready(zio_t *zio)
if (HDR_IO_IN_PROGRESS(hdr))
ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
- arc_cksum_compute(buf);
arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
+
+ if (BP_IS_PROTECTED(bp)) {
+ /* ZIL blocks are written through zio_rewrite */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(HDR_PROTECTED(hdr));
+
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+
+ /*
+ * If this block was written for raw encryption but the zio layer
+ * ended up only authenticating it, adjust the buffer flags now.
+ */
+ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ }
+
+ /* this must be done after the buffer flags are adjusted */
+ arc_cksum_compute(buf);
+
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
compress = ZIO_COMPRESS_OFF;
} else {
- ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
- compress = BP_GET_COMPRESS(zio->io_bp);
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ compress = BP_GET_COMPRESS(bp);
}
HDR_SET_PSIZE(hdr, psize);
arc_hdr_set_compress(hdr, compress);
/*
- * Fill the hdr with data. If the hdr is compressed, the data we want
- * is available from the zio, otherwise we can take it from the buf.
+ * Fill the hdr with data. If the buffer is encrypted we have no choice
+ * but to copy the data into b_rabd. If the hdr is compressed, the data
+ * we want is available from the zio, otherwise we can take it from
+ * the buf.
*
* We might be able to share the buf's data with the hdr here. However,
* doing so would cause the ARC to be full of linear ABDs if we write a
@@ -5841,23 +6662,28 @@ arc_write_ready(zio_t *zio)
* written. Therefore, if they're allowed then we allocate one and copy
* the data into it; otherwise, we share the data directly if we can.
*/
- if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
- arc_hdr_alloc_pabd(hdr);
-
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+ arc_hdr_alloc_abd(hdr, B_TRUE);
+ abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
+ } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
/*
* Ideally, we would always copy the io_abd into b_pabd, but the
* user may have disabled compressed ARC, thus we must check the
* hdr's compression setting rather than the io_bp's.
*/
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
- ZIO_COMPRESS_OFF);
+ if (BP_IS_ENCRYPTED(bp)) {
ASSERT3U(psize, >, 0);
-
+ arc_hdr_alloc_abd(hdr, B_TRUE);
+ abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
+ } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
+ !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(psize, >, 0);
+ arc_hdr_alloc_abd(hdr, B_FALSE);
abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else {
ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
-
+ arc_hdr_alloc_abd(hdr, B_FALSE);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
@@ -5869,7 +6695,7 @@ arc_write_ready(zio_t *zio)
arc_share_buf(hdr, buf);
}
- arc_hdr_verify(hdr, zio->io_bp);
+ arc_hdr_verify(hdr, bp);
spl_fstrans_unmark(cookie);
}
@@ -5981,9 +6807,9 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
- const zio_prop_t *zp, arc_done_func_t *ready,
- arc_done_func_t *children_ready, arc_done_func_t *physdone,
- arc_done_func_t *done, void *private, zio_priority_t priority,
+ const zio_prop_t *zp, arc_write_done_func_t *ready,
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -5999,16 +6825,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
if (l2arc)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
- if (ARC_BUF_COMPRESSED(buf)) {
- /*
- * We're writing a pre-compressed buffer. Make the
- * compression algorithm requested by the zio_prop_t match
- * the pre-compressed buffer's compression algorithm.
- */
- localprop.zp_compress = HDR_GET_COMPRESS(hdr);
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+ localprop.zp_encrypt = B_TRUE;
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+ localprop.zp_byteorder =
+ (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
+ ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
+ bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
+ localprop.zp_nopwrite = B_FALSE;
+ localprop.zp_copies =
+ MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
+ }
zio_flags |= ZIO_FLAG_RAW;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
}
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
@@ -6032,11 +6872,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
- arc_hdr_free_pabd(hdr);
+ arc_hdr_free_abd(hdr, B_FALSE);
}
VERIFY3P(buf->b_data, !=, NULL);
- arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
}
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+
+ arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+
ASSERT(!arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
@@ -7037,6 +7882,102 @@ top:
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
+static int
+l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
+{
+ int ret;
+ spa_t *spa = zio->io_spa;
+ arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
+ blkptr_t *bp = zio->io_bp;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ /*
+ * ZIL data is never written to the L2ARC, so we don't need
+ * special handling for its unique MAC storage.
+ */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+
+ /* If the data was encrypted, decrypt it now */
+ if (HDR_ENCRYPTED(hdr)) {
+ abd_t *eabd = arc_get_data_abd(hdr,
+ arc_hdr_size(hdr), hdr);
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ ret = spa_keystore_lookup_key(spa,
+ cb->l2rcb_zb.zb_objset, FTAG, &dck);
+ if (ret != 0) {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ goto error;
+ }
+
+ ret = zio_do_crypt_abd(B_FALSE, &dck->dck_key,
+ salt, BP_GET_TYPE(bp), iv, mac, HDR_GET_PSIZE(hdr),
+ BP_SHOULD_BYTESWAP(bp), eabd, hdr->b_l1hdr.b_pabd,
+ &no_crypt);
+ if (ret != 0) {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ goto error;
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /*
+ * If we actually performed decryption, replace b_pabd
+ * with the decrypted data. Otherwise we can just throw
+ * our decryption buffer away.
+ */
+ if (!no_crypt) {
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = eabd;
+ zio->io_abd = eabd;
+ } else {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ }
+ }
+
+ /*
+ * If the L2ARC block was compressed but ARC compression
+ * is disabled, we decompress the data into a new buffer and
+ * replace the existing data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
+ void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
+
+ ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr));
+ if (ret != 0) {
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
+ goto error;
+ }
+
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = cabd;
+ zio->io_abd = cabd;
+ zio->io_size = HDR_GET_LSIZE(hdr);
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+
/*
* A read to a cache device completed. Validate buffer contents before
* handing over to the regular ARC routines.
@@ -7044,10 +7985,11 @@ top:
static void
l2arc_read_done(zio_t *zio)
{
+ int tfm_error = 0;
l2arc_read_callback_t *cb;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
- boolean_t valid_cksum;
+ boolean_t valid_cksum, using_rdata;
ASSERT3P(zio->io_vd, !=, NULL);
ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
@@ -7095,12 +8037,25 @@ l2arc_read_done(zio_t *zio)
/*
* Check this survived the L2ARC journey.
*/
- ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
+ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
+ (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
valid_cksum = arc_cksum_is_equal(hdr, zio);
- if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ using_rdata = (HDR_HAS_RABD(hdr) &&
+ zio->io_abd == hdr->b_crypt_hdr.b_rabd);
+
+ /*
+ * b_rabd will always match the data as it exists on disk if it is
+ * being used. Therefore if we are reading into b_rabd we do not
+ * attempt to untransform the data.
+ */
+ if (valid_cksum && !using_rdata)
+ tfm_error = l2arc_untransform(zio, cb);
+
+ if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
+ !HDR_L2_EVICTED(hdr)) {
mutex_exit(hash_lock);
zio->io_private = hdr;
arc_read_done(zio);
@@ -7115,7 +8070,7 @@ l2arc_read_done(zio_t *zio)
} else {
zio->io_error = SET_ERROR(EIO);
}
- if (!valid_cksum)
+ if (!valid_cksum || tfm_error != 0)
ARCSTAT_BUMP(arcstat_l2_cksum_bad);
/*
@@ -7125,11 +8080,13 @@ l2arc_read_done(zio_t *zio)
*/
if (zio->io_waiter == NULL) {
zio_t *pio = zio_unique_parent(zio);
+ void *abd = (using_rdata) ?
+ hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
- hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
+ abd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
&cb->l2rcb_zb));
}
@@ -7291,6 +8248,117 @@ top:
}
/*
+ * Handle any abd transforms that might be required for writing to the L2ARC.
+ * If successful, this function will always return a new abd of asize
+ * bytes with the data transformed as it is on disk.
+ */
+static int
+l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
+ abd_t **abd_out)
+{
+ int ret;
+ void *tmp = NULL;
+ abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
+ enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t size = arc_hdr_size(hdr);
+ boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
+ boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
+ boolean_t no_crypt;
+
+ ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) ||
+ HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
+ ASSERT3U(psize, <=, asize);
+
+ /*
+ * If this data simply needs its own buffer, we allocate it
+ * and copy the data. This may be done to eliminate a dependency on a
+ * shared buffer or to reallocate the buffer to match asize.
+ */
+ if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
+ !HDR_ENCRYPTED(hdr)) {
+ ASSERT3U(size, ==, psize);
+ to_write = abd_alloc_for_io(asize, ismd);
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ if (size != asize)
+ abd_zero_off(to_write, size, asize - size);
+ goto out;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
+ cabd = abd_alloc_for_io(asize, ismd);
+ tmp = abd_borrow_buf(cabd, asize);
+
+ psize = zio_compress_data(compress, to_write, tmp, size);
+ ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
+ if (psize < asize)
+ bzero((char *)tmp + psize, asize - psize);
+ psize = HDR_GET_PSIZE(hdr);
+ abd_return_buf_copy(cabd, tmp, asize);
+ to_write = cabd;
+ }
+
+ if (HDR_ENCRYPTED(hdr)) {
+ eabd = abd_alloc_for_io(asize, ismd);
+
+ /*
+ * If the dataset was disowned before the buffer
+ * made it to this point, the key to re-encrypt
+ * it won't be available. In this case we simply
+ * won't write the buffer to the L2ARC.
+ */
+ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
+ FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
+ hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_ot,
+ hdr->b_crypt_hdr.b_iv, mac, psize, bswap, to_write,
+ eabd, &no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (no_crypt) {
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_free(eabd);
+ goto out;
+ }
+
+ if (psize != asize)
+ abd_zero_off(eabd, psize, asize - psize);
+
+ /* assert that the MAC we got here matches the one we saved */
+ ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ if (to_write == cabd)
+ abd_free(cabd);
+
+ to_write = eabd;
+ }
+
+out:
+ ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
+ *abd_out = to_write;
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ if (cabd != NULL)
+ abd_free(cabd);
+ if (eabd != NULL)
+ abd_free(eabd);
+
+ *abd_out = NULL;
+ return (ret);
+}
+
+/*
* Find and write ARC buffers to the L2ARC device.
*
* An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
@@ -7346,6 +8414,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
for (; hdr; hdr = hdr_prev) {
kmutex_t *hash_lock;
+ abd_t *to_write = NULL;
if (arc_warm == B_FALSE)
hdr_prev = multilist_sublist_next(mls, hdr);
@@ -7383,9 +8452,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3U(arc_hdr_size(hdr), >, 0);
- uint64_t psize = arc_hdr_size(hdr);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ uint64_t psize = HDR_GET_PSIZE(hdr);
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
psize);
@@ -7395,6 +8465,57 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+
+ /*
+ * If this header has b_rabd, we can use this since it
+ * must always match the data exactly as it exists on
+ * disk. Otherwise, the L2ARC can normally use the
+ * hdr's data, but if we're sharing data between the
+ * hdr and one of its bufs, L2ARC needs its own copy of
+ * the data so that the ZIO below can't race with the
+ * buf consumer. To ensure that this copy will be
+ * available for the lifetime of the ZIO and be cleaned
+ * up afterwards, we add it to the l2arc_free_on_write
+ * queue. If we need to apply any transforms to the
+ * data (compression, encryption) we will also need the
+ * extra buffer.
+ */
+ if (HDR_HAS_RABD(hdr) && psize == asize) {
+ to_write = hdr->b_crypt_hdr.b_rabd;
+ } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
+ HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
+ !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
+ psize == asize) {
+ to_write = hdr->b_l1hdr.b_pabd;
+ } else {
+ int ret;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ret = l2arc_apply_transforms(spa, hdr, asize,
+ &to_write);
+ if (ret != 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_L2_WRITING);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ l2arc_free_abd_on_write(to_write, asize, type);
+ }
+
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
@@ -7417,43 +8538,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
hdr->b_l2hdr.b_hits = 0;
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- arc_hdr_set_flags(hdr,
- ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
+ arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);
- (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr);
+ (void) refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
- /*
- * Normally the L2ARC can use the hdr's data, but if
- * we're sharing data between the hdr and one of its
- * bufs, L2ARC needs its own copy of the data so that
- * the ZIO below can't race with the buf consumer.
- * Another case where we need to create a copy of the
- * data is when the buffer size is not device-aligned
- * and we need to pad the block to make it such.
- * That also keeps the clock hand suitably aligned.
- *
- * To ensure that the copy will be available for the
- * lifetime of the ZIO and be cleaned up afterwards, we
- * add it to the l2arc_free_on_write queue.
- */
- abd_t *to_write;
- if (!HDR_SHARED_DATA(hdr) && psize == asize) {
- to_write = hdr->b_l1hdr.b_pabd;
- } else {
- to_write = abd_alloc_for_io(asize,
- HDR_ISTYPE_METADATA(hdr));
- abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
- if (asize != psize) {
- abd_zero_off(to_write, psize,
- asize - psize);
- }
- l2arc_free_abd_on_write(to_write, asize,
- arc_buf_type(hdr));
- }
wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, asize, to_write,
ZIO_CHECKSUM_OFF, NULL, hdr,
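Editor's note (not part of the patch): the arc.c changes above make arc_read() treat a lookup as a cache hit only if the cached form of the data can actually satisfy the request; a plain (decrypted) copy cannot serve a request for raw encrypted data, which must instead go back to disk so it works even when the keys are not loaded. The following minimal C sketch restates that predicate with plain booleans; the struct and function names are hypothetical stand-ins for arc_buf_hdr_t, HDR_HAS_RABD() and hdr->b_l1hdr.b_pabd.

/*
 * Minimal sketch of the cache-hit decision, assuming a simplified header
 * state. Names here are illustrative, not the real ARC data structures.
 */
#include <stdbool.h>
#include <stdio.h>

struct hdr_state {
	bool has_l1hdr;	/* header carries in-core (L1) state */
	bool has_rabd;	/* raw (encrypted) copy of the data is cached */
	bool has_pabd;	/* decrypted/decompressed copy is cached */
};

/*
 * A raw cached copy can satisfy any request; a plain copy cannot satisfy
 * a request for raw encrypted data, so that case is treated as a miss.
 */
static bool
arc_cache_hit(const struct hdr_state *h, bool encrypted_read)
{
	if (!h->has_l1hdr)
		return (false);
	return (h->has_rabd || (h->has_pabd && !encrypted_read));
}

int
main(void)
{
	struct hdr_state plain = { true, false, true };
	struct hdr_state raw = { true, true, false };

	/* plain cached copy cannot serve a raw-encrypted request */
	printf("plain copy, raw read -> %s\n",
	    arc_cache_hit(&plain, true) ? "hit" : "miss");
	/* raw cached copy serves both request types */
	printf("raw copy, normal read -> %s\n",
	    arc_cache_hit(&raw, false) ? "hit" : "miss");
	return (0);
}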
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
index 6cd2b019f..8f78e8de5 100644
--- a/module/zfs/bptree.c
+++ b/module/zfs/bptree.c
@@ -212,7 +212,8 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
err = 0;
for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
bptree_entry_phys_t bte;
- int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
+ TRAVERSE_NO_DECRYPT;
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 625e06701..745715861 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -964,7 +964,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
}
static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@@ -984,7 +984,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
+ } else if (err == 0) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
@@ -1003,7 +1003,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
- int err;
+ int err, zio_flags = 0;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -1021,6 +1021,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ?
+ dn->dn_dbuf->db_buf : NULL;
+
+ /* if the underlying dnode block is encrypted, decrypt it */
+ if (dn_buf != NULL && dn->dn_objset->os_encrypted &&
+ DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) &&
+ (flags & DB_RF_NO_DECRYPT) == 0 &&
+ arc_is_encrypted(dn_buf)) {
+ err = arc_untransform(dn_buf, dn->dn_objset->os_spa,
+ dmu_objset_id(dn->dn_objset), B_TRUE);
+ if (err != 0) {
+ DB_DNODE_EXIT(db);
+ mutex_exit(&db->db_mtx);
+ return (err);
+ }
+ }
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
@@ -1088,11 +1104,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true, it indicates tampering and we report an error.
+ */
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+ spa_log_error(db->db_objset->os_spa, &zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(db->db_objset));
+ return (SET_ERROR(EIO));
+ }
+
dbuf_add_ref(db, NULL);
+ zio_flags = (flags & DB_RF_CANFAIL) ?
+ ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
+
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ zio_flags |= ZIO_FLAG_RAW;
+
err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
@@ -1141,18 +1173,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
enum zio_compress compress_type =
arc_get_compression(db->db_buf);
- if (compress_type == ZIO_COMPRESS_OFF) {
- dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
- } else {
+ if (arc_is_encrypted(db->db_buf)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(db->db_buf, &byteorder, salt,
+ iv, mac);
+ dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+ dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+ mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+ compress_type);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
size, arc_buf_lsize(db->db_buf), compress_type);
+ } else {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
@@ -1188,16 +1233,21 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+
/*
- * If the arc buf is compressed, we need to decompress it to
- * read the data. This could happen during the "zfs receive" of
- * a stream which is compressed and deduplicated.
+ * If the arc buf is compressed or encrypted, we need to
+ * untransform it to read the data. This could happen during
+ * the "zfs receive" of a stream which is deduplicated and
+ * either raw or compressed. We do not need to do this if the
+ * caller wants raw encrypted data.
*/
- if (db->db_buf != NULL &&
- arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
- dbuf_fix_old_data(db,
- spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- err = arc_decompress(db->db_buf);
+ if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa,
+ dmu_objset_id(db->db_objset), B_FALSE);
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
@@ -1316,6 +1366,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_raw = B_FALSE;
/*
* Release the already-written buffer, so we leave it in
@@ -1908,11 +1959,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
-void
-dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+static void
+dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
dbuf_dirty_record_t *dr;
ASSERT(tx->tx_txg != 0);
@@ -1944,13 +1994,20 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
+ flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
- (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_read(db, NULL, flags);
(void) dbuf_dirty(db, tx);
}
void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+}
+
+void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -1977,6 +2034,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
}
+/*
+ * This function is effectively the same as dmu_buf_will_dirty(), but
+ * indicates the caller expects raw encrypted data in the db. It will
+ * also set the raw flag on the created dirty record.
+ */
+void
+dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+
+ dr = db->db_last_dirty;
+ while (dr != NULL && dr->dr_txg > tx->tx_txg)
+ dr = dr->dr_next;
+
+ ASSERT3P(dr, !=, NULL);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dr->dt.dl.dr_raw = B_TRUE;
+}
+
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
@@ -2117,10 +2197,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
if (db->db_blkid == DMU_BONUS_BLKID) {
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- ASSERT(db->db.db_data != NULL);
- kmem_free(db->db.db_data, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- db->db_state = DB_UNCACHED;
+ if (db->db.db_data != NULL) {
+ kmem_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ }
}
dbuf_clear_data(db);
@@ -2416,7 +2497,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
* prefetch if the next block down is our target.
*/
static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
uint64_t nextblkid;
@@ -2438,7 +2519,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
- if (zio->io_flags & ZIO_FLAG_RAW) {
+ if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
@@ -2463,7 +2544,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ if (BP_IS_HOLE(bp) || err != 0) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@@ -2491,7 +2572,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
- * complete.
+ * complete. Note that the prefetch might fail if the dataset is encrypted and
+ * the encryption key is unmapped before the IO completes.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
@@ -3121,6 +3203,41 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
}
/*
+ * Ensure the dbuf's data is untransformed if the associated dirty
+ * record requires it. This is used by dbuf_sync_leaf() to ensure
+ * that a dnode block is decrypted before we write new data to it.
+ * For raw writes we assert that the buffer is already encrypted.
+ */
+static void
+dbuf_check_crypt(dbuf_dirty_record_t *dr)
+{
+ int err;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!dr->dt.dl.dr_raw && arc_is_encrypted(db->db_buf)) {
+ /*
+ * Unfortunately, there is currently no mechanism for
+ * syncing context to handle decryption errors. An error
+ * here is only possible if an attacker maliciously
+ * changed a dnode block and updated the associated
+ * checksums going up the block tree.
+ */
+ err = arc_untransform(db->db_buf, db->db_objset->os_spa,
+ dmu_objset_id(db->db_objset), B_TRUE);
+ if (err)
+ panic("Invalid dnode block MAC");
+ } else if (dr->dt.dl.dr_raw) {
+ /*
+ * Writing raw encrypted data requires the db's arc buffer
+ * to be converted to raw by the caller.
+ */
+ ASSERT(arc_is_encrypted(db->db_buf));
+ }
+}
+
+/*
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
 * is critical that we not allow the compiler to inline this function into
* dbuf_sync_list() thereby drastically bloating the stack usage.
@@ -3241,9 +3358,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ bcopy(*datap, DN_BONUS(dn->dn_phys),
+ DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
@@ -3290,6 +3408,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
+ /*
+ * If this is a dnode block, ensure it is appropriately encrypted
+ * or decrypted, depending on what we are writing to it this txg.
+ */
+ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+ dbuf_check_crypt(dr);
+
if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
@@ -3307,16 +3432,27 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* DNONE_DNODE blocks).
*/
int psize = arc_buf_size(*datap);
+ int lsize = arc_buf_lsize(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
enum zio_compress compress_type = arc_get_compression(*datap);
- if (compress_type == ZIO_COMPRESS_OFF) {
- *datap = arc_alloc_buf(os->os_spa, db, type, psize);
- } else {
+ if (arc_is_encrypted(*datap)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+ *datap = arc_alloc_raw_buf(os->os_spa, db,
+ dmu_objset_id(os), byteorder, salt, iv, mac,
+ dn->dn_type, psize, lsize, compress_type);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
int lsize = arc_buf_lsize(*datap);
*datap = arc_alloc_compressed_buf(os->os_spa, db,
psize, lsize, compress_type);
+ } else {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
bcopy(db->db.db_data, (*datap)->b_data, psize);
}
@@ -3453,7 +3589,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
if (!BP_IS_EMBEDDED(bp))
- bp->blk_fill = fill;
+ BP_SET_FILL(bp, fill);
mutex_exit(&db->db_mtx);
@@ -3778,7 +3914,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
- arc_done_func_t *children_ready_cb = NULL;
+ arc_write_done_func_t *children_ready_cb = NULL;
ASSERT(arc_released(data));
/*
@@ -3810,6 +3946,7 @@ EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_will_change_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
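Editor's note (not part of the patch): in the dbuf.c changes above, both dbuf_fix_old_data() and dbuf_sync_leaf() now choose between three ways of duplicating an ARC buffer: a raw buffer for encrypted data (carrying salt, IV and MAC), a compressed buffer, or an ordinary buffer. The sketch below captures that three-way choice; the enum and function are hypothetical, whereas the real code calls arc_alloc_raw_buf(), arc_alloc_compressed_buf() or arc_alloc_buf().

/*
 * Minimal sketch of the buffer-copy selection, assuming only the two
 * flags the real code inspects (encrypted and compressed).
 */
#include <stdbool.h>
#include <stdio.h>

enum buf_kind { BUF_RAW, BUF_COMPRESSED, BUF_PLAIN };

/*
 * Encrypted buffers must be copied as raw ciphertext so the crypto
 * parameters travel with the data; otherwise compression is preserved,
 * and everything else becomes an ordinary uncompressed buffer.
 */
static enum buf_kind
pick_copy_kind(bool encrypted, bool compressed)
{
	if (encrypted)
		return (BUF_RAW);
	if (compressed)
		return (BUF_COMPRESSED);
	return (BUF_PLAIN);
}

int
main(void)
{
	static const char *names[] = { "raw", "compressed", "plain" };

	printf("encrypted  -> %s\n", names[pick_copy_kind(true, true)]);
	printf("compressed -> %s\n", names[pick_copy_kind(false, true)]);
	printf("neither    -> %s\n", names[pick_copy_kind(false, false)]);
	return (0);
}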
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 75ab7f5b2..f3ccc94c8 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -269,6 +269,10 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}
+/*
+ * The bp created via this function may be used for repairs and scrub, but it
+ * will be missing the salt / IV required to do a full decrypting read.
+ */
void
ddt_bp_create(enum zio_checksum checksum,
const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
@@ -279,11 +283,12 @@ ddt_bp_create(enum zio_checksum checksum,
ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
bp->blk_cksum = ddk->ddk_cksum;
- bp->blk_fill = 1;
BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
+ BP_SET_FILL(bp, 1);
BP_SET_CHECKSUM(bp, checksum);
BP_SET_TYPE(bp, DMU_OT_DEDUP);
BP_SET_LEVEL(bp, 0);
@@ -297,9 +302,12 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
ddk->ddk_cksum = bp->blk_cksum;
ddk->ddk_prop = 0;
+ ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
+
DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+ DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
}
void
@@ -389,7 +397,7 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
if (ddp->ddp_phys_birth == 0)
continue;
- for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (d = 0; d < DDE_GET_NDVAS(dde); d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
dds->dds_blocks += 1;
@@ -562,6 +570,7 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
uint64_t ditto = spa->spa_dedup_ditto;
int total_copies = 0;
int desired_copies = 0;
+ int copies_needed = 0;
int p;
for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
@@ -588,7 +597,13 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
if (total_refcnt >= ditto * ditto)
desired_copies++;
- return (MAX(desired_copies, total_copies) - total_copies);
+ copies_needed = MAX(desired_copies, total_copies) - total_copies;
+
+ /* encrypted blocks store their IV in DVA[2] */
+ if (DDK_GET_CRYPT(&dde->dde_key))
+ copies_needed = MIN(copies_needed, SPA_DVAS_PER_BP - 1);
+
+ return (copies_needed);
}
int
@@ -599,7 +614,7 @@ ddt_ditto_copies_present(ddt_entry_t *dde)
int copies = 0 - DVA_GET_GANG(dva);
int d;
- for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ for (d = 0; d < DDE_GET_NDVAS(dde); d++, dva++)
if (DVA_IS_VALID(dva))
copies++;
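Editor's note (not part of the patch): the ddt.c changes above account for the fact that an encrypted block gives up its third DVA to hold the salt and IV, so dedup ditto logic can never ask for more than SPA_DVAS_PER_BP - 1 copies. A minimal sketch of that clamp follows; the function name is hypothetical and the arithmetic is a simplification of ddt_ditto_copies_needed().

/*
 * Minimal sketch of the ditto-copies clamp for encrypted dedup blocks.
 * SPA_DVAS_PER_BP is 3 in ZFS; encryption reserves DVA[2].
 */
#include <stdio.h>

#define	SPA_DVAS_PER_BP	3

static int
ditto_copies_needed(int desired, int present, int is_encrypted)
{
	int needed = desired > present ? desired - present : 0;

	/* encrypted blocks store their salt/IV in DVA[2] */
	if (is_encrypted && needed > SPA_DVAS_PER_BP - 1)
		needed = SPA_DVAS_PER_BP - 1;
	return (needed);
}

int
main(void)
{
	/* an encrypted dedup block can never be asked for three new copies */
	printf("plain:     %d\n", ditto_copies_needed(3, 0, 0));
	printf("encrypted: %d\n", ditto_copies_needed(3, 0, 1));
	return (0);
}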
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 717bd121f..e098a4966 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -73,60 +73,60 @@ unsigned long zfs_per_txg_dirty_frees_percent = 30;
int zfs_dmu_offset_next_sync = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { DMU_BSWAP_UINT8, TRUE, "unallocated" },
- { DMU_BSWAP_ZAP, TRUE, "object directory" },
- { DMU_BSWAP_UINT64, TRUE, "object array" },
- { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
- { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
- { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
- { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
- { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
- { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
- { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
- { DMU_BSWAP_ZAP, TRUE, "DSL props" },
- { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
- { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
- { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
- { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
- { DMU_BSWAP_UINT8, FALSE, "zvol object" },
- { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
- { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
- { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
- { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
- { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
- { DMU_BSWAP_UINT8, TRUE, "SPA history" },
- { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
- { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
- { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
- { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
- { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
- { DMU_BSWAP_UINT8, TRUE, "FUID table" },
- { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
- { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
- { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
- { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
- { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
- { DMU_BSWAP_UINT8, TRUE, "System attributes" },
- { DMU_BSWAP_ZAP, TRUE, "SA master node" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
- { DMU_BSWAP_ZAP, TRUE, "scan translations" },
- { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
- { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
- { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "object array" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "ZIL intent log" },
+ { DMU_BSWAP_DNODE, TRUE, TRUE, "DMU dnode" },
+ { DMU_BSWAP_OBJSET, TRUE, FALSE, "DMU objset" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL directory child map"},
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset snap map" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL props" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL dataset" },
+ { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
+ { DMU_BSWAP_OLDACL, TRUE, TRUE, "ZFS V0 ACL" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "ZFS plain file" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS delete queue" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "zvol object" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "other uint8[]" },
+ { DMU_BSWAP_UINT64, FALSE, TRUE, "other uint64[]" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "Pool properties" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL permissions" },
+ { DMU_BSWAP_ACL, TRUE, TRUE, "ZFS ACL" },
+ { DMU_BSWAP_UINT8, TRUE, TRUE, "ZFS SYSACL" },
+ { DMU_BSWAP_UINT8, TRUE, TRUE, "FUID table" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"},
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group used" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group quota" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"},
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
+ { DMU_BSWAP_UINT8, TRUE, TRUE, "System attributes" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "SA master node" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr registration" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr layouts" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "deduplicated block" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL deadlist map" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL deadlist map hdr" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dir clones" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
@@ -198,6 +198,8 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
@@ -221,6 +223,8 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
@@ -321,11 +325,18 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* returns ENOENT, EIO, or 0.
*/
int
-dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
+ dmu_buf_t **dbp)
{
dnode_t *dn;
dmu_buf_impl_t *db;
int error;
+ uint32_t db_flags = DB_RF_MUST_SUCCEED;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
error = dnode_hold(os, object, FTAG, &dn);
if (error)
@@ -355,12 +366,24 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dnode_rele(dn, FTAG);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
+ error = dbuf_read(db, NULL, db_flags);
+ if (error) {
+ dnode_evict_bonus(dn);
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ return (error);
+ }
*dbp = &db->db;
return (0);
}
+int
+dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
+{
+ return (dmu_bonus_hold_impl(os, obj, tag, DMU_READ_NO_PREFETCH, dbp));
+}
+
/*
* returns ENOENT, EIO, or 0.
*
@@ -601,8 +624,8 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 * indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
*
- * Note that if the indirect blocks above the blocks being prefetched are not in
- * cache, they will be asychronously read in.
+ * Note that if the indirect blocks above the blocks being prefetched are not
+ * in cache, they will be asynchronously read in.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
@@ -1462,6 +1485,83 @@ dmu_return_arcbuf(arc_buf_t *buf)
arc_buf_destroy(buf, FTAG);
}
+void
+dmu_assign_arcbuf_impl(dmu_buf_t *handle, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ dbuf_assign_arcbuf(db, buf, tx);
+}
+
+void
+dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
+ const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
+{
+ dmu_object_type_t type;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ uint64_t dsobj = dmu_objset_id(db->db_objset);
+
+ ASSERT3P(db->db_buf, !=, NULL);
+ ASSERT3U(dsobj, !=, 0);
+
+ dmu_buf_will_change_crypt_params(handle, tx);
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /*
+ * This technically violates the assumption the dmu code makes
+ * that dnode blocks are only released in syncing context.
+ */
+ (void) arc_release(db->db_buf, db);
+ arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
+}
+
+void
+dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
+ dmu_buf_t *handle, dmu_tx_t *tx)
+{
+ dmu_buf_t *dst_handle;
+ dmu_buf_impl_t *dstdb;
+ dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle;
+ arc_buf_t *abuf;
+ uint64_t datalen;
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ ASSERT3P(srcdb->db_buf, !=, NULL);
+
+ /* hold the db that we want to write to */
+ VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle,
+ DMU_READ_NO_DECRYPT));
+ dstdb = (dmu_buf_impl_t *)dst_handle;
+ datalen = arc_buf_size(srcdb->db_buf);
+
+ /* allocate an arc buffer that matches the type of srcdb->db_buf */
+ if (arc_is_encrypted(srcdb->db_buf)) {
+ arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac);
+ abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os),
+ byteorder, salt, iv, mac, DB_DNODE(dstdb)->dn_type,
+ datalen, arc_buf_lsize(srcdb->db_buf),
+ arc_get_compression(srcdb->db_buf));
+ } else {
+ /* we won't get a compressed db back from dmu_buf_hold() */
+ ASSERT3U(arc_get_compression(srcdb->db_buf),
+ ==, ZIO_COMPRESS_OFF);
+ abuf = arc_loan_buf(os->os_spa,
+ DMU_OT_IS_METADATA(DB_DNODE(dstdb)->dn_type), datalen);
+ }
+
+ ASSERT3U(datalen, ==, arc_buf_size(abuf));
+
+ /* copy the data to the new buffer and assign it to the dstdb */
+ bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen);
+ dbuf_assign_arcbuf(dstdb, abuf, tx);
+ dmu_buf_rele(dst_handle, FTAG);
+}
+
/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
@@ -1537,7 +1637,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
BP_SET_LSIZE(bp, db->db_size);
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
- bp->blk_fill = 1;
+ BP_SET_FILL(bp, 1);
}
}
}
@@ -1843,6 +1943,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
}
int
+dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_nlevels(dn, nlevels, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
{
@@ -1916,6 +2030,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
+ boolean_t encrypt = B_FALSE;
int copies = os->os_copies;
/*
@@ -2003,16 +2118,44 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
- zp->zp_checksum = checksum;
- zp->zp_compress = compress;
- ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+ /*
+ * All objects in an encrypted objset are protected from modification
+ * via a MAC. Encrypted objects store their IV and salt in the last DVA
+ * in the bp, so we cannot use all copies. Encrypted objects are also
+ * not subject to nopwrite since writing the same data will still
+ * result in a new ciphertext. Only encrypted blocks can be dedup'd
+ * to avoid ambiguity in the dedup code since the DDT does not store
+ * object types.
+ */
+ if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
+ encrypt = B_TRUE;
+
+ if (DMU_OT_IS_ENCRYPTED(type)) {
+ copies = MIN(copies, SPA_DVAS_PER_BP - 1);
+ nopwrite = B_FALSE;
+ } else {
+ dedup = B_FALSE;
+ }
+
+ if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
+ compress = ZIO_COMPRESS_EMPTY;
+ }
+ zp->zp_compress = compress;
+ zp->zp_checksum = checksum;
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
+ zp->zp_encrypt = encrypt;
+ zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}
/*
@@ -2267,6 +2410,7 @@ EXPORT_SYMBOL(dmu_object_info_from_dnode);
EXPORT_SYMBOL(dmu_object_info_from_db);
EXPORT_SYMBOL(dmu_object_size_from_db);
EXPORT_SYMBOL(dmu_object_dnsize_from_db);
+EXPORT_SYMBOL(dmu_object_set_nlevels);
EXPORT_SYMBOL(dmu_object_set_blocksize);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 9a7a6968d..3faa299d1 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -56,6 +56,7 @@
#include <sys/vdev.h>
#include <sys/policy.h>
#include <sys/spa_impl.h>
+#include <sys/dmu_send.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
@@ -391,16 +392,23 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (!BP_IS_HOLE(os->os_rootbp)) {
arc_flags_t aflags = ARC_FLAG_WAIT;
zbookmark_phys_t zb;
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_FLAG_L2CACHE;
+ if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ ASSERT(BP_IS_AUTHENTICATED(bp));
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
arc_getbuf_func, &os->os_phys_buf,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (err != 0) {
kmem_free(os, sizeof (objset_t));
/* convert checksum errors into IO errors */
@@ -441,6 +449,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (ds != NULL) {
boolean_t needlock = B_FALSE;
+ os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
+
/*
* Note: it's valid to open the objset if the dataset is
* long-held, in which case the pool_config lock will not
@@ -450,6 +460,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
needlock = B_TRUE;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
}
+
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@@ -517,6 +528,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_ON;
+ os->os_encrypted = B_FALSE;
os->os_copies = spa_max_replication(spa);
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
os->os_dedup_verify = B_FALSE;
@@ -603,16 +615,18 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
* can be held at a time.
*/
int
-dmu_objset_hold(const char *name, void *tag, objset_t **osp)
+dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
+ objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
+ ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
err = dsl_pool_hold(name, tag, &dp);
if (err != 0)
return (err);
- err = dsl_dataset_hold(dp, name, tag, &ds);
+ err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, tag);
return (err);
@@ -627,23 +641,38 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
return (err);
}
+int
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
+{
+ return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
+}
+
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp)
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
int err;
err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
- dsl_dataset_disown(ds, tag);
+ return (err);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
- dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
- dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
- return (err);
+
+ /* if we are decrypting, we can now check MACs in os->os_phys_buf */
+ if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
+ err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
+ ds->ds_object, B_FALSE);
+ if (err != 0)
+ return (err);
+
+ ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
+ }
+
+ return (0);
}
/*
@@ -653,51 +682,73 @@ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
*/
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp)
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
+ ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
err = dsl_pool_hold(name, FTAG, &dp);
if (err != 0)
return (err);
- err = dsl_dataset_own(dp, name, tag, &ds);
+ err = dsl_dataset_own(dp, name, flags, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
+ err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, flags, tag);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
dsl_pool_rele(dp, FTAG);
- if (err == 0 && dmu_objset_userobjspace_upgradable(*osp))
+ if (dmu_objset_userobjspace_upgradable(*osp))
dmu_objset_userobjspace_upgrade(*osp);
- return (err);
+ return (0);
}
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
- boolean_t readonly, void *tag, objset_t **osp)
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
dsl_dataset_t *ds;
int err;
+ ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
- err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+ err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
if (err != 0)
return (err);
- return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+ err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, flags, tag);
+ return (err);
+ }
+
+ return (0);
}
void
-dmu_objset_rele(objset_t *os, void *tag)
+dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
{
+ ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+
dsl_pool_t *dp = dmu_objset_pool(os);
- dsl_dataset_rele(os->os_dsl_dataset, tag);
+ dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
dsl_pool_rele(dp, tag);
}
+void
+dmu_objset_rele(objset_t *os, void *tag)
+{
+ dmu_objset_rele_flags(os, B_FALSE, tag);
+}
+
/*
* When we are called, os MUST refer to an objset associated with a dataset
* that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
@@ -710,7 +761,7 @@ dmu_objset_rele(objset_t *os, void *tag)
* same name so that it can be partially torn down and reconstructed.
*/
void
-dmu_objset_refresh_ownership(objset_t *os, void *tag)
+dmu_objset_refresh_ownership(objset_t *os, boolean_t decrypt, void *tag)
{
dsl_pool_t *dp;
dsl_dataset_t *ds, *newds;
@@ -724,20 +775,22 @@ dmu_objset_refresh_ownership(objset_t *os, void *tag)
dsl_dataset_name(ds, name);
dp = dmu_objset_pool(os);
dsl_pool_config_enter(dp, FTAG);
- dmu_objset_disown(os, tag);
- VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
+ dmu_objset_disown(os, decrypt, tag);
+ VERIFY0(dsl_dataset_own(dp, name,
+ (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, &newds));
VERIFY3P(newds, ==, os->os_dsl_dataset);
dsl_pool_config_exit(dp, FTAG);
}
void
-dmu_objset_disown(objset_t *os, void *tag)
+dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
{
/*
* Stop upgrading thread
*/
dmu_objset_upgrade_stop(os);
- dsl_dataset_disown(os->os_dsl_dataset, tag);
+ dsl_dataset_disown(os->os_dsl_dataset,
+ (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag);
}
void
@@ -820,6 +873,8 @@ dmu_objset_evict(objset_t *os)
} else {
mutex_exit(&os->os_lock);
}
+
+
}
void
@@ -866,16 +921,20 @@ dmu_objset_snap_cmtime(objset_t *os)
return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}
-/* called from dsl for meta-objset */
objset_t *
-dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- dmu_objset_type_t type, dmu_tx_t *tx)
+dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
{
objset_t *os;
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
+ if (blksz == 0)
+ blksz = DNODE_BLOCK_SIZE;
+	if (ibs == 0)
+ ibs = DN_MAX_INDBLKSHIFT;
+
if (ds != NULL)
VERIFY0(dmu_objset_from_ds(ds, &os));
else
@@ -883,8 +942,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mdn = DMU_META_DNODE(os);
- dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
- DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
+ dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
+ DNODE_MIN_SLOTS, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
@@ -898,22 +957,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* to convergence, so minimizing its dn_nlevels matters.
*/
if (ds != NULL) {
- int levels = 1;
-
- /*
- * Determine the number of levels necessary for the meta-dnode
- * to contain DN_MAX_OBJECT dnodes. Note that in order to
- * ensure that we do not overflow 64 bits, there has to be
- * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
- * but < 2^64. Therefore,
- * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
- * less than (64 - log2(DN_MAX_OBJECT)) (16).
- */
- while ((uint64_t)mdn->dn_nblkptr <<
- (mdn->dn_datablkshift - DNODE_SHIFT +
- (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
- DN_MAX_OBJECT)
- levels++;
+ if (levels == 0) {
+ levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the
+ * meta-dnode to contain DN_MAX_OBJECT dnodes. Note
+ * that in order to ensure that we do not overflow
+ * 64 bits, there has to be a nlevels that gives us a
+ * number of blocks > DN_MAX_OBJECT but < 2^64.
+ * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
+ * (10) must be less than (64 - log2(DN_MAX_OBJECT))
+ * (16).
+ */
+ while ((uint64_t)mdn->dn_nblkptr <<
+ (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
+ (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT)
+ levels++;
+ }
mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
mdn->dn_nlevels = levels;
@@ -923,7 +985,13 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(type != DMU_OST_ANY);
ASSERT(type < DMU_OST_NUMTYPES);
os->os_phys->os_type = type;
- if (dmu_objset_userused_enabled(os)) {
+
+ /*
+	 * Enable user accounting if the feature is enabled and this is not
+	 * an encrypted receive.
+ */
+ if (dmu_objset_userused_enabled(os) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
if (dmu_objset_userobjused_enabled(os)) {
ds->ds_feature_activation_needed[
@@ -939,6 +1007,14 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
return (os);
}
+/* called from dsl for meta-objset */
+objset_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
+}
+
typedef struct dmu_objset_create_arg {
const char *doca_name;
cred_t *doca_cred;
@@ -947,6 +1023,7 @@ typedef struct dmu_objset_create_arg {
void *doca_userarg;
dmu_objset_type_t doca_type;
uint64_t doca_flags;
+ dsl_crypto_params_t *doca_dcp;
} dmu_objset_create_arg_t;
/*ARGSUSED*/
@@ -972,8 +1049,16 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx)
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
+
+ error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
doca->doca_cred);
+
dsl_dir_rele(pdd, FTAG);
return (error);
@@ -990,13 +1075,15 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
uint64_t obj;
blkptr_t *bp;
objset_t *os;
+ zio_t *rzio;
VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
- doca->doca_cred, tx);
+ doca->doca_cred, doca->doca_dcp, tx);
- VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+ VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bp = dsl_dataset_get_blkptr(ds);
os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
@@ -1008,18 +1095,56 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
doca->doca_cred, tx);
}
+ /*
+ * The doca_userfunc() will write out some data that needs to be
+ * encrypted if the dataset is encrypted (specifically the root
+ * directory). This data must be written out before the encryption
+ * key mapping is removed by dsl_dataset_rele_flags(). Force the
+ * I/O to occur immediately by invoking the relevant sections of
+ * dsl_pool_sync().
+ */
+ if (os->os_encrypted) {
+ dsl_dataset_t *tmpds = NULL;
+ boolean_t need_sync_done = B_FALSE;
+
+ rzio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ tmpds = txg_list_remove(&dp->dp_dirty_datasets, tx->tx_txg);
+ if (tmpds != NULL) {
+ ASSERT3P(ds, ==, tmpds);
+ dsl_dataset_sync(ds, rzio, tx);
+ need_sync_done = B_TRUE;
+ }
+ VERIFY0(zio_wait(rzio));
+
+ dmu_objset_do_userquota_updates(os, tx);
+ taskq_wait(dp->dp_sync_taskq);
+
+ rzio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ tmpds = txg_list_remove(&dp->dp_dirty_datasets, tx->tx_txg);
+ if (tmpds != NULL) {
+ ASSERT3P(ds, ==, tmpds);
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, rzio, tx);
+ }
+ VERIFY0(zio_wait(rzio));
+
+ if (need_sync_done)
+ dsl_dataset_sync_done(ds, tx);
+ }
+
spa_history_log_internal_ds(ds, "create", tx, "");
zvol_create_minors(dp->dp_spa, doca->doca_name, B_TRUE);
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
dsl_dir_rele(pdd, FTAG);
}
int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
- void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
+ dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
{
dmu_objset_create_arg_t doca;
+ dsl_crypto_params_t tmp_dcp = { 0 };
doca.doca_name = name;
doca.doca_cred = CRED();
@@ -1028,9 +1153,19 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
doca.doca_userarg = arg;
doca.doca_type = type;
+ /*
+ * Some callers (mostly for testing) do not provide a dcp on their
+ * own but various code inside the sync task will require it to be
+ * allocated. Rather than adding NULL checks throughout this code
+	 * or adding dummy dcp's to all of the callers, we simply create a
+	 * dummy one here and use that. This zero dcp will have the same
+	 * effect as asking for inheritance of all encryption params.
+ */
+ doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;
+
return (dsl_sync_task(name,
dmu_objset_create_check, dmu_objset_create_sync, &doca,
- 5, ZFS_SPACE_CHECK_NORMAL));
+ 6, ZFS_SPACE_CHECK_NORMAL));
}
typedef struct dmu_objset_clone_arg {
@@ -1070,18 +1205,29 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EDQUOT));
}
- dsl_dir_rele(pdd, FTAG);
error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
- if (error != 0)
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
return (error);
+ }
/* You can only clone snapshots, not the head datasets. */
if (!origin->ds_is_snapshot) {
dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EINVAL));
}
+
+ error = dmu_objset_clone_crypt_check(pdd, origin->ds_dir);
+ if (error != 0) {
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
return (0);
}
@@ -1101,7 +1247,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
- doca->doca_cred, tx);
+ doca->doca_cred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
dsl_dataset_name(origin, namebuf);
@@ -1124,7 +1270,7 @@ dmu_objset_clone(const char *clone, const char *origin)
return (dsl_sync_task(clone,
dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
- 5, ZFS_SPACE_CHECK_NORMAL));
+ 6, ZFS_SPACE_CHECK_NORMAL));
}
int
@@ -1232,6 +1378,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
blkptr_t *bp = zio->io_bp;
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ uint64_t fill = 0;
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
@@ -1243,9 +1390,11 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
* objects that are stored in the objset_phys_t -- the meta
* dnode and user/group accounting objects).
*/
- bp->blk_fill = 0;
for (i = 0; i < dnp->dn_nblkptr; i++)
- bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+ fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+
+ BP_SET_FILL(bp, fill);
+
if (os->os_dsl_dataset != NULL)
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
*os->os_rootbp = *bp;
@@ -1334,6 +1483,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_write_policy(os, NULL, 0, 0, &zp);
+ /*
+	 * If we are either claiming the ZIL or doing a raw receive, write out
+	 * the os_phys_buf raw. Neither of these actions will affect the MAC
+ * at this point.
+ */
+ if (arc_is_unauthenticated(os->os_phys_buf) || os->os_next_write_raw) {
+ ASSERT(os->os_encrypted);
+ os->os_next_write_raw = B_FALSE;
+ arc_convert_to_raw(os->os_phys_buf,
+ os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
+ DMU_OT_OBJSET, NULL, NULL, NULL);
+ }
+
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
@@ -1357,7 +1519,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
txgoff = tx->tx_txg & TXG_MASK;
- if (dmu_objset_userused_enabled(os)) {
+ if (dmu_objset_userused_enabled(os) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
/*
* We must create the list here because it uses the
* dn_dirty_link[] of this txg. But it may already
@@ -1637,6 +1800,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
if (!dmu_objset_userused_enabled(os))
return;
+ /* if this is a raw receive just return and handle accounting later */
+ if (os->os_encrypted && dmu_objset_is_receiving(os))
+ return;
+
/* Allocate the user/groupused objects if necessary. */
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
VERIFY0(zap_create_claim(os,
@@ -1716,6 +1883,18 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
if (!dmu_objset_userused_enabled(dn->dn_objset))
return;
+ /*
+ * Raw receives introduce a problem with user accounting. Raw
+ * receives cannot update the user accounting info because the
+ * user ids and the sizes are encrypted. To guarantee that we
+ * never end up with bad user accounting, we simply disable it
+ * during raw receives. We also disable this for normal receives
+ * so that an incremental raw receive may be done on top of an
+ * existing non-raw receive.
+ */
+ if (os->os_encrypted && dmu_objset_is_receiving(os))
+ return;
+
if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
DN_ID_CHKED_SPILL)))
return;
@@ -2493,8 +2672,10 @@ EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
+EXPORT_SYMBOL(dmu_objset_hold_flags);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
+EXPORT_SYMBOL(dmu_objset_rele_flags);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
@@ -2512,6 +2693,7 @@ EXPORT_SYMBOL(dmu_objset_dnodesize);
EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
+EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 53f96f83b..aca50197b 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -109,18 +109,17 @@ dump_bytes_cb(void *arg)
ssize_t resid; /* have to get resid to get detailed errno */
/*
- * The code does not rely on this (len being a multiple of 8). We keep
+ * The code does not rely on len being a multiple of 8. We keep
* this assertion because of the corresponding assertion in
* receive_read(). Keeping this assertion ensures that we do not
* inadvertently break backwards compatibility (causing the assertion
- * in receive_read() to trigger on old software).
- *
- * Removing the assertions could be rolled into a new feature that uses
- * data that isn't 8-byte aligned; if the assertions were removed, a
- * feature flag would have to be added.
+ * in receive_read() to trigger on old software). Newer feature flags
+ * (such as raw send) may break this assertion since they were
+ * introduced after the requirement was made obsolete.
*/
- ASSERT0(dbi->dbi_len % 8);
+ ASSERT(dbi->dbi_len % 8 == 0 ||
+ (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
(caddr_t)dbi->dbi_buf, dbi->dbi_len,
@@ -282,11 +281,11 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
}
static int
-dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp,
- void *data)
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
+ uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
{
uint64_t payload_size;
+ boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
/*
@@ -319,16 +318,37 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_toguid = dsp->dsa_toguid;
drrw->drr_logical_size = lsize;
- /* only set the compression fields if the buf is compressed */
- if (lsize != psize) {
- ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED);
+ /* only set the compression fields if the buf is compressed or raw */
+ if (raw || lsize != psize) {
ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
- ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
ASSERT3S(psize, >, 0);
- ASSERT3S(lsize, >=, psize);
+ if (raw) {
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /*
+ * This is a raw protected block so we set the encrypted
+ * flag. We need to pass along everything the receiving
+ * side will need to interpret this block, including the
+ * byteswap, salt, IV, and MAC.
+ */
+ drrw->drr_flags |= DRR_RAW_ENCRYPTED;
+ if (BP_SHOULD_BYTESWAP(bp))
+ drrw->drr_flags |= DRR_RAW_BYTESWAP;
+ zio_crypt_decode_params_bp(bp, drrw->drr_salt,
+ drrw->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
+ } else {
+ /* this is a compressed block */
+ ASSERT(dsp->dsa_featureflags &
+ DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
+ ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
+ ASSERT3S(lsize, >=, psize);
+ }
+
+ /* set fields common to compressed and raw sends */
drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
drrw->drr_compressed_size = psize;
payload_size = drrw->drr_compressed_size;
@@ -336,22 +356,23 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
payload_size = drrw->drr_logical_size;
}
- if (bp == NULL || BP_IS_EMBEDDED(bp)) {
+ if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
/*
- * There's no pre-computed checksum for partial-block
- * writes or embedded BP's, so (like
- * fletcher4-checkummed blocks) userland will have to
- * compute a dedup-capable checksum itself.
+ * There's no pre-computed checksum for partial-block writes,
+ * embedded BP's, or encrypted BP's that are being sent as
+	 * plaintext, so (like fletcher4-checksummed blocks) userland
+ * will have to compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
ZCHECKSUM_FLAG_DEDUP)
- drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
drrw->drr_key.ddk_cksum = bp->blk_cksum;
}
@@ -395,9 +416,10 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
}
static int
-dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
+dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
{
struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
+ uint64_t blksz = BP_GET_LSIZE(bp);
if (dsp->dsa_pending_op != PENDING_NONE) {
if (dump_record(dsp, NULL, 0) != 0)
@@ -412,6 +434,18 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
drrs->drr_length = blksz;
drrs->drr_toguid = dsp->dsa_toguid;
+ /* handle raw send fields */
+ if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0 &&
+ BP_IS_PROTECTED(bp)) {
+ drrs->drr_flags |= DRR_RAW_ENCRYPTED;
+ if (BP_SHOULD_BYTESWAP(bp))
+ drrs->drr_flags |= DRR_RAW_BYTESWAP;
+ drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrs->drr_compressed_size = BP_GET_PSIZE(bp);
+ zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
+ }
+
if (dump_record(dsp, data, blksz) != 0)
return (SET_ERROR(EINTR));
return (0);
@@ -464,9 +498,11 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
}
static int
-dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
+dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
+ dnode_phys_t *dnp)
{
struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
+ int bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);
if (object < dsp->dsa_resume_object) {
/*
@@ -507,11 +543,31 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
- if (dump_record(dsp, DN_BONUS(dnp),
- P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
- return (SET_ERROR(EINTR));
+ if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) &&
+ BP_IS_PROTECTED(bp)) {
+ drro->drr_flags |= DRR_RAW_ENCRYPTED;
+ if (BP_SHOULD_BYTESWAP(bp))
+ drro->drr_flags |= DRR_RAW_BYTESWAP;
+
+ /* needed for reconstructing dnp on recv side */
+ drro->drr_indblkshift = dnp->dn_indblkshift;
+ drro->drr_nlevels = dnp->dn_nlevels;
+ drro->drr_nblkptr = dnp->dn_nblkptr;
+
+ /*
+ * Since we encrypt the entire bonus area, the (raw) part
+		 * beyond the bonuslen is actually nonzero, so we need
+ * to send it.
+ */
+ if (bonuslen != 0) {
+ drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
+ bonuslen = drro->drr_raw_bonuslen;
+ }
}
+ if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
+ return (SET_ERROR(EINTR));
+
/* Free anything past the end of the file. */
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
@@ -521,6 +577,42 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
return (0);
}
+static int
+dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
+ uint64_t numslots)
+{
+ struct drr_object_range *drror =
+ &(dsp->dsa_drr->drr_u.drr_object_range);
+
+ /* we only use this record type for raw sends */
+ ASSERT(BP_IS_PROTECTED(bp));
+ ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
+ drror->drr_firstobj = firstobj;
+ drror->drr_numslots = numslots;
+ drror->drr_toguid = dsp->dsa_toguid;
+ drror->drr_flags |= DRR_RAW_ENCRYPTED;
+ if (BP_SHOULD_BYTESWAP(bp))
+ drror->drr_flags |= DRR_RAW_BYTESWAP;
+ zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drror->drr_mac);
+
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
@@ -564,6 +656,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
zb->zb_object >= sta->resume.zb_object);
+ ASSERT3P(sta->ds, !=, NULL);
if (sta->cancel)
return (SET_ERROR(EINTR));
@@ -639,6 +732,18 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
zb->zb_object >= dsa->dsa_resume_object);
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+	 * If this is not true, it indicates tampering and we report an error.
+ */
+ if (dsa->dsa_os->os_encrypted &&
+ !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
+ spa_log_error(spa, zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", ds->ds_object);
+ return (SET_ERROR(EIO));
+ }
+
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
@@ -658,34 +763,57 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
int i;
+ if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(BP_IS_ENCRYPTED(bp));
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ zioflags |= ZIO_FLAG_RAW;
+ }
+
ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0)
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
return (SET_ERROR(EIO));
blk = abuf->b_data;
dnobj = zb->zb_blkid * epb;
- for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
- err = dump_dnode(dsa, dnobj + i, blk + i);
- if (err != 0)
- break;
+
+ /*
+ * Raw sends require sending encryption parameters for the
+ * block of dnodes. Regular sends do not need to send this
+ * info.
+ */
+ if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(arc_is_encrypted(abuf));
+ err = dump_object_range(dsa, bp, dnobj, epb);
+ }
+
+ if (err == 0) {
+ for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
+ err = dump_dnode(dsa, bp, dnobj + i, blk + i);
+ if (err != 0)
+ break;
+ }
}
arc_buf_destroy(abuf, &abuf);
} else if (type == DMU_OT_SA) {
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
- int blksz = BP_GET_LSIZE(bp);
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+
+ if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(BP_IS_PROTECTED(bp));
+ zioflags |= ZIO_FLAG_RAW;
+ }
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0)
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
return (SET_ERROR(EIO));
- err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
+ err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
arc_buf_destroy(abuf, &abuf);
} else if (backup_do_embed(dsa, bp)) {
/* it's an embedded level-0 block of a regular object */
@@ -707,6 +835,14 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
*/
boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
!(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+
+ /*
+ * Raw sends require that we always get raw data as it exists
+ * on disk, so we assert that we are not splitting blocks here.
+ */
+ boolean_t request_raw =
+ (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
+
/*
* We should only request compressed data from the ARC if all
* the following are true:
@@ -722,6 +858,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
!split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
!BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+ IMPLY(request_raw, !split_large_blocks);
+ IMPLY(request_raw, BP_IS_PROTECTED(bp));
ASSERT0(zb->zb_level);
ASSERT(zb->zb_object > dsa->dsa_resume_object ||
(zb->zb_object == dsa->dsa_resume_object &&
@@ -730,8 +868,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
- if (request_compressed)
+ if (request_raw)
zioflags |= ZIO_FLAG_RAW;
+ else if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW_COMPRESS;
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
@@ -752,6 +892,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
offset = zb->zb_blkid * blksz;
if (split_large_blocks) {
+ ASSERT0(arc_is_encrypted(abuf));
ASSERT3U(arc_get_compression(abuf), ==,
ZIO_COMPRESS_OFF);
char *buf = abuf->b_data;
@@ -765,8 +906,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
}
} else {
err = dump_write(dsa, type, zb->zb_object, offset,
- blksz, arc_buf_size(abuf), bp,
- abuf->b_data);
+ blksz, arc_buf_size(abuf), bp, abuf->b_data);
}
arc_buf_destroy(abuf, &abuf);
}
@@ -795,7 +935,7 @@ static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
- int outfd, uint64_t resumeobj, uint64_t resumeoff,
+ boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
objset_t *os;
@@ -815,6 +955,24 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
return (err);
}
+ /*
+ * If this is a non-raw send of an encrypted ds, we can ensure that
+ * the objset_phys_t is authenticated. This is safe because this is
+ * either a snapshot or we have owned the dataset, ensuring that
+ * it can't be modified.
+ */
+ if (!rawok && os->os_encrypted &&
+ arc_is_unauthenticated(os->os_phys_buf)) {
+ err = arc_untransform(os->os_phys_buf, os->os_spa,
+ to_ds->ds_object, B_FALSE);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
+ }
+
drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
drr->drr_type = DRR_BEGIN;
drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
@@ -837,20 +995,29 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
}
#endif
- if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
+ /* raw sends imply large_block_ok */
+ if ((large_block_ok || rawok) &&
+ to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
- if (embedok &&
+
+ /* encrypted datasets will not have embedded blocks */
+ if ((embedok || rawok) && !os->os_encrypted &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
}
- if (compressok) {
+
+ /* raw send implies compressok */
+ if (compressok || rawok)
featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
- }
+ if (rawok && os->os_encrypted)
+ featureflags |= DMU_BACKUP_FEATURE_RAW;
+
if ((featureflags &
- (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) !=
- 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
+ (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
+ DMU_BACKUP_FEATURE_RAW)) != 0 &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
featureflags |= DMU_BACKUP_FEATURE_LZ4;
}
@@ -904,20 +1071,43 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsl_dataset_long_hold(to_ds, FTAG);
dsl_pool_rele(dp, tag);
- if (resumeobj != 0 || resumeoff != 0) {
- dmu_object_info_t to_doi;
- nvlist_t *nvl;
- err = dmu_object_info(os, resumeobj, &to_doi);
- if (err != 0)
- goto out;
- SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
- resumeoff / to_doi.doi_data_block_size);
+ /* handle features that require a DRR_BEGIN payload */
+ if (featureflags &
+ (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
+ nvlist_t *keynvl = NULL;
+ nvlist_t *nvl = fnvlist_alloc();
+
+ if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ dmu_object_info_t to_doi;
+ err = dmu_object_info(os, resumeobj, &to_doi);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ goto out;
+ }
+
+ SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
+ resumeobj, 0,
+ resumeoff / to_doi.doi_data_block_size);
+
+ fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+ fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(os->os_encrypted);
+
+ err = dsl_crypto_populate_key_nvlist(to_ds, &keynvl);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ goto out;
+ }
+
+ fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
+ }
- nvl = fnvlist_alloc();
- fnvlist_add_uint64(nvl, "resume_object", resumeobj);
- fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
payload = fnvlist_pack(nvl, &payload_len);
drr->drr_payloadlen = payload_len;
+ fnvlist_free(keynvl);
fnvlist_free(nvl);
}
@@ -935,6 +1125,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
to_arg.ds = to_ds;
to_arg.fromtxg = fromtxg;
to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+ if (rawok)
+ to_arg.flags |= TRAVERSE_NO_DECRYPT;
(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
TS_RUN, minclsyspri);
@@ -980,7 +1172,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
if (dump_record(dsp, NULL, 0) != 0)
err = dsp->dsa_err;
-
out:
mutex_enter(&to_ds->ds_sendstream_lock);
list_remove(&to_ds->ds_sendstreams, dsp);
@@ -999,18 +1190,19 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
dsl_dataset_t *fromds = NULL;
+ ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
int err;
err = dsl_pool_hold(pool, FTAG, &dp);
if (err != 0)
return (err);
- err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
+ err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
@@ -1022,7 +1214,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
if (err != 0) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
dsl_pool_rele(dp, FTAG);
return (err);
}
@@ -1035,24 +1227,27 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, compressok, outfd, 0, 0, vp, off);
+ embedok, large_block_ok, compressok, rawok, outfd,
+ 0, 0, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, compressok, outfd, 0, 0, vp, off);
+ embedok, large_block_ok, compressok, rawok, outfd,
+ 0, 0, vp, off);
}
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, boolean_t compressok, int outfd,
- uint64_t resumeobj, uint64_t resumeoff,
- vnode_t *vp, offset_t *off)
+ boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
+ int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
+ offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
+ ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
boolean_t owned = B_FALSE;
if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
@@ -1067,10 +1262,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
* We are sending a filesystem or volume. Ensure
* that it doesn't change by owning the dataset.
*/
- err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
+ err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
owned = B_TRUE;
} else {
- err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+ err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
}
if (err != 0) {
dsl_pool_rele(dp, FTAG);
@@ -1110,22 +1305,27 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
}
if (err != 0) {
- dsl_dataset_rele(ds, FTAG);
+ if (owned)
+ dsl_dataset_disown(ds, dsflags, FTAG);
+ else
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, compressok,
+ embedok, large_block_ok, compressok, rawok,
outfd, resumeobj, resumeoff, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, compressok,
+ embedok, large_block_ok, compressok, rawok,
outfd, resumeobj, resumeoff, vp, off);
}
if (owned)
- dsl_dataset_disown(ds, FTAG);
+ dsl_dataset_disown(ds, dsflags, FTAG);
else
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+
return (err);
}
@@ -1276,7 +1476,8 @@ dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
* traverse the blocks of the snapshot with birth times after
* from_txg, summing their uncompressed size
*/
- err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+ err = traverse_dataset(ds, from_txg,
+ TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
dmu_calculate_send_traversal, &size);
if (err)
@@ -1371,9 +1572,17 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
/* if full, then must be forced */
if (!drba->drba_cookie->drc_force)
return (SET_ERROR(EEXIST));
- /* start from $ORIGIN@$ORIGIN, if supported */
- drba->drba_snapobj = dp->dp_origin_snap != NULL ?
- dp->dp_origin_snap->ds_object : 0;
+
+ /*
+ * We don't support using zfs recv -F to blow away
+ * encrypted filesystems. This would require the
+ * dsl dir to point to the old encryption key and
+ * the new one at the same time during the receive.
+ */
+ if (ds->ds_dir->dd_crypto_obj != 0)
+ return (SET_ERROR(EINVAL));
+
+ drba->drba_snapobj = 0;
}
return (0);
@@ -1388,6 +1597,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
+ ds_hold_flags_t dsflags = 0;
int error;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
@@ -1438,18 +1648,26 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
return (SET_ERROR(ENOTSUP));
- error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW)) {
+ /* raw receives require the encryption feature */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
+ return (SET_ERROR(ENOTSUP));
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
error = recv_begin_check_existing_impl(drba, ds, fromguid);
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
} else if (error == ENOENT) {
/* target fs does not exist; must be a full backup or clone */
char buf[ZFS_MAX_DATASET_NAME_LEN];
@@ -1474,7 +1692,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* Open the parent of tofs */
ASSERT3U(strlen(tofs), <, sizeof (buf));
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
- error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+ error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds);
if (error != 0)
return (error);
@@ -1486,39 +1704,43 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (error);
}
error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (error);
}
if (drba->drba_origin != NULL) {
dsl_dataset_t *origin;
- error = dsl_dataset_hold(dp, drba->drba_origin,
- FTAG, &origin);
+
+ error = dsl_dataset_hold_flags(dp, drba->drba_origin,
+ dsflags, FTAG, &origin);
if (error != 0) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (error);
}
if (!origin->ds_is_snapshot) {
- dsl_dataset_rele(origin, FTAG);
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(origin,
+ DS_HOLD_FLAG_DECRYPT, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
fromguid != 0) {
- dsl_dataset_rele(origin, FTAG);
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(origin,
+ DS_HOLD_FLAG_DECRYPT, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(ENODEV));
}
- dsl_dataset_rele(origin, FTAG);
+ dsl_dataset_rele_flags(origin,
+ dsflags, FTAG);
}
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
error = 0;
}
return (error);
@@ -1532,27 +1754,42 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
objset_t *mos = dp->dp_meta_objset;
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
const char *tofs = drba->drba_cookie->drc_tofs;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds, *newds;
+ objset_t *os;
uint64_t dsobj;
+ ds_hold_flags_t dsflags = 0;
int error;
uint64_t crflags = 0;
+ dsl_crypto_params_t *dcpp = NULL;
+ dsl_crypto_params_t dcp = { 0 };
if (drrb->drr_flags & DRR_FLAG_CI_DATA)
crflags |= DS_FLAG_CI_DATASET;
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ } else {
+ dcp.cp_cmd = DCP_CMD_RAW_RECV;
+ }
- error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
if (error == 0) {
/* create temporary clone */
dsl_dataset_t *snap = NULL;
+
if (drba->drba_snapobj != 0) {
VERIFY0(dsl_dataset_hold_obj(dp,
drba->drba_snapobj, FTAG, &snap));
+ } else {
+ /* we use the dcp whenever we are not making a clone */
+ dcpp = &dcp;
}
+
dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
- snap, crflags, drba->drba_cred, tx);
+ snap, crflags, drba->drba_cred, dcpp, tx);
if (drba->drba_snapobj != 0)
dsl_dataset_rele(snap, FTAG);
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
} else {
dsl_dir_t *dd;
const char *tail;
@@ -1563,18 +1800,21 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
if (drba->drba_origin != NULL) {
VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
FTAG, &origin));
+ } else {
+ /* we use the dcp whenever we are not making a clone */
+ dcpp = &dcp;
}
/* Create new dataset. */
- dsobj = dsl_dataset_create_sync(dd,
- strrchr(tofs, '/') + 1,
- origin, crflags, drba->drba_cred, tx);
+ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
+ origin, crflags, drba->drba_cred, dcpp, tx);
if (origin != NULL)
dsl_dataset_rele(origin, FTAG);
dsl_dir_rele(dd, FTAG);
drba->drba_cookie->drc_newfs = B_TRUE;
}
- VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &newds));
+ VERIFY0(dmu_objset_from_ds(newds, &os));
if (drba->drba_cookie->drc_resumable) {
uint64_t one = 1;
@@ -1595,32 +1835,46 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
8, 1, &zero, tx));
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
8, 1, &zero, tx));
- if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
+ if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
8, 1, &one, tx));
}
- if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_EMBED_DATA) {
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
8, 1, &one, tx));
}
- if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_COMPRESSED) {
+ if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
8, 1, &one, tx));
}
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
+ 8, 1, &one, tx));
+ }
+ }
+
+ /*
+ * Usually the os->os_encrypted value is tied to the presence of a
+ * DSL Crypto Key object in the dd. However, that will not be received
+ * until dmu_recv_stream(), so we set the value manually for now.
+ */
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ os->os_encrypted = B_TRUE;
+ drba->drba_cookie->drc_raw = B_TRUE;
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
/*
- * If we actually created a non-clone, we need to create the
- * objset in our new dataset.
+ * If we actually created a non-clone, we need to create the objset
+ * in our new dataset. If this is a raw send we postpone this until
+ * dmu_recv_stream() so that we can allocate the metadnode with the
+ * properties from the DRR_BEGIN payload.
*/
rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
- if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
+ (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
(void) dmu_objset_create_impl(dp->dp_spa,
newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
}
@@ -1638,6 +1892,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsl_pool_t *dp = dmu_tx_pool(tx);
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
int error;
+ ds_hold_flags_t dsflags = 0;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
@@ -1689,28 +1944,31 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
- if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+
+ if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
- error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
if (error != 0)
return (error);
}
/* check that ds is marked inconsistent */
if (!DS_IS_INCONSISTENT(ds)) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
/* check that there is resuming data, and that the toguid matches */
if (!dsl_dataset_is_zapified(ds)) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
if (error != 0 || drrb->drr_toguid != val) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -1720,13 +1978,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
* fails) because it will be marked inconsistent.
*/
if (dsl_dataset_has_owner(ds)) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EBUSY));
}
/* There should not be any snapshots of this fs yet. */
if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -1740,11 +1998,11 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
if (drrb->drr_fromguid != val) {
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (SET_ERROR(EINVAL));
}
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
return (0);
}
@@ -1754,7 +2012,11 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
dmu_recv_begin_arg_t *drba = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
const char *tofs = drba->drba_cookie->drc_tofs;
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
+ objset_t *os;
+ ds_hold_flags_t dsflags = 0;
uint64_t dsobj;
/* 6 extra bytes for /%recv */
char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
@@ -1762,9 +2024,15 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
- if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ drba->drba_cookie->drc_raw = B_TRUE;
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
- VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+ VERIFY0(dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds));
drba->drba_cookie->drc_newfs = B_TRUE;
}
@@ -1773,9 +2041,10 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
dsobj = ds->ds_object;
- dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
- VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &ds));
+ VERIFY0(dmu_objset_from_ds(ds, &os));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1843,7 +2112,7 @@ struct receive_record_arg {
* If the record is a write, pointer to the arc_buf_t containing the
* payload.
*/
- arc_buf_t *write_buf;
+ arc_buf_t *arc_buf;
int payload_size;
uint64_t bytes_read; /* bytes read from stream when record created */
boolean_t eos_marker; /* Marks the end of the stream */
@@ -1901,12 +2170,14 @@ struct receive_arg {
zio_cksum_t prev_cksum;
int err;
boolean_t byteswap;
+ uint64_t featureflags;
/* Sorted list of objects not to issue prefetches for. */
struct objlist ignore_objlist;
};
typedef struct guid_map_entry {
uint64_t guid;
+ boolean_t raw;
dsl_dataset_t *gme_ds;
avl_node_t avlnode;
} guid_map_entry_t;
@@ -1929,7 +2200,8 @@ free_guid_map_onexit(void *arg)
while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
dsl_dataset_long_rele(gmep->gme_ds, gmep);
- dsl_dataset_rele(gmep->gme_ds, gmep);
+ dsl_dataset_rele_flags(gmep->gme_ds,
+ (gmep->raw) ? 0 : DS_HOLD_FLAG_DECRYPT, gmep);
kmem_free(gmep, sizeof (guid_map_entry_t));
}
avl_destroy(ca);
@@ -1945,7 +2217,8 @@ receive_read(struct receive_arg *ra, int len, void *buf)
* The code doesn't rely on this (lengths being multiples of 8). See
* comment in dump_bytes.
*/
- ASSERT0(len % 8);
+ ASSERT(len % 8 == 0 ||
+ (ra->featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
while (done < len) {
ssize_t resid;
@@ -1998,6 +2271,7 @@ byteswap_record(dmu_replay_record_t *drr)
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
DO32(drr_object.drr_bonuslen);
+ DO32(drr_object.drr_raw_bonuslen);
DO64(drr_object.drr_toguid);
break;
case DRR_FREEOBJECTS:
@@ -2045,6 +2319,13 @@ byteswap_record(dmu_replay_record_t *drr)
DO64(drr_spill.drr_object);
DO64(drr_spill.drr_length);
DO64(drr_spill.drr_toguid);
+ DO64(drr_spill.drr_compressed_size);
+ DO32(drr_spill.drr_type);
+ break;
+ case DRR_OBJECT_RANGE:
+ DO64(drr_object_range.drr_firstobj);
+ DO64(drr_object_range.drr_numslots);
+ DO64(drr_object_range.drr_toguid);
break;
case DRR_END:
DO64(drr_end.drr_toguid);
@@ -2135,6 +2416,21 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
return (SET_ERROR(EINVAL));
}
+ if (DRR_IS_RAW_ENCRYPTED(drro->drr_flags)) {
+ if (drro->drr_raw_bonuslen < drro->drr_bonuslen ||
+ drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
+ drro->drr_nlevels > DN_MAX_LEVELS ||
+ drro->drr_nblkptr > DN_MAX_NBLKPTR ||
+ DN_SLOTS_TO_BONUSLEN(drro->drr_dn_slots) <
+ drro->drr_raw_bonuslen)
+ return (SET_ERROR(EINVAL));
+ } else {
+ if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 ||
+ drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 ||
+ drro->drr_nblkptr != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
if (err != 0 && err != ENOENT)
@@ -2145,15 +2441,25 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
* If we are losing blkptrs or changing the block size this must
* be a new file instance. We must clear out the previous file
* contents before we can change this type of metadata in the dnode.
+ * Raw receives will also check that the indirect structure of the
+ * dnode hasn't changed.
*/
if (err == 0) {
- int nblkptr;
-
- nblkptr = deduce_nblkptr(drro->drr_bonustype,
+ uint32_t indblksz = drro->drr_indblkshift ?
+ 1ULL << drro->drr_indblkshift : 0;
+ int nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
+ /* nblkptr will be bounded by the bonus size and type */
+ if (DRR_IS_RAW_ENCRYPTED(drro->drr_flags) &&
+ nblkptr != drro->drr_nblkptr)
+ return (SET_ERROR(EINVAL));
+
if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr) {
+ nblkptr < doi.doi_nblkptr ||
+ (DRR_IS_RAW_ENCRYPTED(drro->drr_flags) &&
+ (indblksz != doi.doi_metadata_block_size ||
+ drro->drr_nlevels < doi.doi_indirection))) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
@@ -2163,6 +2469,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_write(tx, object, 0, 0);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
@@ -2185,7 +2492,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_bonustype, drro->drr_bonuslen, tx);
}
if (err != 0) {
- dmu_tx_commit(tx);
+ dmu_tx_abort(tx);
return (SET_ERROR(EINVAL));
}
@@ -2194,19 +2501,42 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_object_set_compress(rwa->os, drro->drr_object,
drro->drr_compress, tx);
+ /* handle more restrictive dnode structuring for raw recvs */
+ if (DRR_IS_RAW_ENCRYPTED(drro->drr_flags)) {
+ /*
+ * Set the indirect block shift and nlevels. This will not fail
+ * because we ensured all of the blocks were free earlier if
+ * this is a new object.
+ */
+ VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
+ drro->drr_blksz, drro->drr_indblkshift, tx));
+ VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
+ drro->drr_nlevels, tx));
+ }
+
if (data != NULL) {
dmu_buf_t *db;
+ uint32_t flags = DMU_READ_NO_PREFETCH;
- VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
+ if (DRR_IS_RAW_ENCRYPTED(drro->drr_flags))
+ flags |= DMU_READ_NO_DECRYPT;
+
+ VERIFY0(dmu_bonus_hold_impl(rwa->os, drro->drr_object,
+ FTAG, flags, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
- bcopy(data, db->db_data, drro->drr_bonuslen);
- if (rwa->byteswap) {
+ bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro));
+
+ /*
+ * Raw bonus buffers have their byteorder determined by the
+ * DRR_OBJECT_RANGE record.
+ */
+ if (rwa->byteswap && !DRR_IS_RAW_ENCRYPTED(drro->drr_flags)) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drro->drr_bonustype);
dmu_ot_byteswap[byteswap].ob_func(db->db_data,
- drro->drr_bonuslen);
+ DRR_OBJECT_PAYLOAD_SIZE(drro));
}
dmu_buf_rele(db, FTAG);
}
@@ -2285,7 +2615,8 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
dmu_tx_abort(tx);
return (err);
}
- if (rwa->byteswap) {
+ if (rwa->byteswap && !arc_is_encrypted(abuf) &&
+ arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drrw->drr_type);
dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
@@ -2327,6 +2658,7 @@ receive_write_byref(struct receive_writer_arg *rwa,
guid_map_entry_t *gmep;
avl_index_t where;
objset_t *ref_os = NULL;
+ int flags = DMU_READ_PREFETCH;
dmu_buf_t *dbp;
if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
@@ -2348,8 +2680,13 @@ receive_write_byref(struct receive_writer_arg *rwa,
ref_os = rwa->os;
}
+ if (DRR_IS_RAW_ENCRYPTED(drrwbr->drr_flags)) {
+ flags |= DMU_READ_NO_DECRYPT;
+ }
+
+ /* may return either a regular db or an encrypted one */
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
- drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+ drrwbr->drr_refoffset, FTAG, &dbp, flags);
if (err != 0)
return (err);
@@ -2362,8 +2699,14 @@ receive_write_byref(struct receive_writer_arg *rwa,
dmu_tx_abort(tx);
return (err);
}
- dmu_write(rwa->os, drrwbr->drr_object,
- drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+
+ if (DRR_IS_RAW_ENCRYPTED(drrwbr->drr_flags)) {
+ dmu_copy_from_buf(rwa->os, drrwbr->drr_object,
+ drrwbr->drr_offset, dbp, tx);
+ } else {
+ dmu_write(rwa->os, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+ }
dmu_buf_rele(dbp, FTAG);
/* See comment in restore_write. */
@@ -2413,7 +2756,7 @@ receive_write_embedded(struct receive_writer_arg *rwa,
static int
receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
- void *data)
+ arc_buf_t *abuf)
{
dmu_tx_t *tx;
dmu_buf_t *db, *db_spill;
@@ -2423,6 +2766,13 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
return (SET_ERROR(EINVAL));
+ if (DRR_IS_RAW_ENCRYPTED(drrs->drr_flags)) {
+ if (!DMU_OT_IS_VALID(drrs->drr_type) ||
+ drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
+ drrs->drr_compressed_size == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
@@ -2448,7 +2798,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
if (db_spill->db_size < drrs->drr_length)
VERIFY(0 == dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
- bcopy(data, db_spill->db_data, drrs->drr_length);
+ dmu_assign_arcbuf_impl(db_spill, abuf, tx);
dmu_buf_rele(db, FTAG);
dmu_buf_rele(db_spill, FTAG);
@@ -2476,18 +2826,98 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
return (err);
}
+static int
+receive_object_range(struct receive_writer_arg *rwa,
+ struct drr_object_range *drror)
+{
+ int ret;
+ dmu_tx_t *tx;
+ dnode_t *mdn = NULL;
+ dmu_buf_t *db = NULL;
+ uint64_t offset;
+
+ /*
+ * By default, we assume this block is in our native format
+ * (ZFS_HOST_BYTEORDER). We then take into account whether
+ * the send stream is byteswapped (rwa->byteswap). Finally,
+ * we need to byteswap again if this particular block was
+ * in non-native format on the send side.
+ */
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
+ !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
+
+ /*
+ * Since dnode block sizes are constant, we should not need to worry
+ * about making sure that the dnode block size is the same on the
+ * sending and receiving sides for the time being. For non-raw sends,
+ * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE
+ * record at all). Raw sends require this record type because the
+ * encryption parameters are used to protect an entire block of bonus
+ * buffers. If the size of dnode blocks ever becomes variable,
+ * handling will need to be added to ensure that dnode block sizes
+ * match on the sending and receiving side.
+ */
+ if (drror->drr_numslots != DNODES_PER_BLOCK ||
+ P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 ||
+ !DRR_IS_RAW_ENCRYPTED(drror->drr_flags))
+ return (SET_ERROR(EINVAL));
+
+ offset = drror->drr_firstobj * sizeof (dnode_phys_t);
+ mdn = DMU_META_DNODE(rwa->os);
+
+ tx = dmu_tx_create(rwa->os);
+ ret = dmu_tx_assign(tx, TXG_WAIT);
+ if (ret != 0) {
+ dmu_tx_abort(tx);
+ return (ret);
+ }
+
+ ret = dmu_buf_hold_by_dnode(mdn, offset, FTAG, &db,
+ DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
+ if (ret != 0) {
+ dmu_tx_commit(tx);
+ return (ret);
+ }
+
+ /*
+ * Convert the buffer associated with this range of dnodes to a
+ * raw buffer. This ensures that it will be written out as a raw
+ * buffer when we fill in the dnode objects in future records.
+ * Since we are committing this tx now, it is technically possible
+ * for the dnode block to end up on-disk with the incorrect MAC.
+ * Despite this, the dataset is marked as inconsistent so no other
+ * code paths (apart from scrubs) will attempt to read this data.
+ * Scrubs will not be affected by this either since scrubs only
+ * read raw data and do not attempt to check the MAC.
+ */
+ dmu_convert_to_raw(db, byteorder, drror->drr_salt, drror->drr_iv,
+ drror->drr_mac, tx);
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
+ ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
+
+ /*
+ * Wait for the txg sync before cleaning up the receive. For
+ * resumable receives, this ensures that our resume state has
+ * been written out to disk. For raw receives, this ensures
+ * that the user accounting code will not attempt to do anything
+ * after we stopped receiving the dataset.
+ */
+ txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+
if (drc->drc_resumable) {
- /* wait for our resume state to be written to disk */
- txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ dsl_dataset_disown(drc->drc_ds, dsflags, dmu_recv_tag);
} else {
char name[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(drc->drc_ds, name);
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ dsl_dataset_disown(drc->drc_ds, dsflags, dmu_recv_tag);
(void) dsl_destroy_head(name);
}
}
@@ -2537,6 +2967,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
err = receive_read(ra, sizeof (ra->next_rrd->header),
&ra->next_rrd->header);
ra->next_rrd->bytes_read = ra->bytes_read;
+
if (err != 0) {
kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
ra->next_rrd = NULL;
@@ -2680,9 +3111,10 @@ receive_read_record(struct receive_arg *ra)
case DRR_OBJECT:
{
struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
- uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+ uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
void *buf = kmem_zalloc(size, KM_SLEEP);
dmu_object_info_t doi;
+
err = receive_read_payload_and_next_header(ra, size, buf);
if (err != 0) {
kmem_free(buf, size);
@@ -2710,7 +3142,18 @@ receive_read_record(struct receive_arg *ra)
struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
arc_buf_t *abuf;
boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
- if (DRR_WRITE_COMPRESSED(drrw)) {
+
+ if (DRR_IS_RAW_ENCRYPTED(drrw->drr_flags)) {
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
+ ra->byteswap;
+
+ abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os),
+ drrw->drr_object, byteorder, drrw->drr_salt,
+ drrw->drr_iv, drrw->drr_mac, drrw->drr_type,
+ drrw->drr_compressed_size, drrw->drr_logical_size,
+ drrw->drr_compressiontype);
+ } else if (DRR_WRITE_COMPRESSED(drrw)) {
ASSERT3U(drrw->drr_compressed_size, >, 0);
ASSERT3U(drrw->drr_logical_size, >=,
drrw->drr_compressed_size);
@@ -2730,7 +3173,7 @@ receive_read_record(struct receive_arg *ra)
dmu_return_arcbuf(abuf);
return (err);
}
- ra->rrd->write_buf = abuf;
+ ra->rrd->arc_buf = abuf;
receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
drrw->drr_logical_size);
return (err);
@@ -2780,11 +3223,38 @@ receive_read_record(struct receive_arg *ra)
case DRR_SPILL:
{
struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
- void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
- err = receive_read_payload_and_next_header(ra, drrs->drr_length,
- buf);
- if (err != 0)
- kmem_free(buf, drrs->drr_length);
+ arc_buf_t *abuf;
+ int len = DRR_SPILL_PAYLOAD_SIZE(drrs);
+
+ /* DRR_SPILL records are either raw or uncompressed */
+ if (DRR_IS_RAW_ENCRYPTED(drrs->drr_flags)) {
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
+ ra->byteswap;
+
+ abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os),
+ drrs->drr_object, byteorder, drrs->drr_salt,
+ drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
+ drrs->drr_compressed_size, drrs->drr_length,
+ drrs->drr_compressiontype);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+ DMU_OT_IS_METADATA(drrs->drr_type),
+ drrs->drr_length);
+ }
+
+ err = receive_read_payload_and_next_header(ra, len,
+ abuf->b_data);
+ if (err != 0) {
+ dmu_return_arcbuf(abuf);
+ return (err);
+ }
+ ra->rrd->arc_buf = abuf;
+ return (err);
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
return (err);
}
default:
@@ -2825,7 +3295,7 @@ dprintf_drr(struct receive_record_arg *rrd, int err)
"compress = %u psize = %llu err = %d\n",
drrw->drr_object, drrw->drr_type, drrw->drr_offset,
drrw->drr_logical_size, drrw->drr_checksumtype,
- drrw->drr_checksumflags, drrw->drr_compressiontype,
+ drrw->drr_flags, drrw->drr_compressiontype,
drrw->drr_compressed_size, err);
break;
}
@@ -2841,7 +3311,7 @@ dprintf_drr(struct receive_record_arg *rrd, int err)
drrwbr->drr_length, drrwbr->drr_toguid,
drrwbr->drr_refguid, drrwbr->drr_refobject,
drrwbr->drr_refoffset, drrwbr->drr_checksumtype,
- drrwbr->drr_checksumflags, err);
+ drrwbr->drr_flags, err);
break;
}
case DRR_WRITE_EMBEDDED:
@@ -2909,11 +3379,11 @@ receive_process_record(struct receive_writer_arg *rwa,
case DRR_WRITE:
{
struct drr_write *drrw = &rrd->header.drr_u.drr_write;
- err = receive_write(rwa, drrw, rrd->write_buf);
+ err = receive_write(rwa, drrw, rrd->arc_buf);
/* if receive_write() is successful, it consumes the arc_buf */
if (err != 0)
- dmu_return_arcbuf(rrd->write_buf);
- rrd->write_buf = NULL;
+ dmu_return_arcbuf(rrd->arc_buf);
+ rrd->arc_buf = NULL;
rrd->payload = NULL;
break;
}
@@ -2942,11 +3412,20 @@ receive_process_record(struct receive_writer_arg *rwa,
case DRR_SPILL:
{
struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
- err = receive_spill(rwa, drrs, rrd->payload);
- kmem_free(rrd->payload, rrd->payload_size);
+ err = receive_spill(rwa, drrs, rrd->arc_buf);
+ /* if receive_spill() is successful, it consumes the arc_buf */
+ if (err != 0)
+ dmu_return_arcbuf(rrd->arc_buf);
+ rrd->arc_buf = NULL;
rrd->payload = NULL;
break;
}
+ case DRR_OBJECT_RANGE:
+ {
+ struct drr_object_range *drror =
+ &rrd->header.drr_u.drr_object_range;
+ return (receive_object_range(rwa, drror));
+ }
default:
return (SET_ERROR(EINVAL));
}
@@ -2977,9 +3456,9 @@ receive_writer_thread(void *arg)
*/
if (rwa->err == 0) {
rwa->err = receive_process_record(rwa, rrd);
- } else if (rrd->write_buf != NULL) {
- dmu_return_arcbuf(rrd->write_buf);
- rrd->write_buf = NULL;
+ } else if (rrd->arc_buf != NULL) {
+ dmu_return_arcbuf(rrd->arc_buf);
+ rrd->arc_buf = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
@@ -3075,6 +3554,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+ ra->featureflags = featureflags;
/* if this stream is dedup'ed, set up the avl tree for guid mapping */
if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
@@ -3129,6 +3609,24 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
goto out;
}
+ /* handle DSL encryption key payload */
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ nvlist_t *keynvl = NULL;
+
+ ASSERT(ra->os->os_encrypted);
+ ASSERT(drc->drc_raw);
+
+ err = nvlist_lookup_nvlist(begin_nvl, "crypt_keydata", &keynvl);
+ if (err != 0)
+ goto out;
+
+ err = dsl_crypto_recv_key(spa_name(ra->os->os_spa),
+ drc->drc_ds->ds_object, drc->drc_drrb->drr_type,
+ keynvl);
+ if (err != 0)
+ goto out;
+ }
+
if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
err = resume_check(ra, begin_nvl);
if (err != 0)
@@ -3293,6 +3791,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
dmu_recv_cookie_t *drc = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
+ boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
tx, "snap=%s", drc->drc_tosnap);
@@ -3386,21 +3885,31 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
}
zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
+
/*
* Release the hold from dmu_recv_begin. This must be done before
- * we return to open context, so that when we free the dataset's dnode,
- * we can evict its bonus buffer.
+ * we return to open context, so that when we free the dataset's dnode
+ * we can evict its bonus buffer. Since the dataset may be destroyed
+ * at this point (and therefore won't have a valid pointer to the spa)
+ * we release the key mapping manually here while we do have a valid
+ * pointer, if it exists.
*/
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ if (!drc->drc_raw && encrypted) {
+ (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
+ drc->drc_ds->ds_object, drc->drc_ds);
+ }
+ dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
drc->drc_ds = NULL;
}
static int
-add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
+ boolean_t raw)
{
dsl_pool_t *dp;
dsl_dataset_t *snapds;
guid_map_entry_t *gmep;
+ ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
int err;
ASSERT(guid_map != NULL);
@@ -3409,9 +3918,10 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
if (err != 0)
return (err);
gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
- err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
+ err = dsl_dataset_hold_obj_flags(dp, snapobj, dsflags, gmep, &snapds);
if (err == 0) {
gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
+ gmep->raw = raw;
gmep->gme_ds = snapds;
avl_add(guid_map, gmep);
dsl_dataset_long_hold(snapds, gmep);
@@ -3466,9 +3976,8 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
if (error != 0) {
dmu_recv_cleanup_ds(drc);
} else if (drc->drc_guid_to_ds_map != NULL) {
- (void) add_ds_to_guidmap(drc->drc_tofs,
- drc->drc_guid_to_ds_map,
- drc->drc_newsnapobj);
+ (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map,
+ drc->drc_newsnapobj, drc->drc_raw);
}
return (error);
}
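For illustration only: the receive paths above repeatedly choose between a plain
dataset hold for raw (already-encrypted) streams and a decrypting hold for
everything else. Below is a minimal sketch of that convention, assuming the
dsl_dataset_hold_flags()/dsl_dataset_rele_flags() interfaces introduced by this
change; the helper name and error handling are hypothetical, not part of the commit.

static int
example_hold_for_recv(dsl_pool_t *dp, const char *name, boolean_t raw,
    void *tag)
{
        /* raw streams stay encrypted, so no decrypting hold is needed */
        ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
        dsl_dataset_t *ds;
        int err;

        err = dsl_dataset_hold_flags(dp, name, dsflags, tag, &ds);
        if (err != 0)
                return (err);

        /* ... operate on the dataset ... */

        dsl_dataset_rele_flags(ds, dsflags, tag);
        return (0);
}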
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index c78228d74..a6c27b4be 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -132,7 +132,7 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh)
zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
- claim_txg);
+ claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
zil_free(zilog);
}
@@ -181,6 +181,7 @@ traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL;
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return;
@@ -196,8 +197,11 @@ traverse_prefetch_metadata(traverse_data_t *td,
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return;
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
}
static boolean_t
@@ -294,6 +298,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
zbookmark_phys_t *czb;
+ ASSERT(!BP_IS_PROTECTED(bp));
+
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
@@ -324,14 +330,23 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_FLAG_WAIT;
+ uint32_t zio_flags = ZIO_FLAG_CANFAIL;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *child_dnp;
+ /*
+ * dnode blocks might have their bonus buffers encrypted, so
+ * we must be careful to honor TRAVERSE_NO_DECRYPT
+ */
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err != 0)
goto post;
+
child_dnp = buf->b_data;
for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
@@ -347,11 +362,15 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err != 0)
goto post;
@@ -500,6 +519,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0);
@@ -518,8 +538,11 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);
+ if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
+ zio_flags, &aflags, zb);
return (0);
}
@@ -599,13 +622,17 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;
- err = arc_read(NULL, td->td_spa, rootbp,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
+ BP_IS_PROTECTED(rootbp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
+ &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
if (err != 0)
return (err);
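For illustration only: each dmu_traverse.c hunk above makes the same decision,
shown here in isolation. This sketch is not from the commit; the helper name is
hypothetical, while TRAVERSE_NO_DECRYPT, BP_IS_PROTECTED(), and ZIO_FLAG_RAW are
the symbols used in the hunks themselves.

static inline int
example_traverse_zio_flags(int td_flags, const blkptr_t *bp)
{
        int zio_flags = ZIO_FLAG_CANFAIL;

        /* protected blocks are read as raw ciphertext when not decrypting */
        if ((td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
                zio_flags |= ZIO_FLAG_RAW;

        return (zio_flags);
}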
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 41180bedf..9942d6427 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1246,7 +1246,12 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
rw_exit(&mdn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+
+ /*
+ * We do not need to decrypt to read the dnode so it doesn't matter
+ * if we get the encrypted or decrypted version.
+ */
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
if (err) {
dbuf_rele(db, FTAG);
return (err);
@@ -1550,11 +1555,73 @@ fail:
return (SET_ERROR(ENOTSUP));
}
+static void
+dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+}
+
+int
+dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
+{
+ int ret = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ if (dn->dn_nlevels == nlevels) {
+ ret = 0;
+ goto out;
+ } else if (nlevels < dn->dn_nlevels) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ dnode_set_nlevels_impl(dn, nlevels, tx);
+
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ret);
+}
+
/* read-holding callers must not rely on the lock being continuously held */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
{
- uint64_t txgoff = tx->tx_txg & TXG_MASK;
int epbs, new_nlevels;
uint64_t sz;
@@ -1594,41 +1661,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
- if (new_nlevels > dn->dn_nlevels) {
- int old_nlevels = dn->dn_nlevels;
- dmu_buf_impl_t *db;
- list_t *list;
- dbuf_dirty_record_t *new, *dr, *dr_next;
-
- dn->dn_nlevels = new_nlevels;
-
- ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
- dn->dn_next_nlevels[txgoff] = new_nlevels;
-
- /* dirty the left indirects */
- db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- ASSERT(db != NULL);
- new = dbuf_dirty(db, tx);
- dbuf_rele(db, FTAG);
-
- /* transfer the dirty records to the new indirect */
- mutex_enter(&dn->dn_mtx);
- mutex_enter(&new->dt.di.dr_mtx);
- list = &dn->dn_dirty_records[txgoff];
- for (dr = list_head(list); dr; dr = dr_next) {
- dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
- list_remove(&dn->dn_dirty_records[txgoff], dr);
- list_insert_tail(&new->dt.di.dr_children, dr);
- dr->dr_parent = new;
- }
- }
- mutex_exit(&new->dt.di.dr_mtx);
- mutex_exit(&dn->dn_mtx);
- }
+ if (new_nlevels > dn->dn_nlevels)
+ dnode_set_nlevels_impl(dn, new_nlevels, tx);
out:
if (have_read)
@@ -1987,7 +2021,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*/
return (SET_ERROR(ESRCH));
}
- error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+ error = dbuf_read(db, NULL,
+ DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
if (error) {
dbuf_rele(db, FTAG);
return (error);
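For illustration only, the contract implied by the new dnode_set_nlevels() can be
summarized as follows. This is a sketch of the return values already encoded in
the function above, not additional behavior, and it assumes a held dnode and an
assigned tx.

        ASSERT0(dnode_set_nlevels(dn, dn->dn_nlevels, tx));        /* no-op  */
        ASSERT3U(dnode_set_nlevels(dn, dn->dn_nlevels - 1, tx),
            ==, EINVAL);                                            /* shrink */
        ASSERT0(dnode_set_nlevels(dn, dn->dn_nlevels + 1, tx));    /* grow   */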
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 742d962bc..c5ce4b2a2 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -31,6 +31,7 @@
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
@@ -557,6 +558,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
+ objset_t *os = dn->dn_objset;
dnode_phys_t *dnp = dn->dn_phys;
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
@@ -572,8 +574,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
- if (dmu_objset_userused_enabled(dn->dn_objset) &&
- !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ /*
+ * Do user accounting if it is enabled and this is not
+ * an encrypted receive.
+ */
+ if (dmu_objset_userused_enabled(os) &&
+ !DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
mutex_enter(&dn->dn_mtx);
dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
dn->dn_oldflags = dn->dn_phys->dn_flags;
@@ -584,7 +591,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
} else {
- /* Once we account for it, we should always account for it. */
+ /* Once we account for it, we should always account for it */
ASSERT(!(dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED));
ASSERT(!(dn->dn_phys->dn_flags &
diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c
new file mode 100644
index 000000000..af46dd753
--- /dev/null
+++ b/module/zfs/dsl_crypt.c
@@ -0,0 +1,2611 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/zvol.h>
+
+/*
+ * This file's primary purpose is for managing master encryption keys in
+ * memory and on disk. For more info on how these keys are used, see the
+ * block comment in zio_crypt.c.
+ *
+ * All master keys are stored encrypted on disk in the form of the DSL
+ * Crypto Key ZAP object. The binary key data in this object is always
+ * randomly generated and is encrypted with the user's wrapping key. This
+ * layer of indirection allows the user to change their key without
+ * needing to re-encrypt the entire dataset. The ZAP also holds on to the
+ * (non-encrypted) encryption algorithm identifier, IV, and MAC needed to
+ * safely decrypt the master key. For more info on the user's key see the
+ * block comment in libzfs_crypto.c
+ *
+ * In-memory encryption keys are managed through the spa_keystore. The
+ * keystore consists of 3 AVL trees, which are as follows:
+ *
+ * The Wrapping Key Tree:
+ * The wrapping key (wkey) tree stores the user's keys that are fed into the
+ * kernel through 'zfs load-key' and related commands. Datasets inherit their
+ * parent's wkey by default, so these structures are refcounted. The wrapping
+ * keys remain in memory until they are explicitly unloaded (with
+ * "zfs unload-key"). Unloading is only possible when no datasets are using
+ * them (refcount=0).
+ *
+ * The DSL Crypto Key Tree:
+ * The DSL Crypto Keys (DCK) are the in-memory representation of decrypted
+ * master keys. They are used by the functions in zio_crypt.c to perform
+ * encryption, decryption, and authentication. Snapshots and clones of a given
+ * dataset will share a DSL Crypto Key, so they are also refcounted. Once the
+ * refcount on a key hits zero, it is immediately zeroed out and freed.
+ *
+ * The Crypto Key Mapping Tree:
+ * The zio layer needs to lookup master keys by their dataset object id. Since
+ * the DSL Crypto Keys can belong to multiple datasets, we maintain a tree of
+ * dsl_key_mapping_t's which essentially just map the dataset object id to its
+ * appropriate DSL Crypto Key. The management for creating and destroying these
+ * mappings hooks into the code for owning and disowning datasets. Usually,
+ * there will only be one active dataset owner, but there are times
+ * (particularly during dataset creation and destruction) when this may not be
+ * true or the dataset may not be initialized enough to own. As a result, this
+ * object is also refcounted.
+ */
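For illustration only: a minimal sketch of how the three trees described above
are meant to be consumed. An objset owner registers a dsobj -> key mapping, the
zio/arc layer later resolves the dsobj back to a DSL Crypto Key, and each side
drops its reference when done. The function names and signatures are the ones
defined later in this file; the consumer itself and its error handling are
hypothetical.

static int
example_keystore_consumer(spa_t *spa, dsl_dataset_t *ds, void *tag)
{
        dsl_crypto_key_t *dck;
        int err;

        /* typically done when the dataset is owned */
        err = spa_keystore_create_mapping(spa, ds, tag);
        if (err != 0)
                return (err);

        /* typically done by zio/arc when encrypting or decrypting a block */
        err = spa_keystore_lookup_key(spa, ds->ds_object, tag, &dck);
        if (err == 0)
                spa_keystore_dsl_key_rele(spa, dck, tag);

        /* typically done when the dataset is disowned */
        (void) spa_keystore_remove_mapping(spa, ds->ds_object, tag);
        return (err);
}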
+
+static void
+dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag)
+{
+ (void) refcount_add(&wkey->wk_refcnt, tag);
+}
+
+static void
+dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag)
+{
+ (void) refcount_remove(&wkey->wk_refcnt, tag);
+}
+
+static void
+dsl_wrapping_key_free(dsl_wrapping_key_t *wkey)
+{
+ ASSERT0(refcount_count(&wkey->wk_refcnt));
+
+ if (wkey->wk_key.ck_data) {
+ bzero(wkey->wk_key.ck_data,
+ BITS_TO_BYTES(wkey->wk_key.ck_length));
+ kmem_free(wkey->wk_key.ck_data,
+ BITS_TO_BYTES(wkey->wk_key.ck_length));
+ }
+
+ refcount_destroy(&wkey->wk_refcnt);
+ kmem_free(wkey, sizeof (dsl_wrapping_key_t));
+}
+
+static int
+dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat,
+ uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey;
+
+ /* allocate the wrapping key */
+ wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP);
+ if (!wkey)
+ return (SET_ERROR(ENOMEM));
+
+ /* allocate and initialize the underlying crypto key */
+ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP);
+ if (!wkey->wk_key.ck_data) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ wkey->wk_key.ck_format = CRYPTO_KEY_RAW;
+ wkey->wk_key.ck_length = BYTES_TO_BITS(WRAPPING_KEY_LEN);
+ bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN);
+
+ /* initialize the rest of the struct */
+ refcount_create(&wkey->wk_refcnt);
+ wkey->wk_keyformat = keyformat;
+ wkey->wk_salt = salt;
+ wkey->wk_iters = iters;
+
+ *wkey_out = wkey;
+ return (0);
+
+error:
+ dsl_wrapping_key_free(wkey);
+
+ *wkey_out = NULL;
+ return (ret);
+}
+
+int
+dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props,
+ nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out)
+{
+ int ret;
+ uint64_t crypt = ZIO_CRYPT_INHERIT;
+ uint64_t keyformat = ZFS_KEYFORMAT_NONE;
+ uint64_t salt = 0, iters = 0;
+ dsl_crypto_params_t *dcp = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint8_t *wkeydata = NULL;
+ uint_t wkeydata_len = 0;
+ char *keylocation = NULL;
+
+ dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP);
+ if (!dcp) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ dcp->cp_cmd = cmd;
+
+ /* get relevant arguments from the nvlists */
+ if (props != NULL) {
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat);
+ (void) nvlist_lookup_string(props,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), &salt);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters);
+
+ dcp->cp_crypt = crypt;
+ }
+
+ if (crypto_args != NULL) {
+ (void) nvlist_lookup_uint8_array(crypto_args, "wkeydata",
+ &wkeydata, &wkeydata_len);
+ }
+
+ /* check for valid command */
+ if (dcp->cp_cmd >= DCP_CMD_MAX) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else {
+ dcp->cp_cmd = cmd;
+ }
+
+ /* check for valid crypt */
+ if (dcp->cp_crypt >= ZIO_CRYPT_FUNCTIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else {
+ dcp->cp_crypt = crypt;
+ }
+
+ /* check for valid keyformat */
+ if (keyformat >= ZFS_KEYFORMAT_FORMATS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check for a valid keylocation (of any kind) and copy it in */
+ if (keylocation != NULL) {
+ if (!zfs_prop_valid_keylocation(keylocation, B_FALSE)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ dcp->cp_keylocation = spa_strdup(keylocation);
+ }
+
+ /* check wrapping key length, if given */
+ if (wkeydata != NULL && wkeydata_len != WRAPPING_KEY_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* if the user asked for the default crypt, determine that now */
+ if (dcp->cp_crypt == ZIO_CRYPT_ON)
+ dcp->cp_crypt = ZIO_CRYPT_ON_VALUE;
+
+ /* create the wrapping key from the raw data */
+ if (wkeydata != NULL) {
+ /* create the wrapping key with the verified parameters */
+ ret = dsl_wrapping_key_create(wkeydata, keyformat, salt,
+ iters, &wkey);
+ if (ret != 0)
+ goto error;
+
+ dcp->cp_wkey = wkey;
+ }
+
+ /*
+ * Remove the encryption properties from the nvlist since they are not
+ * maintained through the DSL.
+ */
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION));
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
+ (void) nvlist_remove_all(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
+
+ *dcp_out = dcp;
+
+ return (0);
+
+error:
+ if (wkey != NULL)
+ dsl_wrapping_key_free(wkey);
+ if (dcp != NULL)
+ kmem_free(dcp, sizeof (dsl_crypto_params_t));
+
+ *dcp_out = NULL;
+ return (ret);
+}
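For illustration only: a hedged sketch of the nvlists a caller might hand to
dsl_crypto_params_create_nvlist() when creating an encrypted dataset. The
property names come from zfs_prop_to_name(); ZIO_CRYPT_AES_256_CCM,
ZFS_KEYFORMAT_RAW, and the fnvlist_*() helpers are assumed from the wider change
and from libnvpair, and the helper itself is hypothetical.

static int
example_build_dcp(uint8_t *wkeydata, dsl_crypto_params_t **dcpp)
{
        nvlist_t *props = fnvlist_alloc();
        nvlist_t *crypto_args = fnvlist_alloc();
        int err;

        /* on-disk encryption suite and how the wrapping key is derived */
        fnvlist_add_uint64(props,
            zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_AES_256_CCM);
        fnvlist_add_uint64(props,
            zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW);
        fnvlist_add_string(props,
            zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt");

        /* the raw wrapping key itself travels outside the prop list */
        fnvlist_add_uint8_array(crypto_args, "wkeydata", wkeydata,
            WRAPPING_KEY_LEN);

        err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props,
            crypto_args, dcpp);

        fnvlist_free(crypto_args);
        fnvlist_free(props);
        return (err);
}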
+
+void
+dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload)
+{
+ if (dcp == NULL)
+ return;
+
+ if (dcp->cp_keylocation != NULL)
+ spa_strfree(dcp->cp_keylocation);
+ if (unload && dcp->cp_wkey != NULL)
+ dsl_wrapping_key_free(dcp->cp_wkey);
+
+ kmem_free(dcp, sizeof (dsl_crypto_params_t));
+}
+
+static int
+spa_crypto_key_compare(const void *a, const void *b)
+{
+ const dsl_crypto_key_t *dcka = a;
+ const dsl_crypto_key_t *dckb = b;
+
+ if (dcka->dck_obj < dckb->dck_obj)
+ return (-1);
+ if (dcka->dck_obj > dckb->dck_obj)
+ return (1);
+ return (0);
+}
+
+static int
+spa_key_mapping_compare(const void *a, const void *b)
+{
+ const dsl_key_mapping_t *kma = a;
+ const dsl_key_mapping_t *kmb = b;
+
+ if (kma->km_dsobj < kmb->km_dsobj)
+ return (-1);
+ if (kma->km_dsobj > kmb->km_dsobj)
+ return (1);
+ return (0);
+}
+
+static int
+spa_wkey_compare(const void *a, const void *b)
+{
+ const dsl_wrapping_key_t *wka = a;
+ const dsl_wrapping_key_t *wkb = b;
+
+ if (wka->wk_ddobj < wkb->wk_ddobj)
+ return (-1);
+ if (wka->wk_ddobj > wkb->wk_ddobj)
+ return (1);
+ return (0);
+}
+
+void
+spa_keystore_init(spa_keystore_t *sk)
+{
+ rw_init(&sk->sk_dk_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&sk->sk_km_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&sk->sk_wkeys_lock, NULL, RW_DEFAULT, NULL);
+ avl_create(&sk->sk_dsl_keys, spa_crypto_key_compare,
+ sizeof (dsl_crypto_key_t),
+ offsetof(dsl_crypto_key_t, dck_avl_link));
+ avl_create(&sk->sk_key_mappings, spa_key_mapping_compare,
+ sizeof (dsl_key_mapping_t),
+ offsetof(dsl_key_mapping_t, km_avl_link));
+ avl_create(&sk->sk_wkeys, spa_wkey_compare, sizeof (dsl_wrapping_key_t),
+ offsetof(dsl_wrapping_key_t, wk_avl_link));
+}
+
+void
+spa_keystore_fini(spa_keystore_t *sk)
+{
+ dsl_wrapping_key_t *wkey;
+ void *cookie = NULL;
+
+ ASSERT(avl_is_empty(&sk->sk_dsl_keys));
+ ASSERT(avl_is_empty(&sk->sk_key_mappings));
+
+ while ((wkey = avl_destroy_nodes(&sk->sk_wkeys, &cookie)) != NULL)
+ dsl_wrapping_key_free(wkey);
+
+ avl_destroy(&sk->sk_wkeys);
+ avl_destroy(&sk->sk_key_mappings);
+ avl_destroy(&sk->sk_dsl_keys);
+ rw_destroy(&sk->sk_wkeys_lock);
+ rw_destroy(&sk->sk_km_lock);
+ rw_destroy(&sk->sk_dk_lock);
+}
+
+int
+dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj)
+{
+ if (dd->dd_crypto_obj == 0)
+ return (SET_ERROR(ENOENT));
+
+ return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj));
+}
+
+static int
+spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj,
+ void *tag, dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t search_wkey;
+ dsl_wrapping_key_t *found_wkey;
+
+ ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_wkeys_lock));
+
+ /* init the search wrapping key */
+ search_wkey.wk_ddobj = ddobj;
+
+ /* lookup the wrapping key */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL);
+ if (!found_wkey) {
+ ret = SET_ERROR(ENOENT);
+ goto error;
+ }
+
+ /* increment the refcount */
+ dsl_wrapping_key_hold(found_wkey, tag);
+
+ *wkey_out = found_wkey;
+ return (0);
+
+error:
+ *wkey_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+ dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey;
+ uint64_t rddobj;
+ boolean_t locked = B_FALSE;
+
+ if (!RW_WRITE_HELD(&spa->spa_keystore.sk_wkeys_lock)) {
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_READER);
+ locked = B_TRUE;
+ }
+
+ /* get the ddobj that the keylocation property was inherited from */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ /* lookup the wkey in the avl tree */
+ ret = spa_keystore_wkey_hold_ddobj_impl(spa, rddobj, tag, &wkey);
+ if (ret != 0)
+ goto error;
+
+ /* unlock the wkey tree if we locked it */
+ if (locked)
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ *wkey_out = wkey;
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ *wkey_out = NULL;
+ return (ret);
+}
+
+int
+dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation)
+{
+ int ret = 0;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint64_t rddobj;
+
+ /* hold the dsl dir */
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto out;
+
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0)
+ goto out;
+
+ /* if dd is not encrypted, the value may only be "none" */
+ if (dd->dd_crypto_obj == 0) {
+ if (strcmp(keylocation, "none") != 0) {
+ ret = SET_ERROR(EACCES);
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ /* check for a valid keylocation for encrypted datasets */
+ if (!zfs_prop_valid_keylocation(keylocation, B_TRUE)) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /* check that this is an encryption root */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto out;
+
+ if (rddobj != dd->dd_object) {
+ ret = SET_ERROR(EACCES);
+ goto out;
+ }
+
+ if (wkey != NULL)
+ dsl_wrapping_key_rele(wkey, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ return (0);
+
+out:
+ if (wkey != NULL)
+ dsl_wrapping_key_rele(wkey, FTAG);
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+static void
+dsl_crypto_key_free(dsl_crypto_key_t *dck)
+{
+ ASSERT(refcount_count(&dck->dck_holds) == 0);
+
+ /* destroy the zio_crypt_key_t */
+ zio_crypt_key_destroy(&dck->dck_key);
+
+ /* free the refcount, wrapping key, and lock */
+ refcount_destroy(&dck->dck_holds);
+ if (dck->dck_wkey)
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+
+ /* free the key */
+ kmem_free(dck, sizeof (dsl_crypto_key_t));
+}
+
+static void
+dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag)
+{
+ if (refcount_remove(&dck->dck_holds, tag) == 0)
+ dsl_crypto_key_free(dck);
+}
+
+static int
+dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
+ uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ uint64_t crypt = 0, guid = 0;
+ uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
+ uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+ dsl_crypto_key_t *dck;
+
+ /* allocate and initialize the key */
+ dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP);
+ if (!dck)
+ return (SET_ERROR(ENOMEM));
+
+ /* fetch all of the values we need from the ZAP */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, raw_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, raw_hmac_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * Unwrap the keys. If there is an error return EACCES to indicate
+ * an authentication failure.
+ */
+ ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, guid, raw_keydata,
+ raw_hmac_keydata, iv, mac, &dck->dck_key);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+
+ /* finish initializing the dsl_crypto_key_t */
+ refcount_create(&dck->dck_holds);
+ dsl_wrapping_key_hold(wkey, dck);
+ dck->dck_wkey = wkey;
+ dck->dck_obj = dckobj;
+ refcount_add(&dck->dck_holds, tag);
+
+ *dck_out = dck;
+ return (0);
+
+error:
+ if (dck != NULL) {
+ bzero(dck, sizeof (dsl_crypto_key_t));
+ kmem_free(dck, sizeof (dsl_crypto_key_t));
+ }
+
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ dsl_crypto_key_t search_dck;
+ dsl_crypto_key_t *found_dck;
+
+ ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_dk_lock));
+
+ /* init the search key */
+ search_dck.dck_obj = dckobj;
+
+ /* find the matching key in the keystore */
+ found_dck = avl_find(&spa->spa_keystore.sk_dsl_keys, &search_dck, NULL);
+ if (!found_dck) {
+ ret = SET_ERROR(ENOENT);
+ goto error;
+ }
+
+ /* increment the refcount */
+ refcount_add(&found_dck->dck_holds, tag);
+
+ *dck_out = found_dck;
+ return (0);
+
+error:
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ avl_index_t where;
+ dsl_crypto_key_t *dck = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint64_t dckobj = dd->dd_crypto_obj;
+
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
+
+ /* lookup the key in the tree of currently loaded keys */
+ ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck);
+ if (!ret) {
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+ *dck_out = dck;
+ return (0);
+ }
+
+ /* lookup the wrapping key from the keystore */
+ ret = spa_keystore_wkey_hold_dd(spa, dd, FTAG, &wkey);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ goto error_unlock;
+ }
+
+ /* read the key from disk */
+ ret = dsl_crypto_key_open(spa->spa_meta_objset, wkey, dckobj,
+ tag, &dck);
+ if (ret != 0)
+ goto error_unlock;
+
+ /*
+ * add the key to the keystore (this should always succeed
+ * since we made sure it didn't exist before)
+ */
+ avl_find(&spa->spa_keystore.sk_dsl_keys, dck, &where);
+ avl_insert(&spa->spa_keystore.sk_dsl_keys, dck, where);
+
+ /* release the wrapping key (the dsl key now has a reference to it) */
+ dsl_wrapping_key_rele(wkey, FTAG);
+
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+
+ *dck_out = dck;
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+ if (wkey != NULL)
+ dsl_wrapping_key_rele(wkey, FTAG);
+
+ *dck_out = NULL;
+ return (ret);
+}
+
+void
+spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag)
+{
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
+
+ if (refcount_remove(&dck->dck_holds, tag) == 0) {
+ avl_remove(&spa->spa_keystore.sk_dsl_keys, dck);
+ dsl_crypto_key_free(dck);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+}
+
+int
+spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey)
+{
+ int ret;
+ avl_index_t where;
+ dsl_wrapping_key_t *found_wkey;
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* insert the wrapping key into the keystore */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where);
+ if (found_wkey != NULL) {
+ ret = SET_ERROR(EEXIST);
+ goto error_unlock;
+ }
+ avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where);
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+ return (ret);
+}
+
+int
+spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp,
+ boolean_t noop)
+{
+ int ret;
+ dsl_dir_t *dd = NULL;
+ dsl_crypto_key_t *dck = NULL;
+ dsl_wrapping_key_t *wkey = dcp->cp_wkey;
+ dsl_pool_t *dp = NULL;
+
+ /*
+ * We don't validate the wrapping key's keyformat, salt, or iters
+ * since they will never be needed after the DCK has been wrapped.
+ */
+ if (dcp->cp_wkey == NULL ||
+ dcp->cp_cmd != DCP_CMD_NONE ||
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL)
+ return (SET_ERROR(EINVAL));
+
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = (SET_ERROR(ENOTSUP));
+ goto error;
+ }
+
+ /* hold the dsl dir */
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0)
+ goto error;
+
+ /* initialize the wkey's ddobj */
+ wkey->wk_ddobj = dd->dd_object;
+
+ /* verify that the wkey is correct by opening its dsl key */
+ ret = dsl_crypto_key_open(dp->dp_meta_objset, wkey,
+ dd->dd_crypto_obj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * At this point we have verified the key. We can simply cleanup and
+ * return if this is all the user wanted to do.
+ */
+ if (noop)
+ goto error;
+
+ /* insert the wrapping key into the keystore */
+ ret = spa_keystore_load_wkey_impl(dp->dp_spa, wkey);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_key_rele(dck, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /* create any zvols under this ds */
+ zvol_create_minors(dp->dp_spa, dsname, B_TRUE);
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ dsl_crypto_key_rele(dck, FTAG);
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+int
+spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj)
+{
+ int ret;
+ dsl_wrapping_key_t search_wkey;
+ dsl_wrapping_key_t *found_wkey;
+
+ /* init the search wrapping key */
+ search_wkey.wk_ddobj = ddobj;
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* remove the wrapping key from the keystore */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys,
+ &search_wkey, NULL);
+ if (!found_wkey) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ } else if (refcount_count(&found_wkey->wk_refcnt) != 0) {
+ ret = SET_ERROR(EBUSY);
+ goto error_unlock;
+ }
+ avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey);
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ /* free the wrapping key */
+ dsl_wrapping_key_free(found_wkey);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+ return (ret);
+}
+
+int
+spa_keystore_unload_wkey(const char *dsname)
+{
+ int ret = 0;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = NULL;
+
+ /* hold the dsl dir */
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = (SET_ERROR(ENOTSUP));
+ goto error;
+ }
+
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0)
+ goto error;
+
+ /* unload the wkey */
+ ret = spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
+ if (ret != 0)
+ goto error;
+
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /* remove any zvols under this ds */
+ zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
+
+ return (0);
+
+error:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+int
+spa_keystore_create_mapping_impl(spa_t *spa, uint64_t dsobj,
+ dsl_dir_t *dd, void *tag)
+{
+ int ret;
+ avl_index_t where;
+ dsl_key_mapping_t *km = NULL, *found_km;
+ boolean_t should_free = B_FALSE;
+
+ /* allocate the mapping */
+ km = kmem_alloc(sizeof (dsl_key_mapping_t), KM_SLEEP);
+ if (!km)
+ return (SET_ERROR(ENOMEM));
+
+ /* initialize the mapping */
+ refcount_create(&km->km_refcnt);
+
+ ret = spa_keystore_dsl_key_hold_dd(spa, dd, km, &km->km_key);
+ if (ret != 0)
+ goto error;
+
+ km->km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
+
+ /*
+ * If a mapping already exists, simply increment its refcount and
+ * cleanup the one we made. We want to allocate / free outside of
+ * the lock because this lock is also used by the zio layer to lookup
+ * key mappings. Otherwise, use the one we created. Normally, there will
+ * only be one active reference at a time (the objset owner), but there
+ * are times when there could be multiple async users.
+ */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, km, &where);
+ if (found_km != NULL) {
+ should_free = B_TRUE;
+ refcount_add(&found_km->km_refcnt, tag);
+ } else {
+ refcount_add(&km->km_refcnt, tag);
+ avl_insert(&spa->spa_keystore.sk_key_mappings, km, where);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (should_free) {
+ spa_keystore_dsl_key_rele(spa, km->km_key, km);
+ refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+ }
+
+ return (0);
+
+error:
+ if (km->km_key)
+ spa_keystore_dsl_key_rele(spa, km->km_key, km);
+
+ refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+
+ return (ret);
+}
+
+int
+spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag)
+{
+ return (spa_keystore_create_mapping_impl(spa, ds->ds_object,
+ ds->ds_dir, tag));
+}
+
+int
+spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag)
+{
+ int ret;
+ dsl_key_mapping_t search_km;
+ dsl_key_mapping_t *found_km;
+ boolean_t should_free = B_FALSE;
+
+ /* init the search key mapping */
+ search_km.km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
+
+ /* find the matching mapping */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings,
+ &search_km, NULL);
+ if (found_km == NULL) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ }
+
+ /*
+ * Decrement the refcount on the mapping and remove it from the tree if
+ * it is zero. Try to minimize time spent in this lock by deferring
+ * cleanup work.
+ */
+ if (refcount_remove(&found_km->km_refcnt, tag) == 0) {
+ should_free = B_TRUE;
+ avl_remove(&spa->spa_keystore.sk_key_mappings, found_km);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ /* destroy the key mapping */
+ if (should_free) {
+ spa_keystore_dsl_key_rele(spa, found_km->km_key, found_km);
+ kmem_free(found_km, sizeof (dsl_key_mapping_t));
+ }
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+ return (ret);
+}
+
+/*
+ * This function is primarily used by the zio and arc layer to lookup
+ * DSL Crypto Keys for encryption. Callers must release the key with
+ * spa_keystore_dsl_key_rele(). The function may also be called with
+ * dck_out == NULL and tag == NULL to simply check that a key exists
+ * without getting a reference to it.
+ */
+int
+spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ dsl_key_mapping_t search_km;
+ dsl_key_mapping_t *found_km;
+
+ ASSERT((tag != NULL && dck_out != NULL) ||
+ (tag == NULL && dck_out == NULL));
+
+ /* init the search key mapping */
+ search_km.km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER);
+
+ /* find the matching mapping in the tree */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km,
+ NULL);
+ if (found_km == NULL) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ }
+
+ if (found_km && tag)
+ refcount_add(&found_km->km_key->dck_holds, tag);
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (dck_out != NULL)
+ *dck_out = found_km->km_key;
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (dck_out != NULL)
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+dmu_objset_check_wkey_loaded(dsl_dir_t *dd)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey = NULL;
+
+ ret = spa_keystore_wkey_hold_dd(dd->dd_pool->dp_spa, dd, FTAG,
+ &wkey);
+ if (ret != 0)
+ return (SET_ERROR(EACCES));
+
+ dsl_wrapping_key_rele(wkey, FTAG);
+
+ return (0);
+}
+
+static zfs_keystatus_t
+dsl_dataset_get_keystatus(dsl_dir_t *dd)
+{
+ /* check if this dd has a dsl key */
+ if (dd->dd_crypto_obj == 0)
+ return (ZFS_KEYSTATUS_NONE);
+
+ return (dmu_objset_check_wkey_loaded(dd) == 0 ?
+ ZFS_KEYSTATUS_AVAILABLE : ZFS_KEYSTATUS_UNAVAILABLE);
+}
+
+static int
+dsl_dir_get_crypt(dsl_dir_t *dd, uint64_t *crypt)
+{
+ if (dd->dd_crypto_obj == 0) {
+ *crypt = ZIO_CRYPT_OFF;
+ return (0);
+ }
+
+ return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, crypt));
+}
+
+static void
+dsl_crypto_key_sync_impl(objset_t *mos, uint64_t dckobj, uint64_t crypt,
+ uint64_t root_ddobj, uint64_t guid, uint8_t *iv, uint8_t *mac,
+ uint8_t *keydata, uint8_t *hmac_keydata, uint64_t keyformat,
+ uint64_t salt, uint64_t iters, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1,
+ &root_ddobj, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1,
+ &guid, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, keydata, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, hmac_keydata, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
+ 8, 1, &keyformat, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
+ 8, 1, &salt, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
+ 8, 1, &iters, tx));
+}
+
+static void
+dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx)
+{
+ zio_crypt_key_t *key = &dck->dck_key;
+ dsl_wrapping_key_t *wkey = dck->dck_wkey;
+ uint8_t keydata[MASTER_KEY_MAX_LEN];
+ uint8_t hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ /* encrypt and store the keys along with the IV and MAC */
+ VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac,
+ keydata, hmac_keydata));
+
+ /* update the ZAP with the obtained values */
+ dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj,
+ key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata,
+ hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters,
+ tx);
+}
+
+typedef struct spa_keystore_change_key_args {
+ const char *skcka_dsname;
+ dsl_crypto_params_t *skcka_cp;
+} spa_keystore_change_key_args_t;
+
+static int
+spa_keystore_change_key_check(void *arg, dmu_tx_t *tx)
+{
+ int ret;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_keystore_change_key_args_t *skcka = arg;
+ dsl_crypto_params_t *dcp = skcka->skcka_cp;
+ uint64_t rddobj;
+
+ /* check for the encryption feature */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /* check for valid key change command */
+ if (dcp->cp_cmd != DCP_CMD_NEW_KEY &&
+ dcp->cp_cmd != DCP_CMD_INHERIT &&
+ dcp->cp_cmd != DCP_CMD_FORCE_NEW_KEY &&
+ dcp->cp_cmd != DCP_CMD_FORCE_INHERIT) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* hold the dd */
+ ret = dsl_dir_hold(dp, skcka->skcka_dsname, FTAG, &dd, NULL);
+ if (ret != 0)
+ goto error;
+
+ /* verify that the dataset is encrypted */
+ if (dd->dd_crypto_obj == 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* clones must always use their origin's key */
+ if (dsl_dir_is_clone(dd)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* lookup the ddobj we are inheriting the keylocation from */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto error;
+
+	/* Handle inheritance */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT ||
+ dcp->cp_cmd == DCP_CMD_FORCE_INHERIT) {
+ /* no other encryption params should be given */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL ||
+ dcp->cp_wkey != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that this is an encryption root */
+ if (dd->dd_object != rddobj) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that the parent is encrypted */
+ if (dd->dd_parent->dd_crypto_obj == 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* if we are rewrapping check that both keys are loaded */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT) {
+ ret = dmu_objset_check_wkey_loaded(dd);
+ if (ret != 0)
+ goto error;
+
+ ret = dmu_objset_check_wkey_loaded(dd->dd_parent);
+ if (ret != 0)
+ goto error;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+ }
+
+ /* handle forcing an encryption root without rewrapping */
+ if (dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) {
+ /* no other encryption params should be given */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL ||
+ dcp->cp_wkey != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that this is not an encryption root */
+ if (dd->dd_object == rddobj) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+ }
+
+ /* crypt cannot be changed after creation */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+	/* we are not inheriting our parent's wkey so we need one ourselves */
+ if (dcp->cp_wkey == NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check for a valid keyformat for the new wrapping key */
+ if (dcp->cp_wkey->wk_keyformat >= ZFS_KEYFORMAT_FORMATS ||
+ dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_NONE) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /*
+ * If this dataset is not currently an encryption root we need a new
+ * keylocation for this dataset's new wrapping key. Otherwise we can
+ * just keep the one we already had.
+ */
+ if (dd->dd_object != rddobj && dcp->cp_keylocation == NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that the keylocation is valid if it is not NULL */
+ if (dcp->cp_keylocation != NULL &&
+ !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* passphrases require pbkdf2 salt and iters */
+ if (dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_PASSPHRASE) {
+ if (dcp->cp_wkey->wk_salt == 0 ||
+ dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ } else {
+ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ }
+
+ /* make sure the dd's wkey is loaded */
+ ret = dmu_objset_check_wkey_loaded(dd);
+ if (ret != 0)
+ goto error;
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (0);
+
+error:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+
+ return (ret);
+}
+
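+/*
+ * Rewrap (or re-point) the DSL Crypto Key of ddobj and of every child and
+ * clone that currently inherits its encryption root from rddobj, making
+ * new_rddobj the new encryption root. A NULL wkey only updates the root
+ * ddobj reference without rewrapping the keys.
+ */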
+static void
+spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
+ uint64_t new_rddobj, dsl_wrapping_key_t *wkey, dmu_tx_t *tx)
+{
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd = NULL;
+ dsl_crypto_key_t *dck = NULL;
+ uint64_t curr_rddobj;
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_spa->spa_keystore.sk_wkeys_lock));
+
+ /* hold the dd */
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+ /* ignore hidden dsl dirs */
+ if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ /* stop recursing if this dsl dir didn't inherit from the root */
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj));
+
+ if (curr_rddobj != rddobj) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ /*
+ * If we don't have a wrapping key just update the dck to reflect the
+ * new encryption root. Otherwise rewrap the entire dck and re-sync it
+ * to disk.
+ */
+ if (wkey == NULL) {
+ VERIFY0(zap_update(dp->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &new_rddobj, tx));
+ } else {
+ VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd,
+ FTAG, &dck));
+ dsl_wrapping_key_hold(wkey, dck);
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+ dck->dck_wkey = wkey;
+ dsl_crypto_key_sync(dck, tx);
+ spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG);
+ }
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Recurse into all child and clone dsl dirs. */
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ spa_keystore_change_key_sync_impl(rddobj,
+ za->za_first_integer, new_rddobj, wkey, tx);
+ }
+ zap_cursor_fini(zc);
+
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ za->za_first_integer, FTAG, &clone));
+ spa_keystore_change_key_sync_impl(rddobj,
+ clone->ds_dir->dd_object, new_rddobj, wkey, tx);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+static void
+spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds;
+ avl_index_t where;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ spa_keystore_change_key_args_t *skcka = arg;
+ dsl_crypto_params_t *dcp = skcka->skcka_cp;
+ dsl_wrapping_key_t *wkey = NULL, *found_wkey;
+ dsl_wrapping_key_t wkey_search;
+ char *keylocation = dcp->cp_keylocation;
+ uint64_t rddobj, new_rddobj;
+
+ /* create and initialize the wrapping key */
+ VERIFY0(dsl_dataset_hold(dp, skcka->skcka_dsname, FTAG, &ds));
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY ||
+ dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) {
+ /*
+ * We are changing to a new wkey. Set additional properties
+ * which can be sent along with this ioctl. Note that this
+ * command can set keylocation even if it can't normally be
+ * set via 'zfs set' due to a non-local keylocation.
+ */
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) {
+ wkey = dcp->cp_wkey;
+ wkey->wk_ddobj = ds->ds_dir->dd_object;
+ } else {
+ keylocation = "prompt";
+ }
+
+ if (keylocation != NULL) {
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
+ keylocation, tx);
+ }
+
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj));
+ new_rddobj = ds->ds_dir->dd_object;
+ } else {
+ /*
+		 * We are inheriting the parent's wkey. Unset any local
+ * keylocation and grab a reference to the wkey.
+ */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT) {
+ VERIFY0(spa_keystore_wkey_hold_dd(spa,
+ ds->ds_dir->dd_parent, FTAG, &wkey));
+ }
+
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE,
+ 0, 0, NULL, tx);
+
+ rddobj = ds->ds_dir->dd_object;
+ new_rddobj = ds->ds_dir->dd_parent->dd_object;
+ }
+
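+	/* wkey is only NULL for the force commands, which do not rewrap keys */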
+ if (wkey == NULL) {
+ ASSERT(dcp->cp_cmd == DCP_CMD_FORCE_INHERIT ||
+ dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY);
+ }
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* recurse through all children and rewrap their keys */
+ spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object,
+ new_rddobj, wkey, tx);
+
+ /*
+ * All references to the old wkey should be released now (if it
+ * existed). Replace the wrapping key.
+ */
+ wkey_search.wk_ddobj = ds->ds_dir->dd_object;
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &wkey_search, NULL);
+ if (found_wkey != NULL) {
+ ASSERT0(refcount_count(&found_wkey->wk_refcnt));
+ avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey);
+ dsl_wrapping_key_free(found_wkey);
+ }
+
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) {
+ avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where);
+ avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where);
+ } else if (wkey != NULL) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp)
+{
+ spa_keystore_change_key_args_t skcka;
+
+ /* initialize the args struct */
+ skcka.skcka_dsname = dsname;
+ skcka.skcka_cp = dcp;
+
+ /*
+ * Perform the actual work in syncing context. The blocks modified
+ * here could be calculated but it would require holding the pool
+	 * lock and traversing all of the datasets that will have their keys
+ * changed.
+ */
+ return (dsl_sync_task(dsname, spa_keystore_change_key_check,
+ spa_keystore_change_key_sync, &skcka, 15,
+ ZFS_SPACE_CHECK_RESERVED));
+}
+
+int
+dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent)
+{
+ int ret;
+ uint64_t curr_rddobj, parent_rddobj;
+
+ if (dd->dd_crypto_obj == 0) {
+ /* children of encrypted parents must be encrypted */
+ if (newparent->dd_crypto_obj != 0) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+
+ return (0);
+ }
+
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * if this is not an encryption root, we must make sure we are not
+ * moving dd to a new encryption root
+ */
+ if (dd->dd_object != curr_rddobj) {
+ ret = dsl_dir_get_encryption_root_ddobj(newparent,
+ &parent_rddobj);
+ if (ret != 0)
+ goto error;
+
+ if (parent_rddobj != curr_rddobj) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * Check to make sure that a promote from targetdd to origindd will not require
+ * any key rewraps.
+ */
+int
+dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin)
+{
+ int ret;
+ uint64_t rddobj, op_rddobj, tp_rddobj;
+
+ /* If the dataset is not encrypted we don't need to check anything */
+ if (origin->dd_crypto_obj == 0)
+ return (0);
+
+ /*
+ * If we are not changing the first origin snapshot in a chain
+ * the encryption root won't change either.
+ */
+ if (dsl_dir_is_clone(origin))
+ return (0);
+
+ /*
+ * If the origin is the encryption root we will update
+ * the DSL Crypto Key to point to the target instead.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(origin, &rddobj);
+ if (ret != 0)
+ return (ret);
+
+ if (rddobj == origin->dd_object)
+ return (0);
+
+ /*
+ * The origin is inheriting its encryption root from its parent.
+ * Check that the parent of the target has the same encryption root.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj);
+ if (ret != 0)
+ return (ret);
+
+ ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj);
+ if (ret != 0)
+ return (ret);
+
+ if (op_rddobj != tp_rddobj)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+void
+dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
+ dmu_tx_t *tx)
+{
+ uint64_t rddobj;
+ dsl_pool_t *dp = target->dd_pool;
+ dsl_dataset_t *targetds;
+ dsl_dataset_t *originds;
+ char *keylocation;
+
+ if (origin->dd_crypto_obj == 0)
+ return;
+ if (dsl_dir_is_clone(origin))
+ return;
+
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(origin, &rddobj));
+
+ if (rddobj != origin->dd_object)
+ return;
+
+ /*
+	 * If the target is being promoted to the encryption root, update the
+	 * DSL Crypto Key and keylocation to reflect that. We also need to
+	 * update the DSL Crypto Keys of all children inheriting their
+ * encryption root to point to the new target. Otherwise, the check
+ * function ensured that the encryption root will not change.
+ */
+ keylocation = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(target)->dd_head_dataset_obj, FTAG, &targetds));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(origin)->dd_head_dataset_obj, FTAG, &originds));
+
+ VERIFY0(dsl_prop_get_dd(origin, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ 1, ZAP_MAXVALUELEN, keylocation, NULL, B_FALSE));
+ dsl_prop_set_sync_impl(targetds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx);
+ dsl_prop_set_sync_impl(originds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_NONE, 0, 0, NULL, tx);
+
+ rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+ spa_keystore_change_key_sync_impl(rddobj, origin->dd_object,
+ target->dd_object, NULL, tx);
+ rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock);
+
+ dsl_dataset_rele(targetds, FTAG);
+ dsl_dataset_rele(originds, FTAG);
+ kmem_free(keylocation, ZAP_MAXVALUELEN);
+}
+
+int
+dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd)
+{
+ int ret;
+ uint64_t pcrypt, crypt;
+
+ /*
+ * Check that we are not making an unencrypted child of an
+ * encrypted parent.
+ */
+ ret = dsl_dir_get_crypt(parentdd, &pcrypt);
+ if (ret != 0)
+ return (ret);
+
+ ret = dsl_dir_get_crypt(origindd, &crypt);
+ if (ret != 0)
+ return (ret);
+
+ ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT);
+ ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT);
+
+ if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
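+/*
+ * Validate the crypto params for a dataset being created under parentdd.
+ * Encryption may be inherited from the parent or fully specified here, but
+ * an unencrypted child of an encrypted parent is never allowed.
+ */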
+int
+dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp)
+{
+ int ret;
+ uint64_t pcrypt, crypt;
+
+ if (dcp->cp_cmd != DCP_CMD_NONE)
+ return (SET_ERROR(EINVAL));
+
+ if (parentdd != NULL) {
+ ret = dsl_dir_get_crypt(parentdd, &pcrypt);
+ if (ret != 0)
+ return (ret);
+ } else {
+ pcrypt = ZIO_CRYPT_OFF;
+ }
+
+ crypt = (dcp->cp_crypt == ZIO_CRYPT_INHERIT) ? pcrypt : dcp->cp_crypt;
+
+ ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT);
+ ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT);
+
+ /*
+ * We can't create an unencrypted child of an encrypted parent
+ * under any circumstances.
+ */
+ if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ /* check for valid dcp with no encryption (inherited or local) */
+ if (crypt == ZIO_CRYPT_OFF) {
+ /* Must not specify encryption params */
+ if (dcp->cp_wkey != NULL ||
+ (dcp->cp_keylocation != NULL &&
+ strcmp(dcp->cp_keylocation, "none") != 0))
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+ }
+
+ /*
+ * We will now definitely be encrypting. Check the feature flag. When
+ * creating the pool the caller will check this for us since we won't
+	 * technically have the feature activated yet.
+ */
+ if (parentdd != NULL &&
+ !spa_feature_is_enabled(parentdd->dd_pool->dp_spa,
+ SPA_FEATURE_ENCRYPTION)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+	/* handle inheritance */
+ if (dcp->cp_wkey == NULL) {
+ ASSERT3P(parentdd, !=, NULL);
+
+ /* key must be fully unspecified */
+ if (dcp->cp_keylocation != NULL)
+ return (SET_ERROR(EINVAL));
+
+ /* parent must have a key to inherit */
+ if (pcrypt == ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ /* check for parent key */
+ ret = dmu_objset_check_wkey_loaded(parentdd);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+ }
+
+ /* At this point we should have a fully specified key. Check location */
+ if (dcp->cp_keylocation == NULL ||
+ !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE))
+ return (SET_ERROR(EINVAL));
+
+ /* Must have fully specified keyformat */
+ switch (dcp->cp_wkey->wk_keyformat) {
+ case ZFS_KEYFORMAT_HEX:
+ case ZFS_KEYFORMAT_RAW:
+ /* requires no pbkdf2 iters and salt */
+ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0)
+ return (SET_ERROR(EINVAL));
+ break;
+ case ZFS_KEYFORMAT_PASSPHRASE:
+ /* requires pbkdf2 iters and salt */
+ if (dcp->cp_wkey->wk_salt == 0 ||
+ dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS)
+ return (SET_ERROR(EINVAL));
+ break;
+ case ZFS_KEYFORMAT_NONE:
+ default:
+ /* keyformat must be specified and valid */
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
+
+void
+dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd,
+ dsl_dataset_t *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ uint64_t crypt;
+ dsl_wrapping_key_t *wkey;
+
+ /* clones always use their origin's wrapping key */
+ if (dsl_dir_is_clone(dd)) {
+ ASSERT3P(dcp, ==, NULL);
+
+ /*
+ * If this is an encrypted clone we just need to clone the
+ * dck into dd. Zapify the dd so we can do that.
+ */
+ if (origin->ds_dir->dd_crypto_obj != 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ dd->dd_crypto_obj =
+ dsl_crypto_key_clone_sync(origin->ds_dir, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1,
+ &dd->dd_crypto_obj, tx));
+ }
+
+ return;
+ }
+
+ /*
+ * A NULL dcp at this point indicates this is the origin dataset
+ * which does not have an objset to encrypt. Raw receives will handle
+	 * encryption separately later. In both cases we can simply return.
+ */
+ if (dcp == NULL || dcp->cp_cmd == DCP_CMD_RAW_RECV)
+ return;
+
+ crypt = dcp->cp_crypt;
+ wkey = dcp->cp_wkey;
+
+ /* figure out the effective crypt */
+ if (crypt == ZIO_CRYPT_INHERIT && dd->dd_parent != NULL)
+ VERIFY0(dsl_dir_get_crypt(dd->dd_parent, &crypt));
+
+ /* if we aren't doing encryption just return */
+ if (crypt == ZIO_CRYPT_OFF || crypt == ZIO_CRYPT_INHERIT)
+ return;
+
+ /* zapify the dd so that we can add the crypto key obj to it */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ /* use the new key if given or inherit from the parent */
+ if (wkey == NULL) {
+ VERIFY0(spa_keystore_wkey_hold_dd(dp->dp_spa,
+ dd->dd_parent, FTAG, &wkey));
+ } else {
+ wkey->wk_ddobj = dd->dd_object;
+ }
+
+	/* Create the DSL Crypto Key on disk and activate the feature */
+ dd->dd_crypto_obj = dsl_crypto_key_create_sync(crypt, wkey, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj,
+ tx));
+ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, tx);
+
+ /*
+ * If we inherited the wrapping key we release our reference now.
+ * Otherwise, this is a new key and we need to load it into the
+ * keystore.
+ */
+ if (dcp->cp_wkey == NULL) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ } else {
+ VERIFY0(spa_keystore_load_wkey_impl(dp->dp_spa, wkey));
+ }
+}
+
+typedef struct dsl_crypto_recv_key_arg {
+ uint64_t dcrka_dsobj;
+ nvlist_t *dcrka_nvl;
+ dmu_objset_type_t dcrka_ostype;
+} dsl_crypto_recv_key_arg_t;
+
+int
+dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx)
+{
+ int ret;
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ objset_t *os;
+ dnode_t *mdn;
+ dsl_crypto_recv_key_arg_t *dcrka = arg;
+ nvlist_t *nvl = dcrka->dcrka_nvl;
+ dsl_dataset_t *ds = NULL;
+ uint8_t *buf = NULL;
+ uint_t len;
+ uint64_t intval, guid, nlevels, blksz, ibs, nblkptr;
+ boolean_t is_passphrase = B_FALSE;
+
+ ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj, FTAG, &ds);
+ if (ret != 0)
+ goto error;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT);
+
+ /*
+ * Read and check all the encryption values from the nvlist. We need
+ * all of the fields of a DSL Crypto Key, as well as a fully specified
+ * wrapping key.
+ */
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval);
+ if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS ||
+ intval <= ZIO_CRYPT_OFF) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval);
+ if (ret != 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /*
+ * If this is an incremental receive make sure the given key guid
+ * matches the one we already have.
+ */
+ if (ds->ds_dir->dd_crypto_obj != 0) {
+ ret = zap_lookup(mos, ds->ds_dir->dd_crypto_obj,
+ DSL_CRYPTO_KEY_GUID, 8, 1, &guid);
+ if (ret != 0)
+ goto error;
+
+ if (intval != guid) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ &buf, &len);
+ if (ret != 0 || len != MASTER_KEY_MAX_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ &buf, &len);
+ if (ret != 0 || len != SHA512_HMAC_KEYLEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &buf, &len);
+ if (ret != 0 || len != WRAPPING_IV_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &buf, &len);
+ if (ret != 0 || len != WRAPPING_MAC_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, "portable_mac", &buf, &len);
+ if (ret != 0 || len != ZIO_OBJSET_MAC_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
+ &intval);
+ if (ret != 0 || intval >= ZFS_KEYFORMAT_FORMATS ||
+ intval == ZFS_KEYFORMAT_NONE) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ is_passphrase = (intval == ZFS_KEYFORMAT_PASSPHRASE);
+
+ /*
+ * for raw receives we allow any number of pbkdf2iters since there
+ * won't be a chance for the user to change it.
+ */
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
+ &intval);
+ if (ret != 0 || (is_passphrase == (intval == 0))) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
+ &intval);
+ if (ret != 0 || (is_passphrase == (intval == 0))) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* raw receives also need info about the structure of the metadnode */
+ ret = nvlist_lookup_uint64(nvl, "mdn_checksum", &intval);
+ if (ret != 0 || intval >= ZIO_CHECKSUM_LEGACY_FUNCTIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_compress", &intval);
+ if (ret != 0 || intval >= ZIO_COMPRESS_LEGACY_FUNCTIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_nlevels", &nlevels);
+ if (ret != 0 || nlevels > DN_MAX_LEVELS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_blksz", &blksz);
+ if (ret != 0 || blksz < SPA_MINBLOCKSIZE) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else if (blksz > spa_maxblocksize(tx->tx_pool->dp_spa)) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_indblkshift", &ibs);
+ if (ret != 0 || ibs < DN_MIN_INDBLKSHIFT ||
+ ibs > DN_MAX_INDBLKSHIFT) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_nblkptr", &nblkptr);
+ if (ret != 0 || nblkptr != DN_MAX_NBLKPTR) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ ret = dmu_objset_from_ds(ds, &os);
+ if (ret != 0)
+ goto error;
+
+ /*
+	 * User accounting is not portable and must be done with the keys
+	 * loaded. Therefore, whenever we do any kind of receive the user
+	 * accounting must not be present.
+ */
+ ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+ ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
+
+ mdn = DMU_META_DNODE(os);
+
+ /*
+	 * If we already created the objset, make sure its unchangeable
+ * properties match the ones received in the nvlist.
+ */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) &&
+ (mdn->dn_nlevels != nlevels || mdn->dn_datablksz != blksz ||
+ mdn->dn_indblkshift != ibs || mdn->dn_nblkptr != nblkptr)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+
+error:
+ if (ds != NULL)
+ dsl_dataset_rele(ds, FTAG);
+ return (ret);
+}
+
+static void
+dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_crypto_recv_key_arg_t *dcrka = arg;
+ uint64_t dsobj = dcrka->dcrka_dsobj;
+ nvlist_t *nvl = dcrka->dcrka_nvl;
+ dsl_pool_t *dp = tx->tx_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ dnode_t *mdn;
+ uint8_t *keydata, *hmac_keydata, *iv, *mac, *portable_mac;
+ uint_t len;
+ uint64_t rddobj, one = 1;
+ uint64_t crypt, guid, keyformat, iters, salt;
+ uint64_t compress, checksum, nlevels, blksz, ibs;
+ char *keylocation = "prompt";
+
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ mdn = DMU_META_DNODE(os);
+
+ /* lookup the values we need to create the DSL Crypto Key and objset */
+ crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE);
+ guid = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID);
+ keyformat = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
+ iters = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
+ salt = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ &keydata, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ &hmac_keydata, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, "portable_mac", &portable_mac,
+ &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &iv, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &mac, &len));
+ compress = fnvlist_lookup_uint64(nvl, "mdn_compress");
+ checksum = fnvlist_lookup_uint64(nvl, "mdn_checksum");
+ nlevels = fnvlist_lookup_uint64(nvl, "mdn_nlevels");
+ blksz = fnvlist_lookup_uint64(nvl, "mdn_blksz");
+ ibs = fnvlist_lookup_uint64(nvl, "mdn_indblkshift");
+
+ /* if we haven't created an objset for the ds yet, do that now */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
+ (void) dmu_objset_create_impl_dnstats(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), dcrka->dcrka_ostype, nlevels,
+ blksz, ibs, tx);
+ }
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /*
+ * Set the portable MAC. The local MAC will always be zero since the
+ * incoming data will all be portable and user accounting will be
+ * deferred until the next mount. Afterwards, flag the os to be
+ * written out raw next time.
+ */
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+ bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ os->os_next_write_raw = B_TRUE;
+
+ /* set metadnode compression and checksum */
+ mdn->dn_compress = compress;
+ mdn->dn_checksum = checksum;
+ dsl_dataset_dirty(ds, tx);
+
+ /* if this is a new dataset setup the DSL Crypto Key. */
+ if (ds->ds_dir->dd_crypto_obj == 0) {
+ /* zapify the dsl dir so we can add the key object to it */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_zapify(ds->ds_dir, tx);
+
+ /* create the DSL Crypto Key on disk and activate the feature */
+ ds->ds_dir->dd_crypto_obj = zap_create(mos,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_dir->dd_crypto_obj, DSL_CRYPTO_KEY_REFCOUNT,
+ sizeof (uint64_t), 1, &one, tx));
+
+ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, tx);
+ ds->ds_feature_inuse[SPA_FEATURE_ENCRYPTION] = B_TRUE;
+
+ /* save the dd_crypto_obj on disk */
+ VERIFY0(zap_add(mos, ds->ds_dir->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1,
+ &ds->ds_dir->dd_crypto_obj, tx));
+
+ /*
+ * Set the keylocation to prompt by default. If keylocation
+		 * has been provided via the properties, this will be overridden
+ * later.
+ */
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
+ keylocation, tx);
+
+ rddobj = ds->ds_dir->dd_object;
+ } else {
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj));
+ }
+
+ /* sync the key data to the ZAP object on disk */
+ dsl_crypto_key_sync_impl(mos, ds->ds_dir->dd_crypto_obj, crypt,
+ rddobj, guid, iv, mac, keydata, hmac_keydata, keyformat, salt,
+ iters, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * This function is used to sync an nvlist representing a DSL Crypto Key and
+ * the associated encryption parameters. The key will be written exactly as is
+ * without wrapping it.
+ */
+int
+dsl_crypto_recv_key(const char *poolname, uint64_t dsobj,
+ dmu_objset_type_t ostype, nvlist_t *nvl)
+{
+ dsl_crypto_recv_key_arg_t dcrka;
+
+ dcrka.dcrka_dsobj = dsobj;
+ dcrka.dcrka_nvl = nvl;
+ dcrka.dcrka_ostype = ostype;
+
+ return (dsl_sync_task(poolname, dsl_crypto_recv_key_check,
+ dsl_crypto_recv_key_sync, &dcrka, 1, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, nvlist_t **nvl_out)
+{
+ int ret;
+ objset_t *os;
+ dnode_t *mdn;
+ uint64_t rddobj;
+ nvlist_t *nvl = NULL;
+ uint64_t dckobj = ds->ds_dir->dd_crypto_obj;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t crypt = 0, guid = 0, format = 0, iters = 0, salt = 0;
+ uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
+ uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+
+ ASSERT(dckobj != 0);
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ mdn = DMU_META_DNODE(os);
+
+ ret = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
+ if (ret != 0)
+ goto error;
+
+ /* lookup values from the DSL Crypto Key */
+ ret = dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, raw_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, raw_hmac_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac);
+ if (ret != 0)
+ goto error;
+
+ /* lookup wrapping key properties */
+ ret = zap_lookup(dp->dp_meta_objset, dckobj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &format);
+ if (ret != 0)
+ goto error;
+
+ if (format == ZFS_KEYFORMAT_PASSPHRASE) {
+ ret = zap_lookup(dp->dp_meta_objset, dckobj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(dp->dp_meta_objset, dckobj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt);
+ if (ret != 0)
+ goto error;
+ }
+
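+
+	/* pack the key material and metadnode info into the output nvlist */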
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, crypt);
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_GUID, guid);
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ raw_keydata, MASTER_KEY_MAX_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ raw_hmac_keydata, SHA512_HMAC_KEYLEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_IV, iv,
+ WRAPPING_IV_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, mac,
+ WRAPPING_MAC_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, "portable_mac",
+ os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN));
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), format);
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters);
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt);
+ fnvlist_add_uint64(nvl, "mdn_checksum", mdn->dn_checksum);
+ fnvlist_add_uint64(nvl, "mdn_compress", mdn->dn_compress);
+ fnvlist_add_uint64(nvl, "mdn_nlevels", mdn->dn_nlevels);
+ fnvlist_add_uint64(nvl, "mdn_blksz", mdn->dn_datablksz);
+ fnvlist_add_uint64(nvl, "mdn_indblkshift", mdn->dn_indblkshift);
+ fnvlist_add_uint64(nvl, "mdn_nblkptr", mdn->dn_nblkptr);
+
+ *nvl_out = nvl;
+ return (0);
+
+error:
+ nvlist_free(nvl);
+
+ *nvl_out = NULL;
+ return (ret);
+}
+
+uint64_t
+dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,
+ dmu_tx_t *tx)
+{
+ dsl_crypto_key_t dck;
+ uint64_t one = 1;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(crypt, >, ZIO_CRYPT_OFF);
+
+ /* create the DSL Crypto Key ZAP object */
+ dck.dck_obj = zap_create(tx->tx_pool->dp_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+
+ /* fill in the key (on the stack) and sync it to disk */
+ dck.dck_wkey = wkey;
+ VERIFY0(zio_crypt_key_init(crypt, &dck.dck_key));
+
+ dsl_crypto_key_sync(&dck, tx);
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
+ DSL_CRYPTO_KEY_REFCOUNT, sizeof (uint64_t), 1, &one, tx));
+
+ zio_crypt_key_destroy(&dck.dck_key);
+ bzero(&dck.dck_key, sizeof (zio_crypt_key_t));
+
+ return (dck.dck_obj);
+}
+
+uint64_t
+dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx)
+{
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(zap_increment(mos, origindd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_REFCOUNT, 1, tx));
+
+ return (origindd->dd_crypto_obj);
+}
+
+void
+dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx)
+{
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ uint64_t refcnt;
+
+ /* Decrement the refcount, destroy if this is the last reference */
+ VERIFY0(zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
+ sizeof (uint64_t), 1, &refcnt));
+
+ if (refcnt != 1) {
+ VERIFY0(zap_increment(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
+ -1, tx));
+ } else {
+ VERIFY0(zap_destroy(mos, dckobj, tx));
+ }
+}
+
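+/* add encryption and key properties for this dataset to the nvlist */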
+void
+dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ uint64_t intval;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_dir_t *enc_root;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+
+ if (dd->dd_crypto_obj == 0)
+ return;
+
+ intval = dsl_dataset_get_keystatus(dd);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYSTATUS, intval);
+
+ if (dsl_dir_get_crypt(dd, &intval) == 0)
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_ENCRYPTION, intval);
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_GUID, 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEY_GUID, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYFORMAT, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_SALT, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_ITERS, intval);
+ }
+
+ if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) {
+ VERIFY0(dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG,
+ &enc_root));
+ dsl_dir_name(enc_root, buf);
+ dsl_dir_rele(enc_root, FTAG);
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ENCRYPTION_ROOT, buf);
+ }
+}
+
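+/* look up the key for dsobj and return the current salt of its master key */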
+int
+spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
+ if (ret != 0)
+ goto error;
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ return (ret);
+}
+
+/*
+ * Objset blocks are a special case for MAC generation. These blocks have two
+ * 256-bit MACs which are embedded within the block itself, rather than a
+ * single 128-bit MAC. As a result, this function handles encoding and decoding
+ * the MACs on its own, unlike other functions in this file.
+ */
+int
+spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
+ abd_t *abd, uint_t datalen, boolean_t byteswap)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ void *buf = abd_borrow_buf_copy(abd, datalen);
+ objset_phys_t *osp = buf;
+ uint8_t portable_mac[ZIO_OBJSET_MAC_LEN];
+ uint8_t local_mac[ZIO_OBJSET_MAC_LEN];
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* calculate both HMACs */
+ ret = zio_crypt_do_objset_hmacs(&dck->dck_key, buf, datalen,
+ byteswap, portable_mac, local_mac);
+ if (ret != 0)
+ goto error;
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /* if we are generating encode the HMACs in the objset_phys_t */
+ if (generate) {
+ bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ abd_return_buf_copy(abd, buf, datalen);
+ return (0);
+ }
+
+ if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 ||
+ bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) {
+ abd_return_buf(abd, buf, datalen);
+ return (SET_ERROR(ECKSUM));
+ }
+
+ abd_return_buf(abd, buf, datalen);
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_return_buf(abd, buf, datalen);
+ return (ret);
+}
+
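+/*
+ * Compute an HMAC of the data in abd using the dataset's key. If generate
+ * is true the truncated MAC is copied into mac; otherwise it is compared
+ * against mac and ECKSUM is returned on a mismatch.
+ */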
+int
+spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd,
+ uint_t datalen, uint8_t *mac)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t *buf = abd_borrow_buf_copy(abd, datalen);
+ uint8_t digestbuf[ZIO_DATA_MAC_LEN];
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* perform the hmac */
+ ret = zio_crypt_do_hmac(&dck->dck_key, buf, datalen, digestbuf);
+ if (ret != 0)
+ goto error;
+
+ abd_return_buf(abd, buf, datalen);
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /*
+ * Truncate and fill in mac buffer if we were asked to generate a MAC.
+ * Otherwise verify that the MAC matched what we expected.
+ */
+ if (generate) {
+ bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_return_buf(abd, buf, datalen);
+ return (ret);
+}
+
+/*
+ * This function serves as a multiplexer for encryption and decryption of
+ * all blocks (except the L2ARC). For encryption, it will populate the IV,
+ * salt, MAC, and cabd (the ciphertext). On decryption it will simply use
+ * these fields to populate pabd (the plaintext).
+ */
+int
+spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, uint64_t dsobj,
+ const blkptr_t *bp, uint64_t txgid, uint_t datalen, abd_t *pabd,
+ abd_t *cabd, uint8_t *iv, uint8_t *mac, uint8_t *salt, boolean_t *no_crypt)
+{
+ int ret;
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t *plainbuf = NULL, *cipherbuf = NULL;
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ return (ret);
+
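+	/*
+	 * Borrow linear buffers from the ABDs. The source buffer must be
+	 * copied in, while the destination buffer only needs to be allocated.
+	 */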
+ if (encrypt) {
+ plainbuf = abd_borrow_buf_copy(pabd, datalen);
+ cipherbuf = abd_borrow_buf(cabd, datalen);
+ } else {
+ plainbuf = abd_borrow_buf(pabd, datalen);
+ cipherbuf = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ /*
+ * Both encryption and decryption functions need a salt for key
+ * generation and an IV. When encrypting a non-dedup block, we
+ * generate the salt and IV randomly to be stored by the caller. Dedup
+ * blocks perform a (more expensive) HMAC of the plaintext to obtain
+ * the salt and the IV. ZIL blocks have their salt and IV generated
+ * at allocation time in zio_alloc_zil(). On decryption, we simply use
+ * the provided values.
+ */
+ if (encrypt && ot != DMU_OT_INTENT_LOG && !BP_GET_DEDUP(bp)) {
+ ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_crypt_generate_iv(iv);
+ if (ret != 0)
+ goto error;
+ } else if (encrypt && BP_GET_DEDUP(bp)) {
+ ret = zio_crypt_generate_iv_salt_dedup(&dck->dck_key,
+ plainbuf, datalen, iv, salt);
+ if (ret != 0)
+ goto error;
+ }
+
+ /* call lower level function to perform encryption / decryption */
+ ret = zio_do_crypt_data(encrypt, &dck->dck_key, salt, ot, iv, mac,
+ datalen, BP_SHOULD_BYTESWAP(bp), plainbuf, cipherbuf, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, plainbuf, datalen);
+ abd_return_buf_copy(cabd, cipherbuf, datalen);
+ } else {
+ abd_return_buf_copy(pabd, plainbuf, datalen);
+ abd_return_buf(cabd, cipherbuf, datalen);
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (0);
+
+error:
+ if (encrypt) {
+ /* zero out any state we might have changed while encrypting */
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ abd_return_buf(pabd, plainbuf, datalen);
+ abd_return_buf_copy(cabd, cipherbuf, datalen);
+ } else {
+ abd_return_buf_copy(pabd, plainbuf, datalen);
+ abd_return_buf(cabd, cipherbuf, datalen);
+ }
+
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (ret);
+}
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index bd03b4868..478995a4e 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -386,8 +386,8 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
}
int
-dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
- dsl_dataset_t **dsp)
+dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
+ ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
{
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
@@ -548,11 +548,27 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
*dsp = ds;
+
+ if ((flags & DS_HOLD_FLAG_DECRYPT) && ds->ds_dir->dd_crypto_obj != 0) {
+ err = spa_keystore_create_mapping(dp->dp_spa, ds, ds);
+ if (err != 0) {
+ dsl_dataset_rele(ds, tag);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
return (0);
}
int
-dsl_dataset_hold(dsl_pool_t *dp, const char *name,
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_obj_flags(dp, dsobj, 0, tag, dsp));
+}
+
+int
+dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
@@ -568,7 +584,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+ err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
else
err = SET_ERROR(ENOENT);
@@ -577,16 +593,18 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
dsl_dataset_t *snap_ds;
if (*snapname++ != '@') {
- dsl_dataset_rele(ds, tag);
+ dsl_dataset_rele_flags(ds, flags, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}
dprintf("looking for snapshot '%s'\n", snapname);
err = dsl_dataset_snap_lookup(ds, snapname, &obj);
- if (err == 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
- dsl_dataset_rele(ds, tag);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
+ &snap_ds);
+ }
+ dsl_dataset_rele_flags(ds, flags, tag);
if (err == 0) {
mutex_enter(&snap_ds->ds_lock);
@@ -604,14 +622,21 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
}
int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
+dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
+}
+
+int
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
- int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+ int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
- dsl_dataset_rele(*dsp, tag);
+ dsl_dataset_rele_flags(*dsp, flags, tag);
*dsp = NULL;
return (SET_ERROR(EBUSY));
}
@@ -619,14 +644,14 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
}
int
-dsl_dataset_own(dsl_pool_t *dp, const char *name,
+dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
- int err = dsl_dataset_hold(dp, name, tag, dsp);
+ int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
- dsl_dataset_rele(*dsp, tag);
+ dsl_dataset_rele_flags(*dsp, flags, tag);
return (SET_ERROR(EBUSY));
}
return (0);
@@ -707,13 +732,25 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
}
void
-dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
{
+ if (ds->ds_dir != NULL && ds->ds_dir->dd_crypto_obj != 0 &&
+ (flags & DS_HOLD_FLAG_DECRYPT)) {
+ (void) spa_keystore_remove_mapping(ds->ds_dir->dd_pool->dp_spa,
+ ds->ds_object, ds);
+ }
+
dmu_buf_rele(ds->ds_dbuf, tag);
}
void
-dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+ dsl_dataset_rele_flags(ds, 0, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
{
ASSERT3P(ds->ds_owner, ==, tag);
ASSERT(ds->ds_dbuf != NULL);
@@ -722,7 +759,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
ds->ds_owner = NULL;
mutex_exit(&ds->ds_lock);
dsl_dataset_long_rele(ds, tag);
- dsl_dataset_rele(ds, tag);
+ dsl_dataset_rele_flags(ds, flags, tag);
}
boolean_t
@@ -751,7 +788,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds)
return (rv);
}
-static void
+void
dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
@@ -781,7 +818,7 @@ dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
- uint64_t flags, dmu_tx_t *tx)
+ dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)
{
dsl_pool_t *dp = dd->dd_pool;
dmu_buf_t *dbuf;
@@ -881,6 +918,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
}
}
+ /* handle encryption */
+ dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);
+
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
@@ -903,6 +943,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
zio_t *zio;
bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ if (os->os_encrypted)
+ os->os_next_write_raw = B_TRUE;
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dsl_dataset_sync(ds, zio, tx);
@@ -916,7 +958,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
- dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
+ dsl_dataset_t *origin, uint64_t flags, cred_t *cr,
+ dsl_crypto_params_t *dcp, dmu_tx_t *tx)
{
dsl_pool_t *dp = pdd->dd_pool;
uint64_t dsobj, ddobj;
@@ -928,7 +971,7 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
- dsobj = dsl_dataset_create_sync_dd(dd, origin,
+ dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,
flags & ~DS_CREATE_FLAG_NODIRTY, tx);
dsl_deleg_set_create_perms(dd, tx, cr);
@@ -1821,6 +1864,10 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
DS_FIELD_RESUME_COMPRESSOK) == 0) {
fnvlist_add_boolean(token_nv, "compressok");
}
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_RAWOK) == 0) {
+ fnvlist_add_boolean(token_nv, "rawok");
+ }
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
@@ -1851,6 +1898,7 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
+ int err;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
uint64_t refd, avail, uobjs, aobjs, ratio;
@@ -1901,12 +1949,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
ds->ds_userrefs);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
+ dsl_dataset_crypt_stats(ds, nv);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
uint64_t written, comp, uncomp;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_dataset_t *prev;
- int err;
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
@@ -2340,7 +2388,7 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
fnvlist_add_string(ddra->ddra_result, "target", namebuf);
cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
- ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
+ ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
@@ -2427,6 +2475,23 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EXDEV));
}
+ snap = list_head(&ddpa->shared_snaps);
+ if (snap == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ origin_ds = snap->ds;
+
+ /*
+ * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
+ * When doing a promote we must make sure the encryption root for
+ * both the target and the target's origin does not change to avoid
+	 * needing to rewrap encryption keys.
+ */
+ err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
+ if (err != 0)
+ goto out;
+
/*
* Compute and check the amount of space to transfer. Since this is
* so expensive, don't do the preliminary check.
@@ -2436,13 +2501,6 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
return (0);
}
- snap = list_head(&ddpa->shared_snaps);
- if (snap == NULL) {
- err = SET_ERROR(ENOENT);
- goto out;
- }
- origin_ds = snap->ds;
-
/* compute origin's new unique space */
snap = list_tail(&ddpa->clone_snaps);
ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
@@ -2611,6 +2669,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
NULL, FTAG, &odd));
+ dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);
+
/* change origin's next snap */
dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
@@ -3692,11 +3752,14 @@ MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
#endif
EXPORT_SYMBOL(dsl_dataset_hold);
+EXPORT_SYMBOL(dsl_dataset_hold_flags);
EXPORT_SYMBOL(dsl_dataset_hold_obj);
+EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
EXPORT_SYMBOL(dsl_dataset_own);
EXPORT_SYMBOL(dsl_dataset_own_obj);
EXPORT_SYMBOL(dsl_dataset_name);
EXPORT_SYMBOL(dsl_dataset_rele);
+EXPORT_SYMBOL(dsl_dataset_rele_flags);
EXPORT_SYMBOL(dsl_dataset_disown);
EXPORT_SYMBOL(dsl_dataset_tryown);
EXPORT_SYMBOL(dsl_dataset_create_sync);
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index d980f7d1f..1d4716028 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -598,8 +598,8 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
ka.ds = ds;
ka.tx = tx;
VERIFY0(traverse_dataset(ds,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
- kill_blkptr, &ka));
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST |
+ TRAVERSE_NO_DECRYPT, kill_blkptr, &ka));
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == 0);
}
@@ -706,6 +706,11 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
for (t = 0; t < DD_USED_NUM; t++)
ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
+ if (dd->dd_crypto_obj != 0) {
+ dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx);
+ (void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
+ }
+
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
@@ -951,7 +956,8 @@ dsl_destroy_head(const char *name)
* remove the objects from open context so that the txg sync
* is not too long.
*/
- error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
+ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE,
+ FTAG, &os);
if (error == 0) {
uint64_t obj;
uint64_t prev_snap_txg =
@@ -963,7 +969,7 @@ dsl_destroy_head(const char *name)
(void) dmu_free_long_object(os, obj);
/* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0);
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_FALSE, FTAG);
}
}
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index a3ef5896a..68791fe74 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -159,6 +159,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
+ dmu_object_info_t doi;
int err;
ASSERT(dsl_pool_config_held(dp));
@@ -167,14 +168,11 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (err != 0)
return (err);
dd = dmu_buf_get_user(dbuf);
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbuf, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
- ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
- }
-#endif
+
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
+
if (dd == NULL) {
dsl_dir_t *winner;
@@ -182,6 +180,15 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_object = ddobj;
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;
+
+ if (dsl_dir_is_zapified(dd) &&
+ zap_contains(dp->dp_meta_objset, ddobj,
+ DD_FIELD_CRYPTO_KEY_OBJ) == 0) {
+ VERIFY0(zap_lookup(dp->dp_meta_objset,
+ ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
+ sizeof (uint64_t), 1, &dd->dd_crypto_obj));
+ }
+
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
dsl_prop_init(dd);
@@ -918,6 +925,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
@@ -935,6 +943,8 @@ dsl_dir_is_clone(dsl_dir_t *dd)
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
+ uint64_t intval;
+
mutex_enter(&dd->dd_lock);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
dsl_dir_phys(dd)->dd_used_bytes);
@@ -962,18 +972,17 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
mutex_exit(&dd->dd_lock);
if (dsl_dir_is_zapified(dd)) {
- uint64_t count;
objset_t *os = dd->dd_pool->dp_meta_objset;
if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
- sizeof (count), 1, &count) == 0) {
+ sizeof (intval), 1, &intval) == 0) {
dsl_prop_nvlist_add_uint64(nv,
- ZFS_PROP_FILESYSTEM_COUNT, count);
+ ZFS_PROP_FILESYSTEM_COUNT, intval);
}
if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
- sizeof (count), 1, &count) == 0) {
+ sizeof (intval), 1, &intval) == 0) {
dsl_prop_nvlist_add_uint64(nv,
- ZFS_PROP_SNAPSHOT_COUNT, count);
+ ZFS_PROP_SNAPSHOT_COUNT, intval);
}
}
@@ -1814,6 +1823,14 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
}
}
+ /* check for encryption errors */
+ error = dsl_dir_rename_crypt_check(dd, newparent);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EACCES));
+ }
+
/* no rename into our descendant */
if (closest_common_ancestor(dd, newparent) == dd) {
dsl_dir_rele(newparent, FTAG);
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index c16708048..a28be34fb 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -359,7 +359,8 @@ dsl_pool_close(dsl_pool_t *dp)
}
dsl_pool_t *
-dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
+ uint64_t txg)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
@@ -373,6 +374,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
/* create and open the MOS (meta-objset) */
dp->dp_meta_objset = dmu_objset_create_impl(spa,
NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
+ spa->spa_meta_objset = dp->dp_meta_objset;
/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -410,8 +412,19 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);
+ /*
+ * Some features may be needed when creating the root dataset, so we
+ * create the feature objects here.
+ */
+ if (spa_version(spa) >= SPA_VERSION_FEATURES)
+ spa_feature_create_zap_objects(spa, tx);
+
+ if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT)
+ spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);
+
/* create the root dataset */
- obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
/* create the root objset */
VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
@@ -865,7 +878,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
/* create the origin dir, ds, & snap-ds */
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
- NULL, 0, kcred, tx);
+ NULL, 0, kcred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c
index bd32a4d95..57b8eb794 100644
--- a/module/zfs/dsl_prop.c
+++ b/module/zfs/dsl_prop.c
@@ -963,7 +963,7 @@ typedef enum dsl_prop_getflags {
DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
DSL_PROP_GET_LOCAL = 0x4, /* local properties */
- DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
+ DSL_PROP_GET_RECEIVED = 0x8, /* received properties */
} dsl_prop_getflags_t;
static int
@@ -1130,6 +1130,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
if (err)
break;
}
+
out:
if (err) {
nvlist_free(*nvp);
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 9399ec71a..d8e318895 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -683,7 +683,7 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
zilog = zil_alloc(dp->dp_meta_objset, zh);
(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
- claim_txg);
+ claim_txg, B_FALSE);
zil_free(zilog);
}
@@ -695,6 +695,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
{
zbookmark_phys_t czb;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
if (zfs_no_scrub_prefetch)
return;
@@ -703,11 +704,16 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
return;
+ if (BP_IS_PROTECTED(bp)) {
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb);
}
static boolean_t
@@ -793,6 +799,11 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_buf_t *buf;
+ if (BP_IS_PROTECTED(bp)) {
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index cb86c6200..c519e933b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1169,6 +1169,8 @@ spa_activate(spa_t *spa, int mode)
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
+ spa_keystore_init(&spa->spa_keystore);
+
/*
* This taskq is used to perform zvol-minor-related tasks
* asynchronously. This has several advantages, including easy
@@ -1246,10 +1248,11 @@ spa_deactivate(spa_t *spa)
* still have errors left in the queues. Empty them just in case.
*/
spa_errlog_drain(spa);
-
avl_destroy(&spa->spa_errlist_scrub);
avl_destroy(&spa->spa_errlist_last);
+ spa_keystore_fini(&spa->spa_keystore);
+
spa->spa_state = POOL_STATE_UNINITIALIZED;
mutex_enter(&spa->spa_proc_lock);
@@ -2094,8 +2097,8 @@ spa_load_verify(spa_t *spa)
if (spa_load_verify_metadata) {
error = traverse_pool(spa, spa->spa_verify_min_txg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
- spa_load_verify_cb, rio);
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+ TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
}
(void) zio_wait(rio);
@@ -2301,7 +2304,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
spa->spa_loaded_ts.tv_nsec = 0;
}
if (error != EBADF) {
- zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
@@ -3979,11 +3982,27 @@ spa_l2cache_drop(spa_t *spa)
}
/*
+ * Verify encryption parameters for spa creation. If we are encrypting, we must
+ * have the encryption feature flag enabled.
+ */
+static int
+spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
+ boolean_t has_encryption)
+{
+ if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
+ !has_encryption)
+ return (SET_ERROR(ENOTSUP));
+
+ return (dmu_objset_create_crypt_check(NULL, dcp));
+}
+
+/*
* Pool Creation
*/
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
- nvlist_t *zplprops)
+ nvlist_t *zplprops, dsl_crypto_params_t *dcp)
{
spa_t *spa;
char *altroot = NULL;
@@ -3994,8 +4013,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version, obj;
+ uint64_t version, obj, root_dsobj = 0;
boolean_t has_features;
+ boolean_t has_encryption;
+ spa_feature_t feat;
+ char *feat_name;
nvpair_t *elem;
int c, i;
char *poolname;
@@ -4038,10 +4060,28 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
has_features = B_FALSE;
+ has_encryption = B_FALSE;
for (elem = nvlist_next_nvpair(props, NULL);
elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
- if (zpool_prop_feature(nvpair_name(elem)))
+ if (zpool_prop_feature(nvpair_name(elem))) {
has_features = B_TRUE;
+
+ feat_name = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(feat_name, &feat));
+ if (feat == SPA_FEATURE_ENCRYPTION)
+ has_encryption = B_TRUE;
+ }
+ }
+
+ /* verify encryption params, if they were provided */
+ if (dcp != NULL) {
+ error = spa_create_check_encryption_params(dcp, has_encryption);
+ if (error != 0) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
}
if (has_features || nvlist_lookup_uint64(props,
@@ -4131,8 +4171,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
}
spa->spa_is_initializing = B_TRUE;
- spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
- spa->spa_meta_objset = dp->dp_meta_objset;
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
spa->spa_is_initializing = B_FALSE;
/*
@@ -4157,9 +4196,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
cmn_err(CE_PANIC, "failed to add pool config");
}
- if (spa_version(spa) >= SPA_VERSION_FEATURES)
- spa_feature_create_zap_objects(spa, tx);
-
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
sizeof (uint64_t), 1, &version, tx) != 0) {
@@ -4220,15 +4256,26 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
dmu_tx_commit(tx);
+ /*
+ * If the root dataset is encrypted we will need to create key mappings
+ * for the zio layer before we start to write any data to disk and hold
+ * them until after the first txg has been synced. Waiting for the first
+ * transaction to complete also ensures that our bean counters are
+ * appropriately updated.
+ */
+ if (dp->dp_root_dir->dd_crypto_obj != 0) {
+ root_dsobj = dsl_dir_phys(dp->dp_root_dir)->dd_head_dataset_obj;
+ VERIFY0(spa_keystore_create_mapping_impl(spa, root_dsobj,
+ dp->dp_root_dir, FTAG));
+ }
+
spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
+ txg_sync_start(dp);
mmp_thread_start(spa);
+ txg_wait_synced(dp, txg);
- /*
- * We explicitly wait for the first transaction to complete so that our
- * bean counters are appropriately updated.
- */
- txg_wait_synced(spa->spa_dsl_pool, txg);
+ if (dp->dp_root_dir->dd_crypto_obj != 0)
+ VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG));
spa_config_sync(spa, B_FALSE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 5b792b868..7e712d368 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -305,7 +305,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
*/
if (target->spa_ccw_fail_time == 0) {
zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
- target, NULL, NULL, 0, 0);
+ target, NULL, NULL, NULL, 0, 0);
}
target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 3c8aa543b..1299faa58 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -90,9 +90,8 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
* during spa_errlog_sync().
*/
void
-spa_log_error(spa_t *spa, zio_t *zio)
+spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
{
- zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
spa_error_entry_t search;
spa_error_entry_t *new;
avl_tree_t *tree;
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
index 73571c032..9d6c5ca5a 100644
--- a/module/zfs/spa_history.c
+++ b/module/zfs/spa_history.c
@@ -385,11 +385,16 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
{
int err = 0;
dmu_tx_t *tx;
- nvlist_t *nvarg;
+ nvlist_t *nvarg, *in_nvl = NULL;
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
return (SET_ERROR(EINVAL));
+ err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl);
+ if (err == 0) {
+ (void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS);
+ }
+
tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 3787e010f..14723a1ca 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1414,6 +1414,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
char type[256];
char *checksum = NULL;
char *compress = NULL;
+ char *crypt_type = NULL;
if (bp != NULL) {
if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
@@ -1427,6 +1428,15 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
+ if (BP_IS_ENCRYPTED(bp)) {
+ crypt_type = "encrypted";
+ } else if (BP_IS_AUTHENTICATED(bp)) {
+ crypt_type = "authenticated";
+ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
+ crypt_type = "indirect-MAC";
+ } else {
+ crypt_type = "unencrypted";
+ }
if (!BP_IS_EMBEDDED(bp)) {
checksum =
zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
@@ -1435,7 +1445,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
}
SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
- compress);
+ crypt_type, compress);
}
void
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index df07d893d..4daba421f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1050,7 +1050,7 @@ vdev_probe_done(zio_t *zio)
} else {
ASSERT(zio->io_error != 0);
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
- spa, vd, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1397,7 +1397,7 @@ vdev_open(vdev_t *vd)
if (ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
- spa, vd, NULL, 0, 0);
+ spa, vd, NULL, NULL, 0, 0);
}
vd->vdev_max_asize = max_asize;
@@ -3590,7 +3590,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
- zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
+ zfs_ereport_post(class, spa, vd, NULL, NULL,
+ save_state, 0);
}
/* Erase any notion of persistent removed state */
@@ -3758,7 +3759,7 @@ vdev_deadman(vdev_t *vd)
fio->io_timestamp, delta,
vq->vq_io_complete_ts);
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
- spa, vd, fio, 0, 0);
+ spa, vd, &fio->io_bookmark, fio, 0, 0);
}
}
mutex_exit(&vq->vq_lock);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index ba850b4f8..65b143091 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1766,9 +1766,9 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data,
- &zbc);
+ zfs_ereport_post_checksum(zio->io_spa, vd,
+ &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
+ rc->rc_abd, bad_data, &zbc);
}
}
@@ -2256,7 +2256,8 @@ vdev_raidz_io_done(zio_t *zio)
zfs_ereport_start_checksum(
zio->io_spa,
vd->vdev_child[rc->rc_devidx],
- zio, rc->rc_offset, rc->rc_size,
+ &zio->io_bookmark, zio,
+ rc->rc_offset, rc->rc_size,
(void *)(uintptr_t)c, &zbc);
}
}
diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c
index d8220aa23..f708b286a 100644
--- a/module/zfs/zfeature.c
+++ b/module/zfs/zfeature.c
@@ -424,8 +424,8 @@ spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
* We create feature flags ZAP objects in two instances: during pool
* creation and during pool upgrade.
*/
- ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
- tx->tx_txg == TXG_INITIAL));
+ ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c
index 0e7203ea6..ae9ae33bc 100644
--- a/module/zfs/zfs_acl.c
+++ b/module/zfs/zfs_acl.c
@@ -2204,7 +2204,7 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
* placed into the working_mode, giving the caller a mask of denied
* accesses. Returns:
* 0 if all AoI granted
- * EACCESS if the denied mask is non-zero
+ * EACCES if the denied mask is non-zero
* other error if abnormal failure (e.g., IO error)
*
* A secondary usage of the function is to determine if any of the
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 3986b3959..9e6c12e00 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -142,8 +142,8 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
static void
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
- const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
- uint64_t stateoroffset, uint64_t size)
+ const char *subclass, spa_t *spa, vdev_t *vd, zbookmark_phys_t *zb,
+ zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
nvlist_t *ereport, *detector;
@@ -413,24 +413,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
DATA_TYPE_UINT64, zio->io_size, NULL);
}
-
- /*
- * Payload for I/Os with corresponding logical information.
- */
- if (zio->io_logical != NULL)
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
- DATA_TYPE_UINT64,
- zio->io_logical->io_bookmark.zb_objset,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
- DATA_TYPE_UINT64,
- zio->io_logical->io_bookmark.zb_object,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
- DATA_TYPE_INT64,
- zio->io_logical->io_bookmark.zb_level,
- FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
- DATA_TYPE_UINT64,
- zio->io_logical->io_bookmark.zb_blkid, NULL);
} else if (vd != NULL) {
/*
* If we have a vdev but no zio, this is a device fault, and the
@@ -442,6 +424,20 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, stateoroffset, NULL);
}
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64, zb->zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64, zb->zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT64, zb->zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64, zb->zb_blkid, NULL);
+
mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport;
@@ -771,8 +767,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
#endif
void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
- uint64_t stateoroffset, uint64_t size)
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
+ zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
#ifdef _KERNEL
nvlist_t *ereport = NULL;
@@ -781,8 +777,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
if (zfs_is_ratelimiting_event(subclass, vd))
return;
- zfs_ereport_start(&ereport, &detector,
- subclass, spa, vd, zio, stateoroffset, size);
+ zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
+ zb, zio, stateoroffset, size);
if (ereport == NULL)
return;
@@ -793,7 +789,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
}
void
-zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
{
@@ -823,7 +819,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
#ifdef _KERNEL
zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
- FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
if (report->zcr_ereport == NULL) {
zfs_ereport_free_checksum(report);
@@ -879,7 +875,7 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
void
-zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length,
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
{
@@ -888,8 +884,8 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
nvlist_t *detector = NULL;
zfs_ecksum_info_t *info;
- zfs_ereport_start(&ereport, &detector,
- FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+ zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
+ spa, vd, zb, zio, offset, length);
if (ereport == NULL)
return;
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 66311711c..9f32d00ac 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -34,7 +34,7 @@
* Copyright 2016 Toomas Soome <[email protected]>
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2017, loli10K <[email protected]>. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017 Datto Inc. All rights reserved.
* Copyright 2017 RackTop Systems.
*/
@@ -185,6 +185,7 @@
#include <sys/dsl_scan.h>
#include <sharefs/share.h>
#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
@@ -565,12 +566,12 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
* Try to own the dataset; abort if there is any error,
* (e.g., already mounted, in use, or other error).
*/
- error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
+ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE,
setsl_tag, &os);
if (error != 0)
return (SET_ERROR(EPERM));
- dmu_objset_disown(os, setsl_tag);
+ dmu_objset_disown(os, B_TRUE, setsl_tag);
if (new_default) {
needed_priv = PRIV_FILE_DOWNGRADE_SL;
@@ -1301,6 +1302,20 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (error);
}
+static int
+zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_LOAD_KEY, cr));
+}
+
+static int
+zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_CHANGE_KEY, cr));
+}
+
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
@@ -1462,7 +1477,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
if (zfsvfs->z_sb) {
deactivate_super(zfsvfs->z_sb);
} else {
- dmu_objset_disown(zfsvfs->z_os, zfsvfs);
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
zfsvfs_free(zfsvfs);
}
}
@@ -1474,6 +1489,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
nvlist_t *config, *props = NULL;
nvlist_t *rootprops = NULL;
nvlist_t *zplprops = NULL;
+ dsl_crypto_params_t *dcp = NULL;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)))
@@ -1488,6 +1504,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
if (props) {
nvlist_t *nvl = NULL;
+ nvlist_t *hidden_args = NULL;
uint64_t version = SPA_VERSION;
(void) nvlist_lookup_uint64(props,
@@ -1506,6 +1523,18 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
}
(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
}
+
+ (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS,
+ &hidden_args);
+ error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
+ rootprops, hidden_args, &dcp);
+ if (error != 0) {
+ nvlist_free(config);
+ nvlist_free(props);
+ return (error);
+ }
+ (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS);
+
VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
error = zfs_fill_zplprops_root(version, rootprops,
zplprops, NULL);
@@ -1513,7 +1542,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
goto pool_props_bad;
}
- error = spa_create(zc->zc_name, config, props, zplprops);
+ error = spa_create(zc->zc_name, config, props, zplprops, dcp);
/*
* Set the remaining root properties
@@ -1527,6 +1556,7 @@ pool_props_bad:
nvlist_free(zplprops);
nvlist_free(config);
nvlist_free(props);
+ dsl_crypto_params_free(dcp, !!error);
return (error);
}
@@ -1802,15 +1832,16 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc)
int error;
/* XXX reading from objset not owned */
- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
+ FTAG, &os)) != 0)
return (error);
if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dmu_objset_rele(os, FTAG);
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (SET_ERROR(EINVAL));
}
error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
sizeof (zc->zc_value));
- dmu_objset_rele(os, FTAG);
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (error);
}
@@ -1831,15 +1862,16 @@ zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
int error;
/* XXX reading from objset not owned */
- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
+ FTAG, &os)) != 0)
return (error);
if (dmu_objset_type(os) != DMU_OST_ZFS) {
- dmu_objset_rele(os, FTAG);
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (SET_ERROR(EINVAL));
}
error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
sizeof (zc->zc_value));
- dmu_objset_rele(os, FTAG);
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (error);
}
@@ -2385,7 +2417,8 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
{
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
- uint64_t intval;
+ uint64_t intval = 0;
+ char *strval = NULL;
int err = -1;
if (prop == ZPROP_INVAL) {
@@ -2401,10 +2434,12 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
&pair) == 0);
}
- if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
- return (-1);
-
- VERIFY(0 == nvpair_value_uint64(pair, &intval));
+ /* all special properties are numeric except for keylocation */
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ strval = fnvpair_value_string(pair);
+ } else {
+ intval = fnvpair_value_uint64(pair);
+ }
switch (prop) {
case ZFS_PROP_QUOTA:
@@ -2428,6 +2463,16 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
if (err == 0)
err = -1;
break;
+ case ZFS_PROP_KEYLOCATION:
+ err = dsl_crypto_can_set_keylocation(dsname, strval);
+
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
case ZFS_PROP_RESERVATION:
err = dsl_dir_set_reservation(dsname, source, intval);
break;
@@ -3156,6 +3201,8 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
* innvl: {
* "type" -> dmu_objset_type_t (int32)
* (optional) "props" -> { prop -> value }
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
* }
*
* outnvl: propname -> error code (int32)
@@ -3166,15 +3213,18 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
int error = 0;
zfs_creat_t zct = { 0 };
nvlist_t *nvprops = NULL;
+ nvlist_t *hidden_args = NULL;
void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
int32_t type32;
dmu_objset_type_t type;
boolean_t is_insensitive = B_FALSE;
+ dsl_crypto_params_t *dcp = NULL;
if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
return (SET_ERROR(EINVAL));
type = type32;
(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+ (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
switch (type) {
case DMU_OST_ZFS:
@@ -3240,9 +3290,18 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
}
}
+ error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops,
+ hidden_args, &dcp);
+ if (error != 0) {
+ nvlist_free(zct.zct_zplprops);
+ return (error);
+ }
+
error = dmu_objset_create(fsname, type,
- is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
+ is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct);
+
nvlist_free(zct.zct_zplprops);
+ dsl_crypto_params_free(dcp, !!error);
/*
* It would be nice to do this atomically.
@@ -3277,6 +3336,8 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "origin" -> name of origin snapshot
* (optional) "props" -> { prop -> value }
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
* }
*
* outputs:
@@ -3299,9 +3360,8 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
if (dataset_namecheck(origin_name, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
+
error = dmu_objset_clone(fsname, origin_name);
- if (error != 0)
- return (error);
/*
* It would be nice to do this atomically.
@@ -4160,7 +4220,11 @@ extract_delay_props(nvlist_t *props)
{
nvlist_t *delayprops;
nvpair_t *nvp, *tmp;
- static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
+ static const zfs_prop_t delayable[] = {
+ ZFS_PROP_REFQUOTA,
+ ZFS_PROP_KEYLOCATION,
+ 0
+ };
int i;
VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -4704,6 +4768,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
boolean_t embedok = (zc->zc_flags & 0x1);
boolean_t large_block_ok = (zc->zc_flags & 0x2);
boolean_t compressok = (zc->zc_flags & 0x4);
+ boolean_t rawok = (zc->zc_flags & 0x8);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4735,7 +4800,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
if (error != 0)
return (error);
- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj,
+ FTAG, &tosnap);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
@@ -4751,7 +4817,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
}
}
- error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ error = dmu_send_estimate(tosnap, fromsnap, compressok || rawok,
&zc->zc_objset_type);
if (fromsnap != NULL)
@@ -4765,7 +4831,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, embedok, large_block_ok, compressok,
+ zc->zc_fromobj, embedok, large_block_ok, compressok, rawok,
zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
@@ -5152,7 +5218,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
dmu_objset_refresh_ownership(zfsvfs->z_os,
- zfsvfs);
+ B_TRUE, zfsvfs);
error = zfs_resume_fs(zfsvfs, ds);
}
}
@@ -5161,12 +5227,12 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
deactivate_super(zfsvfs->z_sb);
} else {
/* XXX kind of reading contents without owning */
- error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
if (error != 0)
return (error);
error = dmu_objset_userspace_upgrade(os);
- dmu_objset_rele(os, FTAG);
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
}
return (error);
@@ -5185,7 +5251,7 @@ zfs_ioc_userobjspace_upgrade(zfs_cmd_t *zc)
objset_t *os;
int error;
- error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
if (error != 0)
return (error);
@@ -5209,7 +5275,7 @@ zfs_ioc_userobjspace_upgrade(zfs_cmd_t *zc)
}
dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
- dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+ dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG);
return (error);
}
@@ -5745,6 +5811,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* (optional) "compressok" -> (value ignored)
* presence indicates compressed DRR_WRITE records are permitted
+ * (optional) "rawok" -> (value ignored)
+ * presence indicates raw encrypted records should be used.
* (optional) "resume_object" and "resume_offset" -> (uint64)
* if present, resume send stream from specified object and offset.
* }
@@ -5763,6 +5831,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
boolean_t largeblockok;
boolean_t embedok;
boolean_t compressok;
+ boolean_t rawok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
@@ -5775,6 +5844,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
compressok = nvlist_exists(innvl, "compressok");
+ rawok = nvlist_exists(innvl, "rawok");
(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
@@ -5784,7 +5854,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
off = fp->f_offset;
error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
- fd, resumeobj, resumeoff, fp->f_vnode, &off);
+ rawok, fd, resumeobj, resumeoff, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5824,6 +5894,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
/* LINTED E_FUNC_SET_NOT_USED */
boolean_t embedok;
boolean_t compressok;
+ boolean_t rawok;
uint64_t space;
error = dsl_pool_hold(snapname, FTAG, &dp);
@@ -5839,6 +5910,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
compressok = nvlist_exists(innvl, "compressok");
+ rawok = nvlist_exists(innvl, "rawok");
error = nvlist_lookup_string(innvl, "from", &fromname);
if (error == 0) {
@@ -5852,8 +5924,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
if (error != 0)
goto out;
- error = dmu_send_estimate(tosnap, fromsnap, compressok,
- &space);
+ error = dmu_send_estimate(tosnap, fromsnap,
+ compressok || rawok, &space);
dsl_dataset_rele(fromsnap, FTAG);
} else if (strchr(fromname, '#') != NULL) {
/*
@@ -5868,7 +5940,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
if (error != 0)
goto out;
error = dmu_send_estimate_from_txg(tosnap,
- frombm.zbm_creation_txg, compressok, &space);
+ frombm.zbm_creation_txg, compressok || rawok,
+ &space);
} else {
/*
* from is not properly formatted as a snapshot or
@@ -5879,7 +5952,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
}
} else {
// If estimating the size of a full send, use dmu_send_estimate
- error = dmu_send_estimate(tosnap, NULL, compressok, &space);
+ error = dmu_send_estimate(tosnap, NULL, compressok || rawok,
+ &space);
}
fnvlist_add_uint64(outnvl, "space", space);
@@ -5928,6 +6002,124 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
return (err);
}
+/*
+ * Load a user's wrapping key into the kernel.
+ * innvl: {
+ * "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * (optional) "noop" -> (value ignored)
+ * presence indicates the key should only be verified, not loaded
+ * }
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret;
+ dsl_crypto_params_t *dcp = NULL;
+ nvlist_t *hidden_args;
+ boolean_t noop = nvlist_exists(innvl, "noop");
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+ if (ret != 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
+ hidden_args, &dcp);
+ if (ret != 0)
+ goto error;
+
+ ret = spa_keystore_load_wkey(dsname, dcp, noop);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_params_free(dcp, noop);
+
+ return (0);
+
+error:
+ dsl_crypto_params_free(dcp, B_TRUE);
+ return (ret);
+}
+
+/*
+ * Unload a user's wrapping key from the kernel.
+ * Both innvl and outnvl are unused.
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret = 0;
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = (SET_ERROR(EINVAL));
+ goto out;
+ }
+
+ ret = spa_keystore_unload_wkey(dsname);
+ if (ret != 0)
+ goto out;
+
+out:
+ return (ret);
+}
+
+/*
+ * Changes a user's wrapping key used to decrypt a dataset. The keyformat,
+ * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified
+ * here to change how the key is derived in userspace.
+ *
+ * innvl: {
+ * "hidden_args" (optional) -> { "wkeydata" -> value }
+ * raw uint8_t array of new encryption wrapping key data (32 bytes)
+ * "props" (optional) -> { prop -> value }
+ * }
+ *
+ * outnvl is unused
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret;
+ uint64_t cmd = DCP_CMD_NONE;
+ dsl_crypto_params_t *dcp = NULL;
+ nvlist_t *args = NULL, *hidden_args = NULL;
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = (SET_ERROR(EINVAL));
+ goto error;
+ }
+
+ (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd);
+ (void) nvlist_lookup_nvlist(innvl, "props", &args);
+ (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+
+ ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp);
+ if (ret != 0)
+ goto error;
+
+ ret = spa_keystore_change_key(dsname, dcp);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_params_free(dcp, B_FALSE);
+
+ return (0);
+
+error:
+ dsl_crypto_params_free(dcp, B_TRUE);
+ return (ret);
+}
+
static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
static void
@@ -6099,6 +6291,16 @@ zfs_ioctl_init(void)
zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW,
zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
+ zfs_ioc_load_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE);
+ zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
+ zfs_ioc_unload_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE);
+ zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
+ zfs_ioc_change_key, zfs_secpolicy_change_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ B_TRUE, B_TRUE);
zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
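
As a minimal userspace sketch of the innvl format documented in zfs_ioc_load_key() above: the wrapping key is passed as 32 raw bytes under "wkeydata" inside "hidden_args" (ZPOOL_HIDDEN_ARGS), and the presence of "noop" asks the kernel to verify the key without loading it. The helper name build_load_key_innvl() is hypothetical and assumes only libnvpair; the real consumer is the zfs(8) command via libzfs_core, which is not part of this diff.

#include <stdint.h>
#include <libnvpair.h>

/*
 * Hypothetical helper: build the innvl for the load-key ioctl documented
 * above.  wkeydata must be the raw 32-byte wrapping key; pass a nonzero
 * verify_only to add the "noop" flag so the key is checked but not loaded.
 * Returns the nvlist on success, NULL on failure; the caller frees it.
 */
static nvlist_t *
build_load_key_innvl(uint8_t *wkeydata, int verify_only)
{
	nvlist_t *innvl = NULL, *hidden_args = NULL;

	if (nvlist_alloc(&innvl, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);
	if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0)
		goto fail;

	/* "hidden_args" / "wkeydata", as described in the comments above */
	if (nvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, 32) != 0 ||
	    nvlist_add_nvlist(innvl, "hidden_args", hidden_args) != 0)
		goto fail;

	/* presence of "noop" means verify the key only, do not load it */
	if (verify_only && nvlist_add_boolean(innvl, "noop") != 0)
		goto fail;

	nvlist_free(hidden_args);
	return (innvl);

fail:
	nvlist_free(hidden_args);
	nvlist_free(innvl);
	return (NULL);
}

The resulting nvlist would then be handed to the ZFS_IOC_LOAD_KEY ioctl through libzfs_core; the userland half of this change is not shown here.
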
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index 0e3f37781..b60045a95 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1048,7 +1048,8 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
* We claim to always be readonly so we can open snapshots;
* other ZPL code will prevent us from writing to snapshots.
*/
- error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, B_TRUE,
+ zfsvfs, &os);
if (error) {
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
@@ -1080,7 +1081,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
- dmu_objset_disown(os, zfsvfs);
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
*zfvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
@@ -1669,7 +1670,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
out:
if (error) {
- dmu_objset_disown(zfsvfs->z_os, zfsvfs);
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
zfsvfs_free(zfsvfs);
/*
* make sure we don't have dangling sb->s_fs_info which
@@ -1729,7 +1730,8 @@ zfs_umount(struct super_block *sb)
zfsvfs_t *zfsvfs = sb->s_fs_info;
objset_t *os;
- arc_remove_prune_callback(zfsvfs->z_arc_prune);
+ if (zfsvfs->z_arc_prune != NULL)
+ arc_remove_prune_callback(zfsvfs->z_arc_prune);
VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
os = zfsvfs->z_os;
zpl_bdi_destroy(sb);
@@ -1749,7 +1751,7 @@ zfs_umount(struct super_block *sb)
/*
* Finally release the objset
*/
- dmu_objset_disown(os, zfsvfs);
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
}
zfsvfs_free(zfsvfs);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 6b0346893..f15e8cddb 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -193,8 +193,8 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
* Read a log block and make sure it's valid.
*/
static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
- char **end)
+zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
+ blkptr_t *nbp, void *dst, char **end)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
@@ -208,11 +208,14 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
zio_flags |= ZIO_FLAG_SPECULATIVE;
+ if (!decrypt)
+ zio_flags |= ZIO_FLAG_RAW;
+
SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
- error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
+ &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
@@ -287,6 +290,14 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (zilog->zl_header->zh_claim_txg == 0)
zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+ /*
+ * If we are not using the resulting data, we are just checking that
+ * it hasn't been corrupted so we don't need to waste CPU time
+ * decompressing and decrypting it.
+ */
+ if (wbuf == NULL)
+ zio_flags |= ZIO_FLAG_RAW;
+
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
@@ -307,7 +318,8 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
*/
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
+ boolean_t decrypt)
{
const zil_header_t *zh = zilog->zl_header;
boolean_t claimed = !!zh->zh_claim_txg;
@@ -348,7 +360,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
if (blk_seq > claim_blk_seq)
break;
- if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+
+ error = parse_blk_func(zilog, &blk, arg, txg);
+ if (error != 0)
break;
ASSERT3U(max_blk_seq, <, blk_seq);
max_blk_seq = blk_seq;
@@ -357,7 +371,8 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
break;
- error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
+ error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
+ lrbuf, &end);
if (error != 0)
break;
@@ -367,7 +382,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
ASSERT3U(reclen, >=, sizeof (lr_t));
if (lr->lrc_seq > claim_lr_seq)
goto done;
- if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+
+ error = parse_lr_func(zilog, lr, arg, txg);
+ if (error != 0)
goto done;
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
@@ -382,7 +399,8 @@ done:
zilog->zl_parse_lr_count = lr_count;
ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
- (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
+ (decrypt && error == EIO));
zil_bp_tree_fini(zilog);
zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
@@ -423,9 +441,12 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
- if (lr->lr_blkptr.blk_birth >= first_txg &&
- (error = zil_read_log_data(zilog, lr, NULL)) != 0)
- return (error);
+ if (lr->lr_blkptr.blk_birth >= first_txg) {
+ error = zil_read_log_data(zilog, lr, NULL);
+ if (error != 0)
+ return (error);
+ }
+
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
@@ -579,7 +600,7 @@ zil_create(zilog_t *zilog)
BP_ZERO(&blk);
}
- error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
+ error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
ZIL_MIN_BLKSZ, &slog);
fastwrite = TRUE;
@@ -673,7 +694,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
ASSERT(list_is_empty(&zilog->zl_lwb_list));
(void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
+ zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
}
int
@@ -687,7 +708,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
int error;
error = dmu_objset_own_obj(dp, ds->ds_object,
- DMU_OST_ANY, B_FALSE, FTAG, &os);
+ DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
if (error != 0) {
/*
* EBUSY indicates that the objset is inconsistent, in which
@@ -708,8 +729,10 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
if (!BP_IS_HOLE(&zh->zh_log))
zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
BP_ZERO(&zh->zh_log);
+ if (os->os_encrypted)
+ os->os_next_write_raw = B_TRUE;
dsl_dataset_dirty(dmu_objset_ds(os), tx);
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_FALSE, FTAG);
return (0);
}
@@ -723,7 +746,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
ASSERT3U(zh->zh_claim_txg, <=, first_txg);
if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
(void) zil_parse(zilog, zil_claim_log_block,
- zil_claim_log_record, tx, first_txg);
+ zil_claim_log_record, tx, first_txg, B_FALSE);
zh->zh_claim_txg = first_txg;
zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
@@ -734,7 +757,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
}
ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_FALSE, FTAG);
return (0);
}
@@ -792,7 +815,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* which will update spa_max_claim_txg. See spa_load() for details.
*/
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
- zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
+ zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa),
+ B_FALSE);
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
@@ -1060,7 +1084,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
BP_ZERO(bp);
- error = zio_alloc_zil(spa, txg, bp, zil_blksz, &slog);
+ error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
if (slog) {
ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
@@ -2269,7 +2293,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t replay_func[TX_MAX_TYPE])
zilog->zl_replay_time = ddi_get_lbolt();
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
- zh->zh_claim_txg);
+ zh->zh_claim_txg, B_TRUE);
vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 1d69d8d8d..959b9a5a8 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -43,6 +43,7 @@
#include <sys/time.h>
#include <sys/trace_zio.h>
#include <sys/abd.h>
+#include <sys/dsl_crypt.h>
/*
* ==========================================================================
@@ -368,7 +369,7 @@ zio_pop_transforms(zio_t *zio)
/*
* ==========================================================================
- * I/O transform callbacks for subblocks and decompression
+ * I/O transform callbacks for subblocks, decompression, and decryption
* ==========================================================================
*/
static void
@@ -394,6 +395,126 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
}
}
+static void
+zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
+{
+ int ret;
+ void *tmp;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(size, !=, 0);
+
+ if (zio->io_error != 0)
+ return;
+
+ /*
+ * Verify the cksum of MACs stored in an indirect bp. It will always
+ * be possible to verify this since it does not require an encryption
+ * key.
+ */
+ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ /*
+ * We haven't decompressed the data yet, but
+ * zio_crypt_do_indirect_mac_checksum() requires
+ * decompressed data to be able to parse out the MACs
+ * from the indirect block. We decompress it now and
+ * throw away the result after we are finished.
+ */
+ tmp = zio_buf_alloc(lsize);
+ ret = zio_decompress_data(BP_GET_COMPRESS(bp),
+ zio->io_abd, tmp, zio->io_size, lsize);
+ if (ret != 0) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
+ tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
+ zio_buf_free(tmp, lsize);
+ } else {
+ ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
+ zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
+ }
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+ }
+
+ /*
+ * If this is an authenticated block, just check the MAC. It would be
+ * nice to separate this out into its own flag, but for the moment
+ * enum zio_flag is out of bits.
+ */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ if (ot == DMU_OT_OBJSET) {
+ ret = spa_do_crypt_objset_mac_abd(B_FALSE, zio->io_spa,
+ zio->io_bookmark.zb_objset, zio->io_abd, size,
+ BP_SHOULD_BYTESWAP(bp));
+ } else {
+ zio_crypt_decode_mac_bp(bp, mac);
+ ret = spa_do_crypt_mac_abd(B_FALSE, zio->io_spa,
+ zio->io_bookmark.zb_objset, zio->io_abd, size, mac);
+ }
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+ }
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+
+ if (ot == DMU_OT_INTENT_LOG) {
+ tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmp, mac);
+ abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp, mac);
+ }
+
+ ret = spa_do_crypt_abd(B_FALSE, zio->io_spa, zio->io_bookmark.zb_objset,
+ bp, bp->blk_birth, size, data, zio->io_abd, iv, mac, salt,
+ &no_crypt);
+ if (no_crypt)
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+
+error:
+ /* assert that the key was found unless this was speculative */
+ ASSERT(ret != ENOENT || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
+
+ /*
+ * If there was a decryption / authentication error return EIO as
+ * the io_error. If this was not a speculative zio, create an ereport.
+ */
+ if (ret == ECKSUM) {
+ ret = SET_ERROR(EIO);
+ if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
+ }
+ } else {
+ zio->io_error = ret;
+ }
+}
+
/*
* ==========================================================================
* I/O parent/child relationships and pipeline interlocks
@@ -606,7 +727,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
- IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
+ IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
@@ -844,9 +965,12 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
* Data can be NULL if we are going to call zio_write_override() to
* provide the already-allocated BP. But we may need the data to
* verify a dedup hit (if requested). In this case, don't try to
- * dedup (just take the already-allocated BP verbatim).
+ * dedup (just take the already-allocated BP verbatim). Encrypted
+ * dedup blocks need data as well so we also disable dedup in this
+ * case.
*/
- if (data == NULL && zio->io_prop.zp_dedup_verify) {
+ if (data == NULL &&
+ (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
}
@@ -1186,16 +1310,23 @@ static int
zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
- !(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t psize =
- BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
+ !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
psize, psize, zio_decompress);
}
+ if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
+ BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decrypt);
+ }
+
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
int psize = BPE_GET_PSIZE(bp);
void *data = abd_borrow_buf(zio->io_abd, psize);
@@ -1222,7 +1353,6 @@ zio_read_bp_init(zio_t *zio)
static int
zio_write_bp_init(zio_t *zio)
{
-
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
@@ -1261,7 +1391,8 @@ zio_write_bp_init(zio_t *zio)
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
- if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
+ !zp->zp_encrypt) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
@@ -1290,8 +1421,6 @@ zio_write_compress(zio_t *zio)
uint64_t psize = zio->io_size;
int pass = 1;
- EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
-
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
@@ -1341,13 +1470,15 @@ zio_write_compress(zio_t *zio)
}
/* If it's a compressed write that is not raw, compress the buffer. */
- if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
+ if (compress != ZIO_COMPRESS_OFF &&
+ !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
- } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+ } else if (!zp->zp_dedup && !zp->zp_encrypt &&
+ psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
encode_embedded_bp_compressed(bp,
@@ -1445,6 +1576,8 @@ zio_write_compress(zio_t *zio)
if (zp->zp_dedup) {
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!zp->zp_encrypt ||
+ DMU_OT_IS_ENCRYPTED(zp->zp_type));
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
}
if (zp->zp_nopwrite) {
@@ -1868,7 +2001,8 @@ zio_suspend(spa_t *spa, zio_t *zio)
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
- zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
+ zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
+ NULL, NULL, 0, 0);
mutex_enter(&spa->spa_suspend_lock);
@@ -2298,11 +2432,19 @@ zio_write_gang_block(zio_t *pio)
uint64_t resid = pio->io_size;
uint64_t lsize;
int copies = gio->io_prop.zp_copies;
- int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
+ int gbh_copies;
zio_prop_t zp;
int g, error;
-
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+
+ /*
+ * encrypted blocks need DVA[2] free so encrypted gang headers can't
+ * have a third copy.
+ */
+ gbh_copies = MIN(copies + 1, spa_max_replication(spa));
+ if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
+ gbh_copies = SPA_DVAS_PER_BP - 1;
+
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
@@ -2376,12 +2518,16 @@ zio_write_gang_block(zio_t *pio)
zp.zp_checksum = gio->io_prop.zp_checksum;
zp.zp_compress = ZIO_COMPRESS_OFF;
+ zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
+ bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
@@ -2460,6 +2606,7 @@ zio_nop_write(zio_t *zio)
if (BP_IS_HOLE(bp_orig) ||
!(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) ||
+ BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
@@ -2609,7 +2756,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
* pushed the I/O transforms. That's an important optimization
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
* However, we should never get a raw, override zio so in these
- * cases we can compare the io_data directly. This is useful because
+ * cases we can compare the io_abd directly. This is useful because
* it allows us to do dedup verification even if we don't have access
* to the original data (for instance, if the encryption keys aren't
* loaded).
@@ -3097,8 +3244,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
-zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
- boolean_t *slog)
+zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
+ uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
@@ -3130,6 +3277,23 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
BP_SET_LEVEL(new_bp, 0);
BP_SET_DEDUP(new_bp, 0);
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+
+ /*
+ * encrypted blocks will require an IV and salt. We generate
+ * these now since we will not be rewriting the bp at
+ * rewrite time.
+ */
+ if (os->os_encrypted) {
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+
+ BP_SET_CRYPT(new_bp, B_TRUE);
+ VERIFY0(spa_crypt_get_salt(spa,
+ dmu_objset_id(os), salt));
+ VERIFY0(zio_crypt_generate_iv(iv));
+
+ zio_crypt_encode_params_bp(new_bp, salt, iv);
+ }
}
return (error);
@@ -3464,6 +3628,146 @@ zio_vdev_io_bypass(zio_t *zio)
/*
* ==========================================================================
+ * Encrypt and store encryption parameters
+ * ==========================================================================
+ */
+
+
+/*
+ * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
+ * managing the storage of encryption parameters and passing them to the
+ * lower-level encryption functions.
+ */
+static int
+zio_encrypt(zio_t *zio)
+{
+ zio_prop_t *zp = &zio->io_prop;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_GET_PSIZE(bp);
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ void *enc_buf = NULL;
+ abd_t *eabd = NULL;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ /* the root zio already encrypted the data */
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ /* only ZIL blocks are re-encrypted on rewrite */
+ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
+ BP_SET_CRYPT(bp, B_FALSE);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ /* if we are doing raw encryption set the provided encryption params */
+ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ BP_SET_BYTEORDER(bp, zp->zp_byteorder);
+ if (ot != DMU_OT_OBJSET)
+ zio_crypt_encode_mac_bp(bp, zp->zp_mac);
+ if (DMU_OT_IS_ENCRYPTED(ot))
+ zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ /* indirect blocks only maintain a cksum of the lower level MACs */
+ if (BP_GET_LEVEL(bp) > 0) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
+ zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
+ mac));
+ zio_crypt_encode_mac_bp(bp, mac);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ /*
+ * Objset blocks are a special case since they have 2 256-bit MACs
+ * embedded within them.
+ */
+ if (ot == DMU_OT_OBJSET) {
+ ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa,
+ zio->io_bookmark.zb_objset, zio->io_abd, psize,
+ BP_SHOULD_BYTESWAP(bp)));
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ /* unencrypted object types are only authenticated with a MAC */
+ if (!DMU_OT_IS_ENCRYPTED(ot)) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa,
+ zio->io_bookmark.zb_objset, zio->io_abd, psize, mac));
+ zio_crypt_encode_mac_bp(bp, mac);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ /*
+ * Later passes of sync-to-convergence may decide to rewrite data
+ * in place to avoid more disk reallocations. This presents a problem
+ * for encryption because this constitutes rewriting the new data with
+ * the same encryption key and IV. However, this only applies to blocks
+ * in the MOS (particularly the spacemaps) and we do not encrypt the
+ * MOS. We assert that the zio is allocating or an intent log write
+ * to enforce this.
+ */
+ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
+ ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
+ ASSERT3U(psize, !=, 0);
+
+ enc_buf = zio_buf_alloc(psize);
+ eabd = abd_get_from_buf(enc_buf, psize);
+ abd_take_ownership_of_buf(eabd, B_TRUE);
+
+ /*
+ * For an explanation of what encryption parameters are stored
+ * where, see the block comment in zio_crypt.c.
+ */
+ if (ot == DMU_OT_INTENT_LOG) {
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ } else {
+ BP_SET_CRYPT(bp, B_TRUE);
+ }
+
+ /* Perform the encryption. This should not fail */
+ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, zio->io_bookmark.zb_objset, bp,
+ zio->io_txg, psize, zio->io_abd, eabd, iv, mac, salt, &no_crypt));
+
+ /* encode encryption metadata into the bp */
+ if (ot == DMU_OT_INTENT_LOG) {
+ /*
+ * ZIL blocks store the MAC in the embedded checksum, so the
+ * transform must always be applied.
+ */
+ zio_crypt_encode_mac_zil(enc_buf, mac);
+ zio_push_transform(zio, eabd, psize, psize, NULL);
+ } else {
+ BP_SET_CRYPT(bp, B_TRUE);
+ zio_crypt_encode_params_bp(bp, salt, iv);
+ zio_crypt_encode_mac_bp(bp, mac);
+
+ if (no_crypt) {
+ ASSERT3U(ot, ==, DMU_OT_DNODE);
+ abd_free(eabd);
+ } else {
+ zio_push_transform(zio, eabd, psize, psize, NULL);
+ }
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
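+
+/*
+ * Note: zio_encrypt is added to zio_pipeline[] (see below) after
+ * zio_write_compress and before zio_checksum_generate, so for encrypted
+ * blocks the block checksum is always computed over the ciphertext
+ * rather than the plaintext.
+ */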
+
+/*
+ * ==========================================================================
* Generate and verify checksums
* ==========================================================================
*/
@@ -3523,8 +3827,8 @@ zio_checksum_verify(zio_t *zio)
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
zfs_ereport_start_checksum(zio->io_spa,
- zio->io_vd, zio, zio->io_offset,
- zio->io_size, NULL, &info);
+ zio->io_vd, &zio->io_bookmark, zio,
+ zio->io_offset, zio->io_size, NULL, &info);
}
}
@@ -3824,7 +4128,7 @@ zio_done(zio_t *zio)
if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
- zio->io_vd, zio, 0, 0);
+ zio->io_vd, &zio->io_bookmark, zio, 0, 0);
}
if (zio->io_error) {
@@ -3837,7 +4141,7 @@ zio_done(zio_t *zio)
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
- zio->io_vd, zio, 0, 0);
+ zio->io_vd, &zio->io_bookmark, zio, 0, 0);
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
@@ -3846,9 +4150,9 @@ zio_done(zio_t *zio)
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
- spa_log_error(zio->io_spa, zio);
+ spa_log_error(zio->io_spa, &zio->io_bookmark);
zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
- NULL, zio, 0, 0);
+ NULL, &zio->io_bookmark, zio, 0, 0);
}
}
@@ -4046,6 +4350,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_free_bp_init,
zio_issue_async,
zio_write_compress,
+ zio_encrypt,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 6dfcb0631..19d281bef 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -308,6 +308,25 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
mutex_exit(&spa->spa_cksum_tmpls_lock);
}
+/* convenience function to update a checksum to accommodate an encryption MAC */
+static void
+zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
+{
+ /*
+ * Weak checksums do not have their entropy spread evenly
+ * across the bits of the checksum. Therefore, when truncating
+ * a weak checksum we XOR the first 2 words with the last 2 so
+ * that we don't "lose" any entropy unnecessarily.
+ */
+ if (xor) {
+ cksum->zc_word[0] ^= cksum->zc_word[2];
+ cksum->zc_word[1] ^= cksum->zc_word[3];
+ }
+
+ cksum->zc_word[2] = saved->zc_word[2];
+ cksum->zc_word[3] = saved->zc_word[3];
+}
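+
+/*
+ * For example (illustrative only): for an encrypted block checksummed with
+ * fletcher4 (which lacks ZCHECKSUM_FLAG_DEDUP), the stored blk_cksum ends
+ * up laid out as:
+ *
+ *	zc_word[0] = f[0] ^ f[2]	zc_word[1] = f[1] ^ f[3]
+ *	zc_word[2], zc_word[3]		= the 128-bit encryption MAC (from *saved)
+ *
+ * where f[] is the full fletcher4 result. zio_checksum_error_impl() below
+ * applies the matching truncation when verifying.
+ */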
+
/*
* Generate the checksum.
*/
@@ -319,8 +338,9 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t cksum;
+ zio_cksum_t cksum, saved;
spa_t *spa = zio->io_spa;
+ boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
@@ -331,6 +351,8 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
zio_eck_t eck;
size_t eck_offset;
+ bzero(&saved, sizeof (zio_cksum_t));
+
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t zilc;
abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
@@ -347,31 +369,36 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
zio_checksum_gang_verifier(&eck.zec_cksum, bp);
- abd_copy_from_buf_off(abd, &eck.zec_cksum,
- eck_offset + offsetof(zio_eck_t, zec_cksum),
- sizeof (zio_cksum_t));
} else if (checksum == ZIO_CHECKSUM_LABEL) {
zio_checksum_label_verifier(&eck.zec_cksum, offset);
- abd_copy_from_buf_off(abd, &eck.zec_cksum,
- eck_offset + offsetof(zio_eck_t, zec_cksum),
- sizeof (zio_cksum_t));
} else {
- bp->blk_cksum = eck.zec_cksum;
+ saved = eck.zec_cksum;
+ eck.zec_cksum = bp->blk_cksum;
}
abd_copy_from_buf_off(abd, &zec_magic,
eck_offset + offsetof(zio_eck_t, zec_magic),
sizeof (zec_magic));
+ abd_copy_from_buf_off(abd, &eck.zec_cksum,
+ eck_offset + offsetof(zio_eck_t, zec_cksum),
+ sizeof (zio_cksum_t));
ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&cksum);
+ if (bp != NULL && BP_USES_CRYPT(bp) &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_checksum_handle_crypt(&cksum, &saved, insecure);
abd_copy_from_buf_off(abd, &cksum,
eck_offset + offsetof(zio_eck_t, zec_cksum),
sizeof (zio_cksum_t));
} else {
+ saved = bp->blk_cksum;
ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
- &bp->blk_cksum);
+ &cksum);
+ if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_checksum_handle_crypt(&cksum, &saved, insecure);
+ bp->blk_cksum = cksum;
}
}
@@ -458,6 +485,26 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
+ /*
+ * MAC checksums are a special case since half of this checksum will
+ * actually be the encryption MAC. This will be verified by the
+ * decryption process, so we just check the truncated checksum now.
+ * Objset blocks use embedded MACs so we don't truncate the checksum
+ * for them.
+ */
+ if (bp != NULL && BP_USES_CRYPT(bp) &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
+ if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
+ actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
+ actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
+ }
+
+ actual_cksum.zc_word[2] = 0;
+ actual_cksum.zc_word[3] = 0;
+ expected_cksum.zc_word[2] = 0;
+ expected_cksum.zc_word[3] = 0;
+ }
+
if (info != NULL) {
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
diff --git a/module/zfs/zio_crypt.c b/module/zfs/zio_crypt.c
new file mode 100644
index 000000000..8fcf51550
--- /dev/null
+++ b/module/zfs/zio_crypt.c
@@ -0,0 +1,2037 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption / Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC):
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that can not be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE, cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaindata as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaindata. We use an HMAC
+ * here so that a reproducible checksum of the plaindata is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
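+
+/*
+ * In summary, for an encrypted level 0 block the parameters described above
+ * are stored as follows (see the encode/decode helpers later in this file):
+ *
+ *	salt (64 bits)		DVA[2].dva_word[0]
+ *	IV, low 64 bits		DVA[2].dva_word[1]
+ *	IV, high 32 bits	upper 32 bits of blk_fill
+ *	MAC (128 bits)		blk_cksum.zc_word[2] and zc_word[3]
+ */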
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
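+
+/*
+ * For concreteness: with n = 398,065,730 we have n*(n-1)/2 ~= 7.92e16 and
+ * 2^96 ~= 7.92e28, so p(n) ~= 1 - e^(-1.0e-12) ~= 1.0e-12, i.e. about one
+ * in a trillion; hence the default salt rotation threshold of roughly
+ * 400,000,000 uses defined above.
+ */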
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+ uint_t km_len, uint8_t *out_buf)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_key_t key;
+ crypto_data_t input_cd, output_cd;
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = BYTES_TO_BITS(salt_len);
+ key.ck_data = salt;
+
+ /* initialize crypto data for the input and output data */
+ input_cd.cd_format = CRYPTO_DATA_RAW;
+ input_cd.cd_offset = 0;
+ input_cd.cd_length = km_len;
+ input_cd.cd_raw.iov_base = (char *)key_material;
+ input_cd.cd_raw.iov_len = input_cd.cd_length;
+
+ output_cd.cd_format = CRYPTO_DATA_RAW;
+ output_cd.cd_offset = 0;
+ output_cd.cd_length = SHA512_DIGEST_LEN;
+ output_cd.cd_raw.iov_base = (char *)out_buf;
+ output_cd.cd_raw.iov_len = output_cd.cd_length;
+
+ ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+ uint8_t *out_buf, uint_t out_len)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_key_t key;
+ crypto_data_t T_cd, info_cd, c_cd;
+ uint_t i, T_len = 0, pos = 0;
+ uint8_t c;
+ uint_t N = (out_len + SHA512_DIGEST_LEN) / SHA512_DIGEST_LEN;
+ uint8_t T[SHA512_DIGEST_LEN];
+
+ if (N > 255)
+ return (SET_ERROR(EINVAL));
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the extracted key as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = BYTES_TO_BITS(SHA512_DIGEST_LEN);
+ key.ck_data = extract_key;
+
+ /* initialize crypto data for the input and output data */
+ T_cd.cd_format = CRYPTO_DATA_RAW;
+ T_cd.cd_offset = 0;
+ T_cd.cd_raw.iov_base = (char *)T;
+
+ c_cd.cd_format = CRYPTO_DATA_RAW;
+ c_cd.cd_offset = 0;
+ c_cd.cd_length = 1;
+ c_cd.cd_raw.iov_base = (char *)&c;
+ c_cd.cd_raw.iov_len = c_cd.cd_length;
+
+ info_cd.cd_format = CRYPTO_DATA_RAW;
+ info_cd.cd_offset = 0;
+ info_cd.cd_length = info_len;
+ info_cd.cd_raw.iov_base = (char *)info;
+ info_cd.cd_raw.iov_len = info_cd.cd_length;
+
+ for (i = 1; i <= N; i++) {
+ c = i;
+
+ T_cd.cd_length = T_len;
+ T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+ ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ ret = crypto_mac_update(ctx, &T_cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ ret = crypto_mac_update(ctx, &info_cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ ret = crypto_mac_update(ctx, &c_cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ T_len = SHA512_DIGEST_LEN;
+ T_cd.cd_length = T_len;
+ T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &T_cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(T, out_buf + pos,
+ (i != N) ? SHA512_DIGEST_LEN : (out_len - pos));
+ pos += SHA512_DIGEST_LEN;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+static int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+ uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+ uint_t out_len)
+{
+ int ret;
+ uint8_t extract_key[SHA512_DIGEST_LEN];
+
+ ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+ extract_key);
+ if (ret != 0)
+ goto error;
+
+ ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+ out_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
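+
+/*
+ * Usage sketch (illustrative only, not compiled): deriving the current
+ * per-salt block encryption key from the master key, as done in
+ * zio_crypt_key_init() and zio_crypt_key_change_salt() below. keydata_len
+ * is the key length for the chosen suite from zio_crypt_table.
+ */
+#if 0
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+	    keydata_len);
+#endif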
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ crypto_destroy_ctx_template(key->zk_hmac_tmpl);
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ uint_t keydata_len;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = BYTES_TO_BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = BYTES_TO_BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_salt_count = 0;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech;
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+ /* destroy the old context template and create the new one */
+ mech.cm_type = crypto_mech2id(
+     zio_crypt_table[key->zk_crypt].ci_mechname);
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+     &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+static int
+zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
+ uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
+{
+ int ret;
+ crypto_data_t plaindata, cipherdata;
+ CK_AES_CCM_PARAMS ccmp;
+ CK_AES_GCM_PARAMS gcmp;
+ crypto_mechanism_t mech;
+ zio_crypt_info_t crypt_info;
+ uint_t plain_full_len, maclen;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW);
+
+ /* lookup the encryption info */
+ crypt_info = zio_crypt_table[crypt];
+
+ /* the mac will always be the last iovec_t in the cipher uio */
+ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
+
+ ASSERT(maclen <= ZIO_DATA_MAC_LEN);
+
+ /* setup encryption mechanism (same as crypt) */
+ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname);
+
+ /*
+ * Strangely, the ICP requires that plain_full_len must include
+ * the MAC length when decrypting, even though the UIO does not
+ * need to have the extra space allocated.
+ */
+ if (encrypt) {
+ plain_full_len = datalen;
+ } else {
+ plain_full_len = datalen + maclen;
+ }
+
+ /*
+ * setup encryption params (currently only AES CCM and AES GCM
+ * are supported)
+ */
+ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) {
+ ccmp.ulNonceSize = ZIO_DATA_IV_LEN;
+ ccmp.ulAuthDataSize = auth_len;
+ ccmp.authData = authbuf;
+ ccmp.ulMACSize = maclen;
+ ccmp.nonce = ivbuf;
+ ccmp.ulDataSize = plain_full_len;
+
+ mech.cm_param = (char *)(&ccmp);
+ mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
+ } else {
+ gcmp.ulIvLen = ZIO_DATA_IV_LEN;
+ gcmp.ulIvBits = BYTES_TO_BITS(ZIO_DATA_IV_LEN);
+ gcmp.ulAADLen = auth_len;
+ gcmp.pAAD = authbuf;
+ gcmp.ulTagBits = BYTES_TO_BITS(maclen);
+ gcmp.pIv = ivbuf;
+
+ mech.cm_param = (char *)(&gcmp);
+ mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ }
+
+ /* populate the cipher and plain data structs. */
+ plaindata.cd_format = CRYPTO_DATA_UIO;
+ plaindata.cd_offset = 0;
+ plaindata.cd_uio = puio;
+ plaindata.cd_miscdata = NULL;
+ plaindata.cd_length = plain_full_len;
+
+ cipherdata.cd_format = CRYPTO_DATA_UIO;
+ cipherdata.cd_offset = 0;
+ cipherdata.cd_uio = cuio;
+ cipherdata.cd_miscdata = NULL;
+ cipherdata.cd_length = datalen + maclen;
+
+ /* perform the actual encryption */
+ if (encrypt) {
+ ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ } else {
+ ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ASSERT3U(ret, ==, CRYPTO_INVALID_MAC);
+ ret = SET_ERROR(ECKSUM);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ uio_t puio, cuio;
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint64_t crypt = key->zk_crypt;
+ uint64_t le_guid = LE_64(key->zk_guid);
+ uint_t enc_len, keydata_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* initialize uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata_out;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata_out;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_iovcnt = 2;
+ puio.uio_segflg = UIO_SYSSPACE;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)&le_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t guid,
+ uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac,
+ zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ uio_t puio, cuio;
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint_t enc_len, keydata_len;
+ uint64_t le_guid = LE_64(guid);
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* initialize uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_segflg = UIO_SYSSPACE;
+ puio.uio_iovcnt = 2;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)&le_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = BYTES_TO_BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = BYTES_TO_BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_data_t in_data, digest_data;
+ uint8_t raw_digestbuf[SHA512_DIGEST_LEN];
+
+ /* initialize sha512-hmac mechanism and crypto data */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the crypto data */
+ in_data.cd_format = CRYPTO_DATA_RAW;
+ in_data.cd_offset = 0;
+ in_data.cd_length = datalen;
+ in_data.cd_raw.iov_base = (char *)data;
+ in_data.cd_raw.iov_len = in_data.cd_length;
+
+ digest_data.cd_format = CRYPTO_DATA_RAW;
+ digest_data.cd_offset = 0;
+ digest_data.cd_length = SHA512_DIGEST_LEN;
+ digest_data.cd_raw.iov_base = (char *)raw_digestbuf;
+ digest_data.cd_raw.iov_len = digest_data.cd_length;
+
+ /* generate the hmac */
+ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl,
+ &digest_data, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_digestbuf, digestbuf, ZIO_DATA_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(digestbuf, ZIO_DATA_MAC_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LEN];
+
+ ret = zio_crypt_do_hmac(key, data, datalen, digestbuf);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
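+
+/*
+ * A minimal sketch (illustrative only, not compiled): because the salt and
+ * IV are derived from an HMAC of the plaintext, two blocks with identical
+ * plaindata under the same key receive identical encryption parameters, so
+ * their ciphertexts match and remain dedup-able.
+ */
+#if 0
+	uint8_t salt1[ZIO_DATA_SALT_LEN], iv1[ZIO_DATA_IV_LEN];
+	uint8_t salt2[ZIO_DATA_SALT_LEN], iv2[ZIO_DATA_IV_LEN];
+
+	VERIFY0(zio_crypt_generate_iv_salt_dedup(key, data, datalen,
+	    iv1, salt1));
+	VERIFY0(zio_crypt_generate_iv_salt_dedup(key, data, datalen,
+	    iv2, salt2));
+	ASSERT0(bcmp(salt1, salt2, ZIO_DATA_SALT_LEN));
+	ASSERT0(bcmp(iv1, iv2, ZIO_DATA_IV_LEN));
+#endif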
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp)
+{
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger asserts, but the
+ * value doesn't really matter as long as it is constant.
+ */
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, boolean_t should_bswap,
+ blkptr_t *bp)
+{
+ int ret;
+ crypto_data_t cd;
+ uint64_t le_blkprop;
+ blkptr_t tmpbp = *bp;
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp);
+
+ le_blkprop = (ZFS_HOST_BYTEORDER) ?
+ tmpbp.blk_prop : BSWAP_64(tmpbp.blk_prop);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&le_blkprop;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ zio_crypt_decode_mac_bp(&tmpbp, mac);
+ cd.cd_length = ZIO_DATA_MAC_LEN;
+ cd.cd_raw.iov_base = (char *)mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, boolean_t should_bswap,
+ blkptr_t *bp)
+{
+ blkptr_t tmpbp = *bp;
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp);
+ zio_crypt_decode_mac_bp(&tmpbp, mac);
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ SHA2Update(ctx, &tmpbp.blk_prop, sizeof (uint64_t));
+ SHA2Update(ctx, mac, ZIO_DATA_MAC_LEN);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t crypt_len;
+ blkptr_t tmpbp = *bp;
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp);
+ zio_crypt_decode_mac_bp(&tmpbp, mac);
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ crypt_len = sizeof (uint64_t);
+ bcopy(&tmpbp.blk_prop, *aadp, crypt_len);
+ *aadp += crypt_len;
+ *aad_len += crypt_len;
+
+ crypt_len = ZIO_DATA_MAC_LEN;
+ bcopy(mac, *aadp, crypt_len);
+ *aadp += crypt_len;
+ *aad_len += crypt_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, boolean_t should_bswap,
+ dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ crypto_data_t cd;
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ cd.cd_length = sizeof (tmp_dncore);
+ cd.cd_raw.iov_base = (char *)adnp;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACs for
+ * protecting the integrity of their data. The portable_mac protects
+ * the metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC, so neither action
+ * requires modifying the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_data_t cd;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LEN];
+ uint8_t raw_local_mac[SHA512_DIGEST_LEN];
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, should_bswap,
+ &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LEN;
+ cd.cd_raw.iov_base = (char *)raw_portable_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * The local MAC protects the user and group accounting. If these
+ * objects are not present, the local MAC is zeroed out.
+ */
+ if (osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the user accounting dnodes */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, should_bswap,
+ &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, should_bswap,
+ &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LEN;
+ cd.cd_raw.iov_base = (char *)raw_local_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(uio_t *uio)
+{
+ if (uio->uio_iov)
+ kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LEN];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+ zio_crypt_bp_do_indrect_checksum_updates(&ctx, byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+ uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ int ret;
+ uint64_t txtype;
+ uint_t nr_src, nr_dst, lr_len, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ /* cipherbuf always needs an extra iovec for the MAC */
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ /* find the start and end record of the log block */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /* calculate the number of encrypted iovecs we will need */
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ /* allocate the iovec arrays */
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (!src_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (!dst_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, dst, sizeof (zil_chain_t));
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ /* loop over records again, filling in iovecs */
+ nr_iovecs = 0;
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+ * except the bp if it exists. If the bp does exist we want to
+ * authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ nr_iovecs++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_base =
+ slrp + sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base =
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
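
The routine above makes two passes over the variable-length log records: the first pass only counts how many iovecs are needed, the second allocates and fills them while leaving the record headers (and the zil_chain_t) out of the encrypted regions. A simplified, self-contained sketch of that two-pass walk follows; toy_lr_t, toy_iovec_t, and the record sizes are assumptions for illustration and do not match the real lr_t layout.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

/* toy stand-in for lr_t: a fixed header followed by a variable payload */
typedef struct {
	uint64_t	reclen;	/* total record length, header included */
	uint64_t	txtype;
} toy_lr_t;

typedef struct {
	void	*iov_base;
	size_t	iov_len;
} toy_iovec_t;

/*
 * Walk the records twice: once to count how many payload iovecs are
 * needed, once to fill them in. The headers stay out of the iovecs;
 * in the real code they are copied in the clear and authenticated.
 */
static toy_iovec_t *
build_payload_iovecs(uint8_t *buf, size_t nused, unsigned *cntp)
{
	uint8_t *p, *end = buf + nused;
	unsigned cnt = 0, i = 0;
	toy_iovec_t *iovs;

	for (p = buf; p < end; p += ((toy_lr_t *)p)->reclen)
		cnt++;

	iovs = calloc(cnt, sizeof (*iovs));
	for (p = buf; p < end; p += ((toy_lr_t *)p)->reclen, i++) {
		toy_lr_t *lr = (toy_lr_t *)p;
		iovs[i].iov_base = p + sizeof (*lr);
		iovs[i].iov_len = lr->reclen - sizeof (*lr);
	}

	*cntp = cnt;
	return (iovs);
}

int
main(void)
{
	uint64_t blkbuf[32] = { 0 };
	uint8_t *blk = (uint8_t *)blkbuf;
	toy_lr_t *lr = (toy_lr_t *)blk;
	unsigned cnt;

	lr->reclen = sizeof (*lr) + 40;		/* record 1: 40-byte payload */
	lr = (toy_lr_t *)(blk + lr->reclen);
	lr->reclen = sizeof (*lr) + 24;		/* record 2: 24-byte payload */

	toy_iovec_t *iovs = build_payload_iovecs(blk,
	    (sizeof (toy_lr_t) + 40) + (sizeof (toy_lr_t) + 24), &cnt);
	for (unsigned i = 0; i < cnt; i++)
		printf("iovec %u: len %zu\n", i, iovs[i].iov_len);
	free(iovs);
	return (0);
}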
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+ uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ int ret;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+ * is a noop) or are only used in != 0 checks, which work regardless
+ * of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (!src_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (!dst_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ nr_iovecs = 0;
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+ * not portable, so we mask those values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ nr_iovecs++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
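
A detail worth highlighting in the dnode path is that the copy placed in the AAD buffer is sanitized first: non-portable dn_flags bits and dn_used are cleared so that the sender and a raw receiver compute the same MAC. A minimal sketch of that masking step, with toy_dnode_t and PORTABLE_FLAGS_MASK as assumed stand-ins for the real structures:

#include <stdint.h>
#include <stdio.h>

#define	PORTABLE_FLAGS_MASK	0x0f	/* assumed mask, for illustration */

typedef struct {
	uint8_t		flags;		/* some bits are not portable */
	uint64_t	used;		/* space accounting, not portable */
	uint8_t		core[16];	/* stand-in for the other core fields */
} toy_dnode_t;

/*
 * Copy the record into the to-be-authenticated (AAD) area, but strip the
 * fields that would differ after a raw send so both sides compute the
 * same MAC over it.
 */
static void
append_portable_aad(const toy_dnode_t *dn, toy_dnode_t *adn)
{
	*adn = *dn;
	adn->flags &= PORTABLE_FLAGS_MASK;
	adn->used = 0;
}

int
main(void)
{
	toy_dnode_t dn = { .flags = 0xf3, .used = 4096 }, adn;

	append_portable_aad(&dn, &adn);
	printf("masked flags: 0x%02x, used: %llu\n",
	    adn.flags, (unsigned long long)adn.used);
	return (0);
}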
+
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio,
+ uint_t *enc_len)
+{
+ int ret;
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+
+ /* allocate the iovecs for the plain and cipher data */
+ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!plain_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ plain_iovecs[0].iov_base = plainbuf;
+ plain_iovecs[0].iov_len = datalen;
+ cipher_iovecs[0].iov_base = cipherbuf;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ puio->uio_iov = plain_iovecs;
+ puio->uio_iovcnt = nr_plain;
+ cuio->uio_iov = cipher_iovecs;
+ cuio->uio_iovcnt = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
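
For ordinary blocks the layout is simple: the plaintext uio covers the buffer with one iovec, while the ciphertext uio reserves a second iovec that zio_crypt_init_uios() later points at the MAC. A small sketch of that layout using a local iovec type (toy_iovec_t and TOY_MAC_LEN are illustrative, not the kernel definitions):

#include <stdint.h>
#include <stdio.h>

#define	TOY_MAC_LEN	16

typedef struct {
	void	*iov_base;
	size_t	iov_len;
} toy_iovec_t;

/*
 * plain:  [ data ]            (1 iovec)
 * cipher: [ data ][ MAC ]     (2 iovecs; the MAC slot is filled in later,
 *                              as zio_crypt_init_uios() does for real uios)
 */
static void
init_normal_iovecs(uint8_t *plainbuf, uint8_t *cipherbuf, size_t datalen,
    uint8_t *mac, toy_iovec_t plain[1], toy_iovec_t cipher[2])
{
	plain[0].iov_base = plainbuf;
	plain[0].iov_len = datalen;
	cipher[0].iov_base = cipherbuf;
	cipher[0].iov_len = datalen;
	cipher[1].iov_base = mac;
	cipher[1].iov_len = TOY_MAC_LEN;
}

int
main(void)
{
	uint8_t pb[512], cb[512], mac[TOY_MAC_LEN];
	toy_iovec_t plain[1], cipher[2];

	init_normal_iovecs(pb, cb, sizeof (pb), mac, plain, cipher);
	printf("plain iovecs: 1 (%zu bytes), cipher iovecs: 2 (%zu + %zu)\n",
	    plain[0].iov_len, cipher[0].iov_len, cipher[1].iov_len);
	return (0);
}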
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, dmu_object_type_t ot, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac,
+ uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ puio->uio_segflg = UIO_SYSSPACE;
+ cuio->uio_segflg = UIO_SYSSPACE;
+
+ mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+ mac_iov->iov_base = mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, uint8_t *salt,
+ dmu_object_type_t ot, uint8_t *iv, uint8_t *mac, uint_t datalen,
+ boolean_t byteswap, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ uio_t puio, cuio;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ crypto_ctx_template_t tmpl;
+ uint8_t *authbuf = NULL;
+
+ bzero(&puio, sizeof (uio_t));
+ bzero(&cuio, sizeof (uio_t));
+
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, ot, plainbuf, cipherbuf, datalen,
+ byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len,
+ no_crypt);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = key->zk_current_tmpl;
+ } else {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = BYTES_TO_BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /* perform the encryption / decryption */
+ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
+ &puio, &cuio, authbuf, auth_len);
+ if (ret != 0)
+ goto error;
+
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (ret);
+}
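
The key-selection branch above is the performance-sensitive part: when the block's salt matches the key's current salt, the cached derived key and context template are reused; otherwise a temporary key is derived from the master key for this block only and wiped after use. A stand-alone sketch of that decision follows; derive_key() is a toy stand-in for hkdf_sha512() and is not a real KDF, and the structure fields are illustrative assumptions.

#include <stdint.h>
#include <string.h>

#define	SALT_LEN	8
#define	KEY_LEN		32

typedef struct {
	uint8_t	salt[SALT_LEN];		/* salt of the cached derived key */
	uint8_t	current_key[KEY_LEN];	/* cached derived key */
	uint8_t	master_key[KEY_LEN];
} toy_key_t;

/* stand-in for hkdf_sha512(master, salt); NOT a real KDF */
static void
derive_key(const uint8_t *master, const uint8_t *salt, uint8_t *out)
{
	for (int i = 0; i < KEY_LEN; i++)
		out[i] = master[i] ^ salt[i % SALT_LEN];
}

static void
get_block_key(toy_key_t *key, const uint8_t *salt, uint8_t *out)
{
	if (memcmp(salt, key->salt, SALT_LEN) == 0) {
		/* fast path: reuse the cached derived key */
		memcpy(out, key->current_key, KEY_LEN);
	} else {
		/* slow path: derive a temporary key; caller wipes it */
		derive_key(key->master_key, salt, out);
	}
}

int
main(void)
{
	toy_key_t key = { .salt = { 1 }, .master_key = { 0xaa } };
	uint8_t blk_salt[SALT_LEN] = { 2 };
	uint8_t blk_key[KEY_LEN];

	derive_key(key.master_key, key.salt, key.current_key);
	get_block_key(&key, blk_salt, blk_key);	/* differing salt: slow path */
	memset(blk_key, 0, sizeof (blk_key));	/* wipe the temporary key */
	return (0);
}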
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, uint8_t *salt,
+ dmu_object_type_t ot, uint8_t *iv, uint8_t *mac, uint_t datalen,
+ boolean_t byteswap, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, salt, ot, iv, mac,
+ datalen, byteswap, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (ret);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 40fdf0777..4d11b52ab 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -451,7 +451,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (zv == NULL || zv->zv_objset == NULL) {
if (zv != NULL)
rw_exit(&zv->zv_suspend_lock);
- if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
FTAG, &os)) != 0) {
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
@@ -478,7 +478,7 @@ out:
kmem_free(doi, sizeof (dmu_object_info_t));
if (owned) {
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_TRUE, FTAG);
if (zv != NULL)
zv->zv_objset = NULL;
} else {
@@ -1268,7 +1268,7 @@ zvol_first_open(zvol_state_t *zv)
}
/* lie and say we're read-only */
- error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os);
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, 1, zv, &os);
if (error)
goto out_mutex;
@@ -1277,7 +1277,7 @@ zvol_first_open(zvol_state_t *zv)
error = zvol_setup_zv(zv);
if (error) {
- dmu_objset_disown(os, zv);
+ dmu_objset_disown(os, 1, zv);
zv->zv_objset = NULL;
}
@@ -1295,7 +1295,7 @@ zvol_last_close(zvol_state_t *zv)
zvol_shutdown_zv(zv);
- dmu_objset_disown(zv->zv_objset, zv);
+ dmu_objset_disown(zv->zv_objset, 1, zv);
zv->zv_objset = NULL;
}
@@ -1756,7 +1756,7 @@ zvol_create_minor_impl(const char *name)
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
- error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
if (error)
goto out_doi;
@@ -1822,7 +1822,7 @@ zvol_create_minor_impl(const char *name)
zv->zv_objset = NULL;
out_dmu_objset_disown:
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
@@ -1887,11 +1887,11 @@ zvol_prefetch_minors_impl(void *arg)
char *dsname = job->name;
objset_t *os = NULL;
- job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG,
- &os);
+ job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
+ FTAG, &os);
if (job->error == 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
- dmu_objset_disown(os, FTAG);
+ dmu_objset_disown(os, B_TRUE, FTAG);
}
}