summaryrefslogtreecommitdiffstats
path: root/module/zfs/dmu.c
diff options
context:
space:
mode:
authorTom Caputi <[email protected]>2017-08-14 13:36:48 -0400
committerBrian Behlendorf <[email protected]>2017-08-14 10:36:48 -0700
commitb52563034230b35f0562b6f40ad1a00f02bd9a05 (patch)
tree794ccc5160e997e280cb6e36c7778ce9f7a96548 /module/zfs/dmu.c
parent376994828fd3753aba75d492859727ca76f6a293 (diff)
Native Encryption for ZFS on Linux
This change incorporates three major pieces: The first change is a keystore that manages wrapping and encryption keys for encrypted datasets. These commands mostly involve manipulating the new DSL Crypto Key ZAP Objects that live in the MOS. Each encrypted dataset has its own DSL Crypto Key that is protected with a user's key. This level of indirection allows users to change their keys without re-encrypting their entire datasets. The change implements the new subcommands "zfs load-key", "zfs unload-key" and "zfs change-key" which allow the user to manage their encryption keys and settings. In addition, several new flags and properties have been added to allow dataset creation and to make mounting and unmounting more convenient. The second piece of this patch provides the ability to encrypt, decyrpt, and authenticate protected datasets. Each object set maintains a Merkel tree of Message Authentication Codes that protect the lower layers, similarly to how checksums are maintained. This part impacts the zio layer, which handles the actual encryption and generation of MACs, as well as the ARC and DMU, which need to be able to handle encrypted buffers and protected data. The last addition is the ability to do raw, encrypted sends and receives. The idea here is to send raw encrypted and compressed data and receive it exactly as is on a backup system. This means that the dataset on the receiving system is protected using the same user key that is in use on the sending side. By doing so, datasets can be efficiently backed up to an untrusted system without fear of data being compromised. Reviewed by: Matthew Ahrens <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Jorgen Lundman <[email protected]> Signed-off-by: Tom Caputi <[email protected]> Closes #494 Closes #5769
Diffstat (limited to 'module/zfs/dmu.c')
-rw-r--r--module/zfs/dmu.c268
1 files changed, 206 insertions, 62 deletions
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 717bd121f..e098a4966 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -73,60 +73,60 @@ unsigned long zfs_per_txg_dirty_frees_percent = 30;
int zfs_dmu_offset_next_sync = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { DMU_BSWAP_UINT8, TRUE, "unallocated" },
- { DMU_BSWAP_ZAP, TRUE, "object directory" },
- { DMU_BSWAP_UINT64, TRUE, "object array" },
- { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
- { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
- { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
- { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
- { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
- { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
- { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
- { DMU_BSWAP_ZAP, TRUE, "DSL props" },
- { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
- { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
- { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
- { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
- { DMU_BSWAP_UINT8, FALSE, "zvol object" },
- { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
- { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
- { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
- { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
- { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
- { DMU_BSWAP_UINT8, TRUE, "SPA history" },
- { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
- { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
- { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
- { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
- { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
- { DMU_BSWAP_UINT8, TRUE, "FUID table" },
- { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
- { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
- { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
- { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
- { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
- { DMU_BSWAP_UINT8, TRUE, "System attributes" },
- { DMU_BSWAP_ZAP, TRUE, "SA master node" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
- { DMU_BSWAP_ZAP, TRUE, "scan translations" },
- { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
- { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
- { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "object array" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "ZIL intent log" },
+ { DMU_BSWAP_DNODE, TRUE, TRUE, "DMU dnode" },
+ { DMU_BSWAP_OBJSET, TRUE, FALSE, "DMU objset" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL directory child map"},
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset snap map" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL props" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL dataset" },
+ { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
+ { DMU_BSWAP_OLDACL, TRUE, TRUE, "ZFS V0 ACL" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "ZFS plain file" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS delete queue" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "zvol object" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "other uint8[]" },
+ { DMU_BSWAP_UINT64, FALSE, TRUE, "other uint64[]" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "Pool properties" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL permissions" },
+ { DMU_BSWAP_ACL, TRUE, TRUE, "ZFS ACL" },
+ { DMU_BSWAP_UINT8, TRUE, TRUE, "ZFS SYSACL" },
+ { DMU_BSWAP_UINT8, TRUE, TRUE, "FUID table" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"},
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group used" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group quota" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"},
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
+ { DMU_BSWAP_UINT8, TRUE, TRUE, "System attributes" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "SA master node" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr registration" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr layouts" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
+ { DMU_BSWAP_UINT8, FALSE, TRUE, "deduplicated block" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL deadlist map" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL deadlist map hdr" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dir clones" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
@@ -198,6 +198,8 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
@@ -221,6 +223,8 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
@@ -321,11 +325,18 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* returns ENOENT, EIO, or 0.
*/
int
-dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
+ dmu_buf_t **dbp)
{
dnode_t *dn;
dmu_buf_impl_t *db;
int error;
+ uint32_t db_flags = DB_RF_MUST_SUCCEED;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
error = dnode_hold(os, object, FTAG, &dn);
if (error)
@@ -355,12 +366,24 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dnode_rele(dn, FTAG);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
+ error = dbuf_read(db, NULL, db_flags);
+ if (error) {
+ dnode_evict_bonus(dn);
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ return (error);
+ }
*dbp = &db->db;
return (0);
}
+int
+dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
+{
+ return (dmu_bonus_hold_impl(os, obj, tag, DMU_READ_NO_PREFETCH, dbp));
+}
+
/*
* returns ENOENT, EIO, or 0.
*
@@ -601,8 +624,8 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
* indirect blocks prefeteched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
*
- * Note that if the indirect blocks above the blocks being prefetched are not in
- * cache, they will be asychronously read in.
+ * Note that if the indirect blocks above the blocks being prefetched are not
+ * in cache, they will be asychronously read in.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
@@ -1462,6 +1485,83 @@ dmu_return_arcbuf(arc_buf_t *buf)
arc_buf_destroy(buf, FTAG);
}
+void
+dmu_assign_arcbuf_impl(dmu_buf_t *handle, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ dbuf_assign_arcbuf(db, buf, tx);
+}
+
+void
+dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
+ const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
+{
+ dmu_object_type_t type;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ uint64_t dsobj = dmu_objset_id(db->db_objset);
+
+ ASSERT3P(db->db_buf, !=, NULL);
+ ASSERT3U(dsobj, !=, 0);
+
+ dmu_buf_will_change_crypt_params(handle, tx);
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /*
+ * This technically violates the assumption the dmu code makes
+ * that dnode blocks are only released in syncing context.
+ */
+ (void) arc_release(db->db_buf, db);
+ arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
+}
+
+void
+dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
+ dmu_buf_t *handle, dmu_tx_t *tx)
+{
+ dmu_buf_t *dst_handle;
+ dmu_buf_impl_t *dstdb;
+ dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle;
+ arc_buf_t *abuf;
+ uint64_t datalen;
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ ASSERT3P(srcdb->db_buf, !=, NULL);
+
+ /* hold the db that we want to write to */
+ VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle,
+ DMU_READ_NO_DECRYPT));
+ dstdb = (dmu_buf_impl_t *)dst_handle;
+ datalen = arc_buf_size(srcdb->db_buf);
+
+ /* allocated an arc buffer that matches the type of srcdb->db_buf */
+ if (arc_is_encrypted(srcdb->db_buf)) {
+ arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac);
+ abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os),
+ byteorder, salt, iv, mac, DB_DNODE(dstdb)->dn_type,
+ datalen, arc_buf_lsize(srcdb->db_buf),
+ arc_get_compression(srcdb->db_buf));
+ } else {
+ /* we won't get a compressed db back from dmu_buf_hold() */
+ ASSERT3U(arc_get_compression(srcdb->db_buf),
+ ==, ZIO_COMPRESS_OFF);
+ abuf = arc_loan_buf(os->os_spa,
+ DMU_OT_IS_METADATA(DB_DNODE(dstdb)->dn_type), datalen);
+ }
+
+ ASSERT3U(datalen, ==, arc_buf_size(abuf));
+
+ /* copy the data to the new buffer and assign it to the dstdb */
+ bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen);
+ dbuf_assign_arcbuf(dstdb, abuf, tx);
+ dmu_buf_rele(dst_handle, FTAG);
+}
+
/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
@@ -1537,7 +1637,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
BP_SET_LSIZE(bp, db->db_size);
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
- bp->blk_fill = 1;
+ BP_SET_FILL(bp, 1);
}
}
}
@@ -1843,6 +1943,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
}
int
+dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_nlevels(dn, nlevels, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
{
@@ -1916,6 +2030,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
+ boolean_t encrypt = B_FALSE;
int copies = os->os_copies;
/*
@@ -2003,16 +2118,44 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
- zp->zp_checksum = checksum;
- zp->zp_compress = compress;
- ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+ /*
+ * All objects in an encrypted objset are protected from modification
+ * via a MAC. Encrypted objects store their IV and salt in the last DVA
+ * in the bp, so we cannot use all copies. Encrypted objects are also
+ * not subject to nopwrite since writing the same data will still
+ * result in a new ciphertext. Only encrypted blocks can be dedup'd
+ * to avoid ambiguity in the dedup code since the DDT does not store
+ * object types.
+ */
+ if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
+ encrypt = B_TRUE;
+
+ if (DMU_OT_IS_ENCRYPTED(type)) {
+ copies = MIN(copies, SPA_DVAS_PER_BP - 1);
+ nopwrite = B_FALSE;
+ } else {
+ dedup = B_FALSE;
+ }
+
+ if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
+ compress = ZIO_COMPRESS_EMPTY;
+ }
+ zp->zp_compress = compress;
+ zp->zp_checksum = checksum;
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
+ zp->zp_encrypt = encrypt;
+ zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}
/*
@@ -2267,6 +2410,7 @@ EXPORT_SYMBOL(dmu_object_info_from_dnode);
EXPORT_SYMBOL(dmu_object_info_from_db);
EXPORT_SYMBOL(dmu_object_size_from_db);
EXPORT_SYMBOL(dmu_object_dnsize_from_db);
+EXPORT_SYMBOL(dmu_object_set_nlevels);
EXPORT_SYMBOL(dmu_object_set_blocksize);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);