author		Tom Caputi <[email protected]>		2017-08-14 13:36:48 -0400
committer	Brian Behlendorf <[email protected]>	2017-08-14 10:36:48 -0700
commit		b52563034230b35f0562b6f40ad1a00f02bd9a05 (patch)
tree		794ccc5160e997e280cb6e36c7778ce9f7a96548 /module/zfs/dbuf.c
parent		376994828fd3753aba75d492859727ca76f6a293 (diff)
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. The keystore
operations mostly involve manipulating the new
DSL Crypto Key ZAP objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key", and
"zfs change-key", which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow encrypted
dataset creation and to make mounting and unmounting more
convenient.
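
As a concrete illustration, here is a minimal sketch of driving the
keystore through libzfs_core, assuming the lzc_load_key() and
lzc_unload_key() entry points this change adds; the dataset name and
passphrase below are hypothetical:

	/* minimal sketch; build against libzfs_core, errors trimmed */
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <libzfs_core.h>

	int
	main(void)
	{
		const char *fs = "pool/encrypted";          /* hypothetical dataset */
		uint8_t *wkey = (uint8_t *)"my passphrase"; /* hypothetical key */

		if (libzfs_core_init() != 0)
			return (1);

		/* roughly what "zfs load-key pool/encrypted" issues */
		if (lzc_load_key(fs, B_FALSE, wkey, strlen((char *)wkey)) != 0)
			fprintf(stderr, "load-key failed\n");

		/* ... the dataset can be mounted and used here ... */

		/* roughly what "zfs unload-key pool/encrypted" issues */
		(void) lzc_unload_key(fs);
		libzfs_core_fini();
		return (0);
	}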
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes (MACs) that protect the lower layers,
similar to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
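
As a rough mental model (not the on-disk format): each parent block
authenticates the MACs of its children, so the root MAC transitively
covers every block below it. A self-contained toy sketch in C, with a
stand-in mac() in place of the real keyed MAC the zio layer computes:

	#include <stdint.h>
	#include <stdio.h>

	/* stand-in for a real keyed MAC; an FNV-style fold, NOT cryptographic */
	static uint64_t
	mac(const void *data, size_t len, uint64_t key)
	{
		const uint8_t *p = data;
		uint64_t m = key;

		for (size_t i = 0; i < len; i++)
			m = (m ^ p[i]) * 1099511628211ULL;
		return (m);
	}

	int
	main(void)
	{
		uint64_t key = 0x243f6a88ULL;	/* hypothetical MAC key */
		char leaf0[] = "block 0", leaf1[] = "block 1";

		/* the parent authenticates its children's MACs */
		uint64_t macs[2] = {
			mac(leaf0, sizeof (leaf0), key),
			mac(leaf1, sizeof (leaf1), key),
		};
		uint64_t root = mac(macs, sizeof (macs), key);

		/* tamper with one leaf: its MAC, and therefore the root, changes */
		leaf1[0] ^= 1;
		macs[1] = mac(leaf1, sizeof (leaf1), key);
		printf("root %s\n",
		    mac(macs, sizeof (macs), key) == root ? "ok" : "MISMATCH");
		return (0);
	}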
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
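
For illustration, a sketch of issuing a raw send through libzfs_core,
assuming the LZC_SEND_FLAG_RAW flag this change introduces (the
snapshot name and output path are hypothetical); this is roughly what
"zfs send -w" boils down to:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <libzfs_core.h>

	int
	main(void)
	{
		int fd, err;

		if (libzfs_core_init() != 0)
			return (1);

		fd = open("/backup/stream.raw", O_WRONLY | O_CREAT | O_TRUNC, 0600);
		if (fd < 0)
			return (1);

		/*
		 * The stream carries ciphertext exactly as stored on disk,
		 * so the receiving system never needs the wrapping key.
		 */
		err = lzc_send("pool/encrypted@snap", NULL, fd, LZC_SEND_FLAG_RAW);
		if (err != 0)
			fprintf(stderr, "raw send failed: %d\n", err);

		close(fd);
		libzfs_core_fini();
		return (err == 0 ? 0 : 1);
	}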
Reviewed-by: Matthew Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Jorgen Lundman <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #494
Closes #5769
Diffstat (limited to 'module/zfs/dbuf.c')
-rw-r--r-- | module/zfs/dbuf.c | 209
1 file changed, 173 insertions, 36 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 625e06701..745715861 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -964,7 +964,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 }
 
 static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 
@@ -984,7 +984,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
-	} else if (zio == NULL || zio->io_error == 0) {
+	} else if (err == 0) {
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
@@ -1003,7 +1003,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
-	int err;
+	int err, zio_flags = 0;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -1021,6 +1021,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 */
 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+		arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ?
+		    dn->dn_dbuf->db_buf : NULL;
+
+		/* if the underlying dnode block is encrypted, decrypt it */
+		if (dn_buf != NULL && dn->dn_objset->os_encrypted &&
+		    DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) &&
+		    (flags & DB_RF_NO_DECRYPT) == 0 &&
+		    arc_is_encrypted(dn_buf)) {
+			err = arc_untransform(dn_buf, dn->dn_objset->os_spa,
+			    dmu_objset_id(dn->dn_objset), B_TRUE);
+			if (err != 0) {
+				DB_DNODE_EXIT(db);
+				mutex_exit(&db->db_mtx);
+				return (err);
+			}
+		}
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
@@ -1088,11 +1104,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
+	/*
+	 * All bps of an encrypted os should have the encryption bit set.
+	 * If this is not true it indicates tampering and we report an error.
+	 */
+	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+		spa_log_error(db->db_objset->os_spa, &zb);
+		zfs_panic_recover("unencrypted block in encrypted "
+		    "object set %llu", dmu_objset_id(db->db_objset));
+		return (SET_ERROR(EIO));
+	}
+
 	dbuf_add_ref(db, NULL);
 
+	zio_flags = (flags & DB_RF_CANFAIL) ?
+	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
+
+	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+		zio_flags |= ZIO_FLAG_RAW;
+
 	err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
-	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
-	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
 	    &aflags, &zb);
 
 	return (err);
@@ -1141,18 +1173,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 			arc_space_consume(bonuslen, ARC_SPACE_BONUS);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		dnode_t *dn = DB_DNODE(db);
 		int size = arc_buf_size(db->db_buf);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 		enum zio_compress compress_type =
 		    arc_get_compression(db->db_buf);
 
-		if (compress_type == ZIO_COMPRESS_OFF) {
-			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
-		} else {
+		if (arc_is_encrypted(db->db_buf)) {
+			boolean_t byteorder;
+			uint8_t salt[ZIO_DATA_SALT_LEN];
+			uint8_t iv[ZIO_DATA_IV_LEN];
+			uint8_t mac[ZIO_DATA_MAC_LEN];
+
+			arc_get_raw_params(db->db_buf, &byteorder, salt,
+			    iv, mac);
+			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+			    compress_type);
+		} else if (compress_type != ZIO_COMPRESS_OFF) {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
 			    size, arc_buf_lsize(db->db_buf), compress_type);
+		} else {
+			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 		}
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
@@ -1188,16 +1233,21 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
+		spa_t *spa = dn->dn_objset->os_spa;
+
 		/*
-		 * If the arc buf is compressed, we need to decompress it to
-		 * read the data. This could happen during the "zfs receive" of
-		 * a stream which is compressed and deduplicated.
+		 * If the arc buf is compressed or encrypted, we need to
+		 * untransform it to read the data. This could happen during
+		 * the "zfs receive" of a stream which is deduplicated and
+		 * either raw or compressed. We do not need to do this if the
+		 * caller wants raw encrypted data.
 		 */
-		if (db->db_buf != NULL &&
-		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
-			dbuf_fix_old_data(db,
-			    spa_syncing_txg(dmu_objset_spa(db->db_objset)));
-			err = arc_decompress(db->db_buf);
+		if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 &&
+		    (arc_is_encrypted(db->db_buf) ||
+		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+			dbuf_fix_old_data(db, spa_syncing_txg(spa));
+			err = arc_untransform(db->db_buf, spa,
+			    dmu_objset_id(db->db_objset), B_FALSE);
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
@@ -1316,6 +1366,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
+	dr->dt.dl.dr_raw = B_FALSE;
 
 	/*
 	 * Release the already-written buffer, so we leave it in
@@ -1908,11 +1959,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	return (B_FALSE);
 }
 
-void
-dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+static void
+dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(tx->tx_txg != 0);
@@ -1944,13 +1994,20 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 
 	DB_DNODE_ENTER(db);
 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
-		rf |= DB_RF_HAVESTRUCT;
+		flags |= DB_RF_HAVESTRUCT;
 	DB_DNODE_EXIT(db);
 
-	(void) dbuf_read(db, NULL, rf);
+	(void) dbuf_read(db, NULL, flags);
 	(void) dbuf_dirty(db, tx);
 }
 
 void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_will_dirty_impl(db_fake,
+	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+}
+
+void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -1977,6 +2034,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	(void) dbuf_dirty(db, tx);
 }
 
+/*
+ * This function is effectively the same as dmu_buf_will_dirty(), but
+ * indicates the caller expects raw encrypted data in the db. It will
+ * also set the raw flag on the created dirty record.
+ */
+void
+dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dbuf_dirty_record_t *dr;
+
+	dmu_buf_will_dirty_impl(db_fake,
+	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+
+	dr = db->db_last_dirty;
+	while (dr != NULL && dr->dr_txg > tx->tx_txg)
+		dr = dr->dr_next;
+
+	ASSERT3P(dr, !=, NULL);
+	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+	dr->dt.dl.dr_raw = B_TRUE;
+}
+
 #pragma weak dmu_buf_fill_done = dbuf_fill_done
 /* ARGSUSED */
 void
@@ -2117,10 +2197,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int slots = DB_DNODE(db)->dn_num_slots;
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
-		ASSERT(db->db.db_data != NULL);
-		kmem_free(db->db.db_data, bonuslen);
-		arc_space_return(bonuslen, ARC_SPACE_BONUS);
-		db->db_state = DB_UNCACHED;
+		if (db->db.db_data != NULL) {
+			kmem_free(db->db.db_data, bonuslen);
+			arc_space_return(bonuslen, ARC_SPACE_BONUS);
+			db->db_state = DB_UNCACHED;
+		}
 	}
 
 	dbuf_clear_data(db);
@@ -2416,7 +2497,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
  * prefetch if the next block down is our target.
  */
 static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
 {
 	dbuf_prefetch_arg_t *dpa = private;
 	uint64_t nextblkid;
@@ -2438,7 +2519,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 	 */
 	if (zio != NULL) {
 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
-		if (zio->io_flags & ZIO_FLAG_RAW) {
+		if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
 			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
 		} else {
 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
@@ -2463,7 +2544,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
-	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+	if (BP_IS_HOLE(bp) || err != 0) {
 		kmem_free(dpa, sizeof (*dpa));
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@@ -2491,7 +2572,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 * Issue prefetch reads for the given block on the given level. If the indirect
 * blocks above that block are not in memory, we will read them in
 * asynchronously. As a result, this call never blocks waiting for a read to
- * complete.
+ * complete. Note that the prefetch might fail if the dataset is encrypted and
+ * the encryption key is unmapped before the IO completes.
 */
 void
 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
@@ -3121,6 +3203,41 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 }
 
 /*
+ * Ensure the dbuf's data is untransformed if the associated dirty
+ * record requires it. This is used by dbuf_sync_leaf() to ensure
+ * that a dnode block is decrypted before we write new data to it.
+ * For raw writes we assert that the buffer is already encrypted.
+ */
+static void
+dbuf_check_crypt(dbuf_dirty_record_t *dr)
+{
+	int err;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (!dr->dt.dl.dr_raw && arc_is_encrypted(db->db_buf)) {
+		/*
+		 * Unfortunately, there is currently no mechanism for
+		 * syncing context to handle decryption errors. An error
+		 * here is only possible if an attacker maliciously
+		 * changed a dnode block and updated the associated
+		 * checksums going up the block tree.
+		 */
+		err = arc_untransform(db->db_buf, db->db_objset->os_spa,
+		    dmu_objset_id(db->db_objset), B_TRUE);
+		if (err)
+			panic("Invalid dnode block MAC");
+	} else if (dr->dt.dl.dr_raw) {
+		/*
+		 * Writing raw encrypted data requires the db's arc buffer
+		 * to be converted to raw by the caller.
+		 */
+		ASSERT(arc_is_encrypted(db->db_buf));
+	}
+}
+
+/*
  * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
  * is critical the we not allow the compiler to inline this function in to
  * dbuf_sync_list() thereby drastically bloating the stack usage.
 */
@@ -3241,9 +3358,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
 		ASSERT(*datap != NULL);
 		ASSERT0(db->db_level);
-		ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
+		ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
 		    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
-		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+		bcopy(*datap, DN_BONUS(dn->dn_phys),
+		    DN_MAX_BONUS_LEN(dn->dn_phys));
 		DB_DNODE_EXIT(db);
 
 		if (*datap != db->db.db_data) {
@@ -3290,6 +3408,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
 
+	/*
+	 * If this is a dnode block, ensure it is appropriately encrypted
+	 * or decrypted, depending on what we are writing to it this txg.
+	 */
+	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+		dbuf_check_crypt(dr);
+
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    refcount_count(&db->db_holds) > 1 &&
@@ -3307,16 +3432,27 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		 * DNONE_DNODE blocks).
 		 */
 		int psize = arc_buf_size(*datap);
+		int lsize = arc_buf_lsize(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		enum zio_compress compress_type = arc_get_compression(*datap);
 
-		if (compress_type == ZIO_COMPRESS_OFF) {
-			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
-		} else {
+		if (arc_is_encrypted(*datap)) {
+			boolean_t byteorder;
+			uint8_t salt[ZIO_DATA_SALT_LEN];
+			uint8_t iv[ZIO_DATA_IV_LEN];
+			uint8_t mac[ZIO_DATA_MAC_LEN];
+
+			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+			*datap = arc_alloc_raw_buf(os->os_spa, db,
+			    dmu_objset_id(os), byteorder, salt, iv, mac,
+			    dn->dn_type, psize, lsize, compress_type);
+		} else if (compress_type != ZIO_COMPRESS_OFF) {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
-			int lsize = arc_buf_lsize(*datap);
 			*datap = arc_alloc_compressed_buf(os->os_spa, db,
 			    psize, lsize, compress_type);
+		} else {
+			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
 		}
 		bcopy(db->db.db_data, (*datap)->b_data, psize);
 	}
@@ -3453,7 +3589,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	DB_DNODE_EXIT(db);
 
 	if (!BP_IS_EMBEDDED(bp))
-		bp->blk_fill = fill;
+		BP_SET_FILL(bp, fill);
 
 	mutex_exit(&db->db_mtx);
@@ -3778,7 +3914,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED |
 		    ZIO_FLAG_NODATA, &zb);
 	} else {
-		arc_done_func_t *children_ready_cb = NULL;
+		arc_write_done_func_t *children_ready_cb = NULL;
 
 		ASSERT(arc_released(data));
 
 		/*
@@ -3810,6 +3946,7 @@ EXPORT_SYMBOL(dbuf_free_range);
 EXPORT_SYMBOL(dbuf_new_size);
 EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_will_change_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
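
To make the dbuf_check_crypt() decision above concrete, here is a
self-contained toy model of its control flow; every type and helper
below is a hypothetical stand-in (for arc_buf_t, arc_untransform(),
and the new dr_raw flag), not the kernel code itself:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct buf { bool encrypted; };     /* stand-in for arc_buf_t */
	struct dirty_rec { bool raw; };     /* stand-in for dr->dt.dl.dr_raw */

	/* stand-in for arc_untransform(); 0 means success */
	static int
	untransform(struct buf *b)
	{
		b->encrypted = false;
		return (0);
	}

	/* mirrors the control flow of dbuf_check_crypt() */
	static void
	check_crypt(struct dirty_rec *dr, struct buf *db_buf)
	{
		if (!dr->raw && db_buf->encrypted) {
			/* normal write to an encrypted dnode block: decrypt first */
			if (untransform(db_buf) != 0)
				fprintf(stderr, "Invalid dnode block MAC\n");
		} else if (dr->raw) {
			/* raw write: the caller must have supplied ciphertext */
			assert(db_buf->encrypted);
		}
	}

	int
	main(void)
	{
		struct buf b = { .encrypted = true };
		struct dirty_rec plain = { .raw = false }, raw = { .raw = true };

		check_crypt(&plain, &b);    /* decrypts b in place */
		printf("after plain write: %s\n", b.encrypted ? "encrypted" : "clear");

		b.encrypted = true;
		check_crypt(&raw, &b);      /* leaves the ciphertext untouched */
		printf("after raw write:   %s\n", b.encrypted ? "encrypted" : "clear");
		return (0);
	}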