aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/dbuf.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/dbuf.c')
-rw-r--r--module/zfs/dbuf.c209
1 files changed, 173 insertions, 36 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 625e06701..745715861 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -964,7 +964,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
}
static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@@ -984,7 +984,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
+ } else if (err == 0) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
@@ -1003,7 +1003,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
- int err;
+ int err, zio_flags = 0;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -1021,6 +1021,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ?
+ dn->dn_dbuf->db_buf : NULL;
+
+ /* if the underlying dnode block is encrypted, decrypt it */
+ if (dn_buf != NULL && dn->dn_objset->os_encrypted &&
+ DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) &&
+ (flags & DB_RF_NO_DECRYPT) == 0 &&
+ arc_is_encrypted(dn_buf)) {
+ err = arc_untransform(dn_buf, dn->dn_objset->os_spa,
+ dmu_objset_id(dn->dn_objset), B_TRUE);
+ if (err != 0) {
+ DB_DNODE_EXIT(db);
+ mutex_exit(&db->db_mtx);
+ return (err);
+ }
+ }
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
@@ -1088,11 +1104,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true it indicates tampering and we report an error.
+ */
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+ spa_log_error(db->db_objset->os_spa, &zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(db->db_objset));
+ return (SET_ERROR(EIO));
+ }
+
dbuf_add_ref(db, NULL);
+ zio_flags = (flags & DB_RF_CANFAIL) ?
+ ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
+
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ zio_flags |= ZIO_FLAG_RAW;
+
err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
@@ -1141,18 +1173,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
enum zio_compress compress_type =
arc_get_compression(db->db_buf);
- if (compress_type == ZIO_COMPRESS_OFF) {
- dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
- } else {
+ if (arc_is_encrypted(db->db_buf)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(db->db_buf, &byteorder, salt,
+ iv, mac);
+ dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+ dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+ mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+ compress_type);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
size, arc_buf_lsize(db->db_buf), compress_type);
+ } else {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
@@ -1188,16 +1233,21 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+
/*
- * If the arc buf is compressed, we need to decompress it to
- * read the data. This could happen during the "zfs receive" of
- * a stream which is compressed and deduplicated.
+ * If the arc buf is compressed or encrypted, we need to
+ * untransform it to read the data. This could happen during
+ * the "zfs receive" of a stream which is deduplicated and
+ * either raw or compressed. We do not need to do this if the
+ * caller wants raw encrypted data.
*/
- if (db->db_buf != NULL &&
- arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
- dbuf_fix_old_data(db,
- spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- err = arc_decompress(db->db_buf);
+ if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa,
+ dmu_objset_id(db->db_objset), B_FALSE);
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
@@ -1316,6 +1366,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_raw = B_FALSE;
/*
* Release the already-written buffer, so we leave it in
@@ -1908,11 +1959,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
-void
-dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+static void
+dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
dbuf_dirty_record_t *dr;
ASSERT(tx->tx_txg != 0);
@@ -1944,13 +1994,20 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
+ flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
- (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_read(db, NULL, flags);
(void) dbuf_dirty(db, tx);
}
void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+}
+
+void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -1977,6 +2034,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
}
+/*
+ * This function is effectively the same as dmu_buf_will_dirty(), but
+ * indicates the caller expects raw encrypted data in the db. It will
+ * also set the raw flag on the created dirty record.
+ *
+ * The dbuf is read and dirtied through dmu_buf_will_dirty_impl() with
+ * DB_RF_NO_DECRYPT added to the DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH
+ * flags that dmu_buf_will_dirty() passes. The db_last_dirty list is then
+ * walked via dr_next, skipping records whose dr_txg is greater than
+ * tx->tx_txg. The record reached is asserted to be non-NULL and to have
+ * dr_txg equal to tx->tx_txg before its dt.dl.dr_raw flag is set to
+ * B_TRUE.
+ *
+ * NOTE(review): the walk presumes db_last_dirty is ordered by descending
+ * dr_txg -- confirm against dbuf_dirty().
+ */
+void
+dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+
+ dr = db->db_last_dirty;
+ while (dr != NULL && dr->dr_txg > tx->tx_txg)
+ dr = dr->dr_next;
+
+ ASSERT3P(dr, !=, NULL);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dr->dt.dl.dr_raw = B_TRUE;
+}
+
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
@@ -2117,10 +2197,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
if (db->db_blkid == DMU_BONUS_BLKID) {
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- ASSERT(db->db.db_data != NULL);
- kmem_free(db->db.db_data, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- db->db_state = DB_UNCACHED;
+ if (db->db.db_data != NULL) {
+ kmem_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ }
}
dbuf_clear_data(db);
@@ -2416,7 +2497,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
* prefetch if the next block down is our target.
*/
static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
uint64_t nextblkid;
@@ -2438,7 +2519,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
- if (zio->io_flags & ZIO_FLAG_RAW) {
+ if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
@@ -2463,7 +2544,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ if (BP_IS_HOLE(bp) || err != 0) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@@ -2491,7 +2572,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
- * complete.
+ * complete. Note that the prefetch might fail if the dataset is encrypted and
+ * the encryption key is unmapped before the IO completes.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
@@ -3121,6 +3203,41 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
}
/*
+ * Ensure the dbuf's data is untransformed if the associated dirty
+ * record requires it. This is used by dbuf_sync_leaf() to ensure
+ * that a dnode block is decrypted before we write new data to it.
+ * For raw writes we assert that the buffer is already encrypted.
+ *
+ * The caller must hold db->db_mtx (asserted). The function returns
+ * nothing and distinguishes three cases:
+ * - dr is not raw and db->db_buf is encrypted: arc_untransform() is
+ *   called on db->db_buf; a nonzero result panics because this
+ *   function has no way to report an error.
+ * - dr is raw: db->db_buf is asserted to be encrypted and is otherwise
+ *   left alone.
+ * - dr is not raw and db->db_buf is not encrypted: nothing is done.
+ */
+static void
+dbuf_check_crypt(dbuf_dirty_record_t *dr)
+{
+ int err;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!dr->dt.dl.dr_raw && arc_is_encrypted(db->db_buf)) {
+ /*
+ * Unfortunately, there is currently no mechanism for
+ * syncing context to handle decryption errors. An error
+ * here is only possible if an attacker maliciously
+ * changed a dnode block and updated the associated
+ * checksums going up the block tree.
+ */
+ err = arc_untransform(db->db_buf, db->db_objset->os_spa,
+ dmu_objset_id(db->db_objset), B_TRUE);
+ if (err)
+ panic("Invalid dnode block MAC");
+ } else if (dr->dt.dl.dr_raw) {
+ /*
+ * Writing raw encrypted data requires the db's arc buffer
+ * to be converted to raw by the caller.
+ */
+ ASSERT(arc_is_encrypted(db->db_buf));
+ }
+}
+
+/*
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
* is critical that we not allow the compiler to inline this function into
* dbuf_sync_list() thereby drastically bloating the stack usage.
@@ -3241,9 +3358,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ bcopy(*datap, DN_BONUS(dn->dn_phys),
+ DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
@@ -3290,6 +3408,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
+ /*
+ * If this is a dnode block, ensure it is appropriately encrypted
+ * or decrypted, depending on what we are writing to it this txg.
+ */
+ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+ dbuf_check_crypt(dr);
+
if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
@@ -3307,16 +3432,27 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* DNONE_DNODE blocks).
*/
int psize = arc_buf_size(*datap);
+ int lsize = arc_buf_lsize(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
enum zio_compress compress_type = arc_get_compression(*datap);
- if (compress_type == ZIO_COMPRESS_OFF) {
- *datap = arc_alloc_buf(os->os_spa, db, type, psize);
- } else {
+ if (arc_is_encrypted(*datap)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+ *datap = arc_alloc_raw_buf(os->os_spa, db,
+ dmu_objset_id(os), byteorder, salt, iv, mac,
+ dn->dn_type, psize, lsize, compress_type);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
int lsize = arc_buf_lsize(*datap);
*datap = arc_alloc_compressed_buf(os->os_spa, db,
psize, lsize, compress_type);
+ } else {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
bcopy(db->db.db_data, (*datap)->b_data, psize);
}
@@ -3453,7 +3589,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
if (!BP_IS_EMBEDDED(bp))
- bp->blk_fill = fill;
+ BP_SET_FILL(bp, fill);
mutex_exit(&db->db_mtx);
@@ -3778,7 +3914,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
- arc_done_func_t *children_ready_cb = NULL;
+ arc_write_done_func_t *children_ready_cb = NULL;
ASSERT(arc_released(data));
/*
@@ -3810,6 +3946,7 @@ EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_will_change_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);