diff options
Diffstat (limited to 'module/zfs/dbuf.c')
-rw-r--r-- | module/zfs/dbuf.c | 209 |
1 files changed, 173 insertions, 36 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 625e06701..745715861 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -964,7 +964,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) } static void -dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; @@ -984,7 +984,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (zio == NULL || zio->io_error == 0) { + } else if (err == 0) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { @@ -1003,7 +1003,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; - int err; + int err, zio_flags = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1021,6 +1021,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ? + dn->dn_dbuf->db_buf : NULL; + + /* if the underlying dnode block is encrypted, decrypt it */ + if (dn_buf != NULL && dn->dn_objset->os_encrypted && + DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) && + (flags & DB_RF_NO_DECRYPT) == 0 && + arc_is_encrypted(dn_buf)) { + err = arc_untransform(dn_buf, dn->dn_objset->os_spa, + dmu_objset_id(dn->dn_objset), B_TRUE); + if (err != 0) { + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + return (err); + } + } ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); @@ -1088,11 +1104,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); + /* + * All bps of an encrypted os should have the encryption bit set. + * If this is not true it indicates tampering and we report an error. + */ + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { + spa_log_error(db->db_objset->os_spa, &zb); + zfs_panic_recover("unencrypted block in encrypted " + "object set %llu", dmu_objset_id(db->db_objset)); + return (SET_ERROR(EIO)); + } + dbuf_add_ref(db, NULL); + zio_flags = (flags & DB_RF_CANFAIL) ? + ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; + + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + zio_flags |= ZIO_FLAG_RAW; + err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, - dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); return (err); @@ -1141,18 +1173,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); - if (compress_type == ZIO_COMPRESS_OFF) { - dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); - } else { + if (arc_is_encrypted(db->db_buf)) { + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + + arc_get_raw_params(db->db_buf, &byteorder, salt, + iv, mac); + dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db, + dmu_objset_id(dn->dn_objset), byteorder, salt, iv, + mac, dn->dn_type, size, arc_buf_lsize(db->db_buf), + compress_type); + } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type); + } else { + dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { @@ -1188,16 +1233,21 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { + spa_t *spa = dn->dn_objset->os_spa; + /* - * If the arc buf is compressed, we need to decompress it to - * read the data. This could happen during the "zfs receive" of - * a stream which is compressed and deduplicated. + * If the arc buf is compressed or encrypted, we need to + * untransform it to read the data. This could happen during + * the "zfs receive" of a stream which is deduplicated and + * either raw or compressed. We do not need to do this if the + * caller wants raw encrypted data. */ - if (db->db_buf != NULL && - arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { - dbuf_fix_old_data(db, - spa_syncing_txg(dmu_objset_spa(db->db_objset))); - err = arc_decompress(db->db_buf); + if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 && + (arc_is_encrypted(db->db_buf) || + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + dbuf_fix_old_data(db, spa_syncing_txg(spa)); + err = arc_untransform(db->db_buf, spa, + dmu_objset_id(db->db_objset), B_FALSE); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); @@ -1316,6 +1366,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_raw = B_FALSE; /* * Release the already-written buffer, so we leave it in @@ -1908,11 +1959,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); } -void -dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +static void +dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; dbuf_dirty_record_t *dr; ASSERT(tx->tx_txg != 0); @@ -1944,13 +1994,20 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; + flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); - (void) dbuf_read(db, NULL, rf); + (void) dbuf_read(db, NULL, flags); (void) dbuf_dirty(db, tx); } void +dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_will_dirty_impl(db_fake, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); +} + +void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -1977,6 +2034,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); } +/* + * This function is effectively the same as dmu_buf_will_dirty(), but + * indicates the caller expects raw encrypted data in the db. It will + * also set the raw flag on the created dirty record. + */ +void +dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; + + dmu_buf_will_dirty_impl(db_fake, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); + + dr = db->db_last_dirty; + while (dr != NULL && dr->dr_txg > tx->tx_txg) + dr = dr->dr_next; + + ASSERT3P(dr, !=, NULL); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dr->dt.dl.dr_raw = B_TRUE; +} + #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void @@ -2117,10 +2197,11 @@ dbuf_destroy(dmu_buf_impl_t *db) if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - ASSERT(db->db.db_data != NULL); - kmem_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - db->db_state = DB_UNCACHED; + if (db->db.db_data != NULL) { + kmem_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); + db->db_state = DB_UNCACHED; + } } dbuf_clear_data(db); @@ -2416,7 +2497,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) * prefetch if the next block down is our target. */ static void -dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; uint64_t nextblkid; @@ -2438,7 +2519,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - if (zio->io_flags & ZIO_FLAG_RAW) { + if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); @@ -2463,7 +2544,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { + if (BP_IS_HOLE(bp) || err != 0) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); @@ -2491,7 +2572,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to - * complete. + * complete. Note that the prefetch might fail if the dataset is encrypted and + * the encryption key is unmapped before the IO completes. */ void dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, @@ -3121,6 +3203,41 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) } /* + * Ensure the dbuf's data is untransformed if the associated dirty + * record requires it. This is used by dbuf_sync_leaf() to ensure + * that a dnode block is decrypted before we write new data to it. + * For raw writes we assert that the buffer is already encrypted. + */ +static void +dbuf_check_crypt(dbuf_dirty_record_t *dr) +{ + int err; + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!dr->dt.dl.dr_raw && arc_is_encrypted(db->db_buf)) { + /* + * Unfortunately, there is currently no mechanism for + * syncing context to handle decryption errors. An error + * here is only possible if an attacker maliciously + * changed a dnode block and updated the associated + * checksums going up the block tree. + */ + err = arc_untransform(db->db_buf, db->db_objset->os_spa, + dmu_objset_id(db->db_objset), B_TRUE); + if (err) + panic("Invalid dnode block MAC"); + } else if (dr->dt.dl.dr_raw) { + /* + * Writing raw encrypted data requires the db's arc buffer + * to be converted to raw by the caller. + */ + ASSERT(arc_is_encrypted(db->db_buf)); + } +} + +/* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. @@ -3241,9 +3358,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(*datap != NULL); ASSERT0(db->db_level); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, + ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + bcopy(*datap, DN_BONUS(dn->dn_phys), + DN_MAX_BONUS_LEN(dn->dn_phys)); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { @@ -3290,6 +3408,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } + /* + * If this is a dnode block, ensure it is appropriately encrypted + * or decrypted, depending on what we are writing to it this txg. + */ + if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) + dbuf_check_crypt(dr); + if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && @@ -3307,16 +3432,27 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) * DNONE_DNODE blocks). */ int psize = arc_buf_size(*datap); + int lsize = arc_buf_lsize(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); - if (compress_type == ZIO_COMPRESS_OFF) { - *datap = arc_alloc_buf(os->os_spa, db, type, psize); - } else { + if (arc_is_encrypted(*datap)) { + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + + arc_get_raw_params(*datap, &byteorder, salt, iv, mac); + *datap = arc_alloc_raw_buf(os->os_spa, db, + dmu_objset_id(os), byteorder, salt, iv, mac, + dn->dn_type, psize, lsize, compress_type); + } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); int lsize = arc_buf_lsize(*datap); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type); + } else { + *datap = arc_alloc_buf(os->os_spa, db, type, psize); } bcopy(db->db.db_data, (*datap)->b_data, psize); } @@ -3453,7 +3589,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) - bp->blk_fill = fill; + BP_SET_FILL(bp, fill); mutex_exit(&db->db_mtx); @@ -3778,7 +3914,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { - arc_done_func_t *children_ready_cb = NULL; + arc_write_done_func_t *children_ready_cb = NULL; ASSERT(arc_released(data)); /* @@ -3810,6 +3946,7 @@ EXPORT_SYMBOL(dbuf_free_range); EXPORT_SYMBOL(dbuf_new_size); EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); +EXPORT_SYMBOL(dmu_buf_will_change_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); |