author     Alexander Motin <[email protected]>   2024-04-08 15:03:18 -0400
committer  GitHub <[email protected]>             2024-04-08 12:03:18 -0700
commit     eeca9a91d6866879f4d57b4d0644e5da951f3daa (patch)
tree       3b84a83e9f3b3be5666c6843a6314d33966df6db /module
parent     76d1dde94ca9cac03fa641b4cf9259d98a706e12 (diff)
Fix read errors race after block cloning
While investigating the read errors triggering the panic fixed in #16042, I found that we have a race in the sync process between the moment the dirty record for a cloned block is removed and the moment the dbuf is destroyed. If dmu_buf_hold_array_by_dnode() takes a hold on a cloned dbuf before it is synced/destroyed, then dbuf_read_impl() may see it still in the DB_NOFILL state, but without the dirty record. Such a case is not an error, but equivalent to DB_UNCACHED, since the dbuf block pointer has already been updated by dbuf_write_ready(). Unfortunately it is impossible to safely change the dbuf state to DB_UNCACHED there, since another cloning may already be in progress that dropped the dbuf lock before creating a new dirty record, protected only by the range lock.

Reviewed-by: Rob Norris <[email protected]>
Reviewed-by: Robert Evans <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #16052
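As a reading aid for the diff below, here is a small standalone model of the block-pointer selection dbuf_read_impl() performs after this change. The types (dbuf_model_t, dirty_record_t) and the pick_read_bp() helper are simplified stand-ins invented for this sketch, not the real OpenZFS structures; only the decision logic mirrors the patched module/zfs/dbuf.c: a DB_NOFILL dbuf with a block-clone (brtwrite) dirty record reads the clone source, a DB_NOFILL dbuf whose dirty record has already been synced away falls through to db_blkptr exactly like DB_UNCACHED, and any other dirty NOFILL buffer is an EIO.

    /*
     * Standalone model (not the actual OpenZFS code) of the block-pointer
     * selection that dbuf_read_impl() performs after this commit.  The
     * types and field names below are simplified stand-ins for the real
     * dbuf and dirty-record structures in module/zfs/dbuf.c.
     */
    #include <stdio.h>
    #include <stdbool.h>
    #include <errno.h>

    typedef struct blkptr { unsigned long long blk; } blkptr_t;

    typedef enum { DB_UNCACHED, DB_NOFILL, DB_CACHED } db_state_t;

    typedef struct dirty_record {
            bool            dr_brtwrite;            /* pending block clone? */
            blkptr_t        dr_overridden_by;       /* BP of the clone source */
    } dirty_record_t;

    typedef struct dbuf_model {
            db_state_t      db_state;
            blkptr_t        *db_blkptr;             /* on-disk BP, may be NULL */
            dirty_record_t  *db_dirty_head;         /* head dirty record, may be NULL */
    } dbuf_model_t;

    /*
     * Pick the block pointer to read from.  Returns 0 and sets *bpp
     * (possibly to NULL for a hole), or EIO for a dirty NOFILL buffer
     * that is not a block clone.
     */
    static int
    pick_read_bp(dbuf_model_t *db, blkptr_t *bp, blkptr_t **bpp)
    {
            *bpp = NULL;

            if (db->db_state == DB_NOFILL) {
                    dirty_record_t *dr = db->db_dirty_head;
                    if (dr != NULL) {
                            if (!dr->dr_brtwrite)
                                    return (EIO);
                            *bp = dr->dr_overridden_by;     /* read the clone source */
                            *bpp = bp;
                    }
                    /*
                     * dr == NULL: the clone's dirty record was already synced
                     * out, so fall through and use db_blkptr, exactly as if
                     * the buffer were DB_UNCACHED.
                     */
            }

            if (*bpp == NULL && db->db_blkptr != NULL) {
                    *bp = *db->db_blkptr;
                    *bpp = bp;
            }
            return (0);
    }

    int
    main(void)
    {
            blkptr_t on_disk = { .blk = 0x1234 };
            blkptr_t bp, *bpp;

            /* The racy case: DB_NOFILL, dirty record already gone. */
            dbuf_model_t db = {
                    .db_state = DB_NOFILL,
                    .db_blkptr = &on_disk,
                    .db_dirty_head = NULL,
            };
            int err = pick_read_bp(&db, &bp, &bpp);
            printf("err=%d bp=%#llx\n", err, bpp ? bpp->blk : 0ULL);
            return (0);
    }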
Diffstat (limited to 'module')
-rw-r--r--  module/zfs/dbuf.c  41
1 file changed, 20 insertions, 21 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index d43f84e84..8c42b116d 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1563,7 +1563,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
- blkptr_t bp, *bpp;
+ blkptr_t bp, *bpp = NULL;
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1577,29 +1577,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
goto early_unlock;
}
- if (db->db_state == DB_UNCACHED) {
- if (db->db_blkptr == NULL) {
- bpp = NULL;
- } else {
- bp = *db->db_blkptr;
+ /*
+ * If we have a pending block clone, we don't want to read the
+ * underlying block, but the content of the block being cloned,
+ * pointed by the dirty record, so we have the most recent data.
+ * If there is no dirty record, then we hit a race in a sync
+ * process when the dirty record is already removed, while the
+ * dbuf is not yet destroyed. Such case is equivalent to uncached.
+ */
+ if (db->db_state == DB_NOFILL) {
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+ if (dr != NULL) {
+ if (!dr->dt.dl.dr_brtwrite) {
+ err = EIO;
+ goto early_unlock;
+ }
+ bp = dr->dt.dl.dr_overridden_by;
bpp = &bp;
}
- } else {
- dbuf_dirty_record_t *dr;
-
- ASSERT3S(db->db_state, ==, DB_NOFILL);
+ }
- /*
- * Block cloning: If we have a pending block clone,
- * we don't want to read the underlying block, but the content
- * of the block being cloned, so we have the most recent data.
- */
- dr = list_head(&db->db_dirty_records);
- if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
- err = EIO;
- goto early_unlock;
- }
- bp = dr->dt.dl.dr_overridden_by;
+ if (bpp == NULL && db->db_blkptr != NULL) {
+ bp = *db->db_blkptr;
bpp = &bp;
}
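Because the leading diff markers and indentation of the context lines were lost in this listing, here is the second hunk as it reads once the patch is applied, reconstructed from the '+' and context lines above (indentation approximated; the first hunk's change, bpp starting out NULL, is assumed in scope):

            /*
             * If we have a pending block clone, we don't want to read the
             * underlying block, but the content of the block being cloned,
             * pointed by the dirty record, so we have the most recent data.
             * If there is no dirty record, then we hit a race in a sync
             * process when the dirty record is already removed, while the
             * dbuf is not yet destroyed. Such case is equivalent to uncached.
             */
            if (db->db_state == DB_NOFILL) {
                    dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
                    if (dr != NULL) {
                            if (!dr->dt.dl.dr_brtwrite) {
                                    err = EIO;
                                    goto early_unlock;
                            }
                            bp = dr->dt.dl.dr_overridden_by;
                            bpp = &bp;
                    }
            }

            if (bpp == NULL && db->db_blkptr != NULL) {
                    bp = *db->db_blkptr;
                    bpp = &bp;
            }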