author	Alex Reece <[email protected]>	2016-04-21 11:23:37 -0700
committer	Brian Behlendorf <[email protected]>	2016-04-27 16:24:15 -0700
commit	463a8cfe2b293934edd2ee79115b20c4598353d6
tree	cac82326668d5f9b7135234b0d3962228e5f4568
parent	8a5fc74880251aa651306d1dbcc60715698996f0
Illumos 6844 - dnode_next_offset can detect fictional holes
6844 dnode_next_offset can detect fictional holes
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
dnode_next_offset is used in a variety of places to iterate over the
holes or allocated blocks in a dnode. It operates under the premise that
it can iterate over the block pointers of a dnode in open context while
holding only the dn_struct_rwlock as reader. Unfortunately, this premise
does not hold.
When we create the zio for a dbuf, we pass in the actual block pointer
in the indirect block above that dbuf. When we later zero the bp in
zio_write_compress, we are directly modifying the bp. The state of the
bp is now inconsistent from the perspective of dnode_next_offset: the bp
will appear to be a hole until zio_dva_allocate finally finishes filling
it in. In the meantime, dnode_next_offset can detect a hole in the dnode
when none exists.
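As a rough illustration of the race (an invented, self-contained model, not ZFS code: the names blkptr_model_t, shared_bp, struct_lock, zio_thread, and scan_thread exist only in this sketch), the writer below scribbles on the shared "block pointer" in place without taking the lock, while the reader holds the lock as reader and can still observe the transient zeroed state:

/*
 * Toy model of the race: the writer (standing in for the zio pipeline)
 * zeroes the bp in place and only refills it later; the reader (standing
 * in for dnode_next_offset) holds the rwlock as reader, yet may still see
 * the zeroed bp and report a hole that never existed on disk.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint64_t dva; } blkptr_model_t;	/* stand-in for blkptr_t */

static pthread_rwlock_t struct_lock = PTHREAD_RWLOCK_INITIALIZER;
static blkptr_model_t shared_bp = { .dva = 0xc0ffee };	/* an allocated block */

static void *
zio_thread(void *arg)
{
	(void) arg;
	/* Like zio_write_compress(): scribble on the bp in place... */
	memset(&shared_bp, 0, sizeof (shared_bp));
	/* ...and only much later, like zio_dva_allocate(), fill it back in. */
	shared_bp.dva = 0xfeedface;
	return (NULL);
}

static void *
scan_thread(void *arg)
{
	(void) arg;
	pthread_rwlock_rdlock(&struct_lock);
	if (shared_bp.dva == 0)			/* models BP_IS_HOLE() */
		printf("fictional hole detected\n");
	pthread_rwlock_unlock(&struct_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t w, r;
	pthread_create(&w, NULL, zio_thread, NULL);
	pthread_create(&r, NULL, scan_thread, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return (0);
}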
I was able to experimentally demonstrate this behavior with the
following setup:
1. Create a file with 1 million dbufs.
2. Create a thread that randomly dirties L2 blocks by writing to the
first L0 block under them.
3. Observe dnode_next_offset, waiting for it to skip over a hole in the
middle of the file.
4. Run dnode_next_offset in a loop until it skips over such a non-existent
hole (a user-space approximation of this loop is sketched after the list).
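A hedged user-space approximation of steps 3 and 4, assuming the file sits on a ZFS dataset (where lseek(SEEK_HOLE) is ultimately serviced by dnode_next_offset); the original reproduction used ZFS internals directly, so this is only a sketch of the idea:

/*
 * A fully allocated file should report its first hole at EOF; a hole
 * reported before EOF would be the fictional hole described above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct stat st;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <fully-allocated file on ZFS>\n",
		    argv[0]);
		return (1);
	}
	if ((fd = open(argv[1], O_RDONLY)) == -1 || fstat(fd, &st) == -1) {
		perror(argv[1]);
		return (1);
	}

	for (;;) {
		off_t hole = lseek(fd, 0, SEEK_HOLE);

		if (hole == -1) {
			perror("lseek");
			return (1);
		}
		if (hole < st.st_size) {
			printf("fictional hole reported at offset %lld\n",
			    (long long)hole);
			return (0);
		}
	}
}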
The fix is to ensure that it is valid to iterate over the indirect
blocks in a dnode while holding the dn_struct_rwlock by passing the zio
a copy of the BP and updating the actual BP in dbuf_write_ready while
holding the lock.
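Continuing the toy model from above (again only a sketch of the idea, with the invented name zio_thread_fixed), the fixed writer works on a private copy and publishes it to the shared bp only while holding the lock as writer, so a reader holding the lock as reader never sees the transient zeroed state:

static void *
zio_thread_fixed(void *arg)
{
	(void) arg;
	blkptr_model_t copy = shared_bp;	/* like dr->dr_bp_copy = *db->db_blkptr */

	/* Compression and allocation now scribble only on the private copy. */
	memset(&copy, 0, sizeof (copy));
	copy.dva = 0xfeedface;

	/* Like dbuf_write_ready(): publish under the rwlock held as writer. */
	pthread_rwlock_wrlock(&struct_lock);
	shared_bp = copy;
	pthread_rwlock_unlock(&struct_lock);
	return (NULL);
}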
References:
https://www.illumos.org/issues/6844
https://github.com/openzfs/openzfs/pull/82
DLPX-35372
Ported-by: Brian Behlendorf <[email protected]>
Closes #4548
-rw-r--r-- | include/sys/dbuf.h |  3
-rw-r--r-- | module/zfs/dbuf.c  | 25
2 files changed, 23 insertions, 5 deletions
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 9147c9d4d..079358911 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -121,6 +121,9 @@ typedef struct dbuf_dirty_record {
 	/* How much space was changed to dsl_pool_dirty_space() for this? */
 	unsigned int dr_accounted;
 
+	/* A copy of the bp that points to us */
+	blkptr_t dr_bp_copy;
+
 	union dirty_types {
 		struct dirty_indirect {
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 3d3b76963..fd51d59a8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -3024,7 +3024,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	uint64_t fill = 0;
 	int i;
 
-	ASSERT3P(db->db_blkptr, ==, bp);
+	ASSERT3P(db->db_blkptr, !=, NULL);
+	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -3046,7 +3047,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
-		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		ASSERT(!(BP_IS_HOLE(bp)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
@@ -3087,6 +3088,10 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	*db->db_blkptr = *bp;
+	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
@@ -3265,6 +3270,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	zio_t *zio;
 	int wp_flag = 0;
 
+	ASSERT(dmu_tx_is_syncing(tx));
+
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
@@ -3323,6 +3330,14 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
+	/*
+	 * We copy the blkptr now (rather than when we instantiate the dirty
+	 * record), because its value can change between open context and
+	 * syncing context. We do not need to hold dn_struct_rwlock to read
+	 * db_blkptr because we are in syncing context.
+	 */
+	dr->dr_bp_copy = *db->db_blkptr;
+
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
@@ -3332,7 +3347,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		void *contents = (data != NULL) ? data->b_data : NULL;
 
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, contents, db->db.db_size, &zp,
+		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
@@ -3343,14 +3358,14 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, NULL, db->db.db_size, &zp,
+		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
+		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp,
 		    dbuf_write_ready, dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);