diff options
author | Paul Dagnelie <[email protected]> | 2019-07-08 13:18:50 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2019-07-08 13:18:50 -0700 |
commit | f664f1ee7fc9dd7101171f6518c67951cb5bd8cf (patch) | |
tree | 790db3e84a6edf3bd67b1869cd81f3b312d06ba7 /module/zfs/dnode_sync.c | |
parent | cb709642216b5ac9be10039471c3c4bc27cb7cf2 (diff) |
Decrease contention on dn_struct_rwlock
Currently, sequential async write workloads spend a lot of time
contending on the dn_struct_rwlock. This lock is responsible for
protecting the entire block tree below it; this naturally results
in some serialization during heavy write workloads. This can be
resolved by having per-dbuf locking, which will allow multiple
writers in the same object at the same time.
We introduce a new rwlock, the db_rwlock. This lock is responsible
for protecting the contents of the dbuf that it is a part of; when
reading a block pointer from a dbuf, you hold the lock as a reader.
When writing data to a dbuf, you hold it as a writer. This allows
multiple threads to write to different parts of a file at the same
time.
Reviewed by: Brad Lewis <[email protected]>
Reviewed by: Matt Ahrens [email protected]
Reviewed by: George Wilson [email protected]
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
External-issue: DLPX-52564
External-issue: DLPX-53085
External-issue: DLPX-57384
Closes #8946
Diffstat (limited to 'module/zfs/dnode_sync.c')
-rw-r--r-- | module/zfs/dnode_sync.c | 35 |
1 files changed, 29 insertions, 6 deletions
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index d3acf1baa..4e002d326 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -51,7 +51,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); @@ -61,8 +60,24 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); + /* + * Lock ordering requires that we hold the children's db_mutexes (by + * calling dbuf_find()) before holding the parent's db_rwlock. The lock + * order is imposed by dbuf_read's steps of "grab the lock to protect + * db_parent, get db_parent, hold db_parent's db_rwlock". + */ + dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; + ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); + for (i = 0; i < nblkptr; i++) { + children[i] = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + } + /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + if (dn->dn_dbuf != NULL) + rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); @@ -72,8 +87,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + dmu_buf_impl_t *child = children[i]; if (child == NULL) continue; @@ -106,6 +120,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + rw_exit(&db->db_rwlock); + if (dn->dn_dbuf != NULL) + rw_exit(&dn->dn_dbuf->db_rwlock); + dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); @@ -182,7 +200,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, + err = dbuf_hold_impl(dn, db->db_level - 1, (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) @@ -280,7 +298,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, * ancestor of the first or last block to be freed. The first and * last L1 indirect blocks are always dirtied by dnode_free_range(). */ + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + dmu_buf_unlock_parent(db, dblt, FTAG); dbuf_release_bp(db); bp = db->db.db_data; @@ -306,7 +326,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + rw_enter(&db->db_rwlock, RW_WRITER); + free_blocks(dn, bp, end - start + 1, tx); + rw_exit(&db->db_rwlock); } else { for (uint64_t id = start; id <= end; id++, bp++) { if (BP_IS_HOLE(bp)) @@ -323,10 +345,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, } if (free_indirects) { + rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); @@ -378,7 +402,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } |