path: root/module/zfs/dnode_sync.c
author	Paul Dagnelie <[email protected]>	2019-07-08 13:18:50 -0700
committer	Brian Behlendorf <[email protected]>	2019-07-08 13:18:50 -0700
commit	f664f1ee7fc9dd7101171f6518c67951cb5bd8cf (patch)
tree	790db3e84a6edf3bd67b1869cd81f3b312d06ba7 /module/zfs/dnode_sync.c
parent	cb709642216b5ac9be10039471c3c4bc27cb7cf2 (diff)
Decrease contention on dn_struct_rwlock
Currently, sequential async write workloads spend a lot of time contending on the dn_struct_rwlock. This lock is responsible for protecting the entire block tree below it; this naturally results in some serialization during heavy write workloads. This can be resolved by having per-dbuf locking, which will allow multiple writers in the same object at the same time.

We introduce a new rwlock, the db_rwlock. This lock is responsible for protecting the contents of the dbuf that it is a part of; when reading a block pointer from a dbuf, you hold the lock as a reader. When writing data to a dbuf, you hold it as a writer. This allows multiple threads to write to different parts of a file at the same time.

Reviewed by: Brad Lewis <[email protected]>
Reviewed by: Matt Ahrens [email protected]
Reviewed by: George Wilson [email protected]
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
External-issue: DLPX-52564
External-issue: DLPX-53085
External-issue: DLPX-57384
Closes #8946
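To illustrate the reader/writer discipline the message describes, here is a minimal user-space sketch using POSIX rwlocks. It is not ZFS code: the toy_dbuf type and its toy_dbuf_* functions are hypothetical stand-ins for dmu_buf_impl_t and its db_rwlock, showing a block-pointer read taken as a reader and a data write taken as a writer, so readers of different buffers (or of the same buffer) never serialize against each other.

    #include <pthread.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for dmu_buf_impl_t: contents plus a per-buffer rwlock. */
    typedef struct toy_dbuf {
            pthread_rwlock_t db_rwlock;     /* protects db_blkptr and db_data */
            uint64_t         db_blkptr;     /* stand-in for a block pointer */
            char             db_data[512];  /* buffer contents */
    } toy_dbuf_t;

    static void
    toy_dbuf_init(toy_dbuf_t *db)
    {
            (void) pthread_rwlock_init(&db->db_rwlock, NULL);
            db->db_blkptr = 0;
            (void) memset(db->db_data, 0, sizeof (db->db_data));
    }

    /* Reading a block pointer: hold the buffer's rwlock as a reader. */
    static uint64_t
    toy_dbuf_read_blkptr(toy_dbuf_t *db)
    {
            (void) pthread_rwlock_rdlock(&db->db_rwlock);
            uint64_t bp = db->db_blkptr;
            (void) pthread_rwlock_unlock(&db->db_rwlock);
            return (bp);
    }

    /* Writing data into the buffer: hold the buffer's rwlock as a writer. */
    static void
    toy_dbuf_write(toy_dbuf_t *db, const void *buf, size_t len, size_t off)
    {
            (void) pthread_rwlock_wrlock(&db->db_rwlock);
            (void) memcpy(db->db_data + off, buf, len);
            (void) pthread_rwlock_unlock(&db->db_rwlock);
    }

Because each buffer carries its own lock, two threads writing to dbufs in different parts of the same object only contend when they touch the same buffer, which is the contention reduction the commit is after.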
Diffstat (limited to 'module/zfs/dnode_sync.c')
-rw-r--r--	module/zfs/dnode_sync.c	35
1 file changed, 29 insertions, 6 deletions
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index d3acf1baa..4e002d326 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -51,7 +51,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
/* this dnode can't be paged out because it's dirty */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
@@ -61,8 +60,24 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
dn->dn_object, dn->dn_phys->dn_nlevels);
+ /*
+ * Lock ordering requires that we hold the children's db_mutexes (by
+ * calling dbuf_find()) before holding the parent's db_rwlock. The lock
+ * order is imposed by dbuf_read's steps of "grab the lock to protect
+ * db_parent, get db_parent, hold db_parent's db_rwlock".
+ */
+ dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
+ ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
+ for (i = 0; i < nblkptr; i++) {
+ children[i] =
+ dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ }
+
/* transfer dnode's block pointers to new indirect block */
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ if (dn->dn_dbuf != NULL)
+ rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+ rw_enter(&db->db_rwlock, RW_WRITER);
ASSERT(db->db.db_data);
ASSERT(arc_released(db->db_buf));
ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
@@ -72,8 +87,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
/* set dbuf's parent pointers to new indirect buf */
for (i = 0; i < nblkptr; i++) {
- dmu_buf_impl_t *child =
- dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ dmu_buf_impl_t *child = children[i];
if (child == NULL)
continue;
@@ -106,6 +120,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+ rw_exit(&db->db_rwlock);
+ if (dn->dn_dbuf != NULL)
+ rw_exit(&dn->dn_dbuf->db_rwlock);
+
dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
@@ -182,7 +200,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
ASSERT(db->db_level == 1);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, db->db_level-1,
+ err = dbuf_hold_impl(dn, db->db_level - 1,
(db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
@@ -280,7 +298,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
* ancestor of the first or last block to be freed. The first and
* last L1 indirect blocks are always dirtied by dnode_free_range().
*/
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
dbuf_release_bp(db);
bp = db->db.db_data;
@@ -306,7 +326,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ free_blocks(dn, bp, end - start + 1, tx);
+ rw_exit(&db->db_rwlock);
} else {
for (uint64_t id = start; id <= end; id++, bp++) {
if (BP_IS_HOLE(bp))
@@ -323,10 +345,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
}
if (free_indirects) {
+ rw_enter(&db->db_rwlock, RW_WRITER);
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
free_blocks(dn, db->db_blkptr, 1, tx);
+ rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);
@@ -378,7 +402,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
-
free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
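
The lock-ordering comment added in dnode_increase_indirection() above (take the children's db_mutexes, inside dbuf_find(), before entering the parent's db_rwlock) can likewise be sketched in user space. This is only an analogy under assumed names (child_mutex, parent_rwlock, update_parent_and_children), not the kernel implementation:

    #include <pthread.h>

    #define NCHILDREN 3

    /* Analogs of the children's db_mtx and the parent's db_rwlock. */
    static pthread_mutex_t child_mutex[NCHILDREN] = {
            PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_MUTEX_INITIALIZER
    };
    static pthread_rwlock_t parent_rwlock = PTHREAD_RWLOCK_INITIALIZER;

    /*
     * Respect the ordering "child locks before parent rwlock": take every
     * child's mutex first, then the parent's rwlock as a writer, mirroring
     * how the patch looks up each child with dbuf_find() before calling
     * rw_enter() on the parent dbuf's db_rwlock.
     */
    static void
    update_parent_and_children(void)
    {
            for (int i = 0; i < NCHILDREN; i++)
                    (void) pthread_mutex_lock(&child_mutex[i]);
            (void) pthread_rwlock_wrlock(&parent_rwlock);

            /* ... re-parent the children and update the parent's contents ... */

            (void) pthread_rwlock_unlock(&parent_rwlock);
            for (int i = NCHILDREN - 1; i >= 0; i--)
                    (void) pthread_mutex_unlock(&child_mutex[i]);
    }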