summary | refs | log | tree | commit | diff | stats
path: root/module/zfs/dnode.c
diff options
context:
space:
mode:
author: Paul Dagnelie <[email protected]> 2019-07-08 13:18:50 -0700
committer: Brian Behlendorf <[email protected]> 2019-07-08 13:18:50 -0700
commitf664f1ee7fc9dd7101171f6518c67951cb5bd8cf (patch)
tree790db3e84a6edf3bd67b1869cd81f3b312d06ba7 /module/zfs/dnode.c
parentcb709642216b5ac9be10039471c3c4bc27cb7cf2 (diff)
Decrease contention on dn_struct_rwlock
Currently, sequential async write workloads spend a lot of time contending on the dn_struct_rwlock. This lock is responsible for protecting the entire block tree below it; this naturally results in some serialization during heavy write workloads. This can be resolved by having per-dbuf locking, which will allow multiple writers in the same object at the same time.

We introduce a new rwlock, the db_rwlock. This lock is responsible for protecting the contents of the dbuf that it is a part of; when reading a block pointer from a dbuf, you hold the lock as a reader. When writing data to a dbuf, you hold it as a writer. This allows multiple threads to write to different parts of a file at the same time.

Reviewed by: Brad Lewis <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
External-issue: DLPX-52564
External-issue: DLPX-53085
External-issue: DLPX-57384
Closes #8946
Diffstat (limited to 'module/zfs/dnode.c')
-rw-r--r--  module/zfs/dnode.c  73
1 file changed, 46 insertions(+), 27 deletions(-)
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index c06f614e1..4d654e9e7 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1331,7 +1331,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
}
blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
-
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
@@ -1742,10 +1741,11 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
/* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
- if (err == 0)
+ if (err == 0) {
dbuf_new_size(db, size, tx);
- else if (err != ENOENT)
+ } else if (err != ENOENT) {
goto fail;
+ }
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
@@ -1983,7 +1983,6 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
int trunc = FALSE;
int epbs;
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
blksz = dn->dn_datablksz;
blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -2000,7 +1999,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
head = P2NPHASE(off, blksz);
blkoff = P2PHASE(off, blksz);
if ((off >> blkshift) > dn->dn_maxblkid)
- goto out;
+ return;
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
@@ -2009,12 +2008,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
*/
blkid = 0;
nblks = 1;
- if (dn->dn_nlevels > 1)
+ if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_dirty_l1(dn, 0, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ }
goto done;
} else if (off >= blksz) {
/* Freeing past end-of-data */
- goto out;
+ return;
} else {
/* Freeing part of the block. */
head = blksz - off;
@@ -2024,19 +2026,26 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
}
/* zero out any partial block data at the start of the range */
if (head) {
+ int res;
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
- TRUE, FALSE, FTAG, &db) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
caddr_t data;
+ boolean_t dirty;
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
/* don't dirty if it isn't on disk and isn't dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
+ dirty = db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ if (dirty) {
dmu_buf_will_dirty(&db->db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
data = db->db.db_data;
bzero(data + blkoff, head);
}
@@ -2048,11 +2057,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
/* If the range was less than one block, we're done */
if (len == 0)
- goto out;
+ return;
/* If the remaining range is past end of file, we're done */
if ((off >> blkshift) > dn->dn_maxblkid)
- goto out;
+ return;
ASSERT(ISP2(blksz));
if (trunc)
@@ -2063,16 +2072,23 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
ASSERT0(P2PHASE(off, blksz));
/* zero out any partial block data at the end of the range */
if (tail) {
+ int res;
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
- TRUE, FALSE, FTAG, &db) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ boolean_t dirty;
/* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
+ db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
+ dirty = db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, type, FTAG);
+ if (dirty) {
dmu_buf_will_dirty(&db->db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
dbuf_rele(db, FTAG);
@@ -2082,7 +2098,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
/* If the range did not include a full block, we are done */
if (len == 0)
- goto out;
+ return;
ASSERT(IS_P2ALIGNED(off, blksz));
ASSERT(trunc || IS_P2ALIGNED(len, blksz));
@@ -2112,6 +2128,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
* amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
uint64_t first, last;
first = blkid >> epbs;
@@ -2156,6 +2173,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
dnode_dirty_l1(dn, i, tx);
}
+ rw_exit(&dn->dn_struct_rwlock);
}
done:
@@ -2178,9 +2196,6 @@ done:
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
-out:
-
- rw_exit(&dn->dn_struct_rwlock);
}
static boolean_t
@@ -2289,6 +2304,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
boolean_t hole;
int i, inc, error, span;
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
ASSERT(txg == 0 || !hole);
@@ -2321,9 +2338,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
return (error);
}
data = db->db.db_data;
+ rw_enter(&db->db_rwlock, RW_READER);
}
-
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
db->db_blkptr->blk_birth <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
@@ -2396,8 +2413,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
error = SET_ERROR(ESRCH);
}
- if (db)
+ if (db != NULL) {
+ rw_exit(&db->db_rwlock);
dbuf_rele(db, FTAG);
+ }
return (error);
}