author     Alexander Motin <[email protected]>      2014-07-18 08:53:38 -0800
committer  Brian Behlendorf <[email protected]>  2015-07-06 09:34:13 -0700
commit     e16b3fcc610fab2dcf3381486b2640dc2a2213cb (patch)
tree       8571cd7ae1db3137b7f36ae93ff07f447a16fecc /module/zfs
parent     4bda3bd0e72d582a785b6552ce16b99e04414fbe (diff)
Illumos 5008 - lock contention (rrw_exit) while running a read only load
5008 lock contention (rrw_exit) while running a read only load
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Alex Reece <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Richard Yao <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
Porting notes:
This patch ported cleanly to ZoL. While testing 100% cached small-block
reads, extreme contention was observed on rrl->rr_lock from rrw_exit(),
due to the frequent entering and leaving of the ZPL; the entry/exit
pattern is sketched after the commit message. Illumos picked up this
patch from FreeBSD, and it helps under Linux as well.
On a 1-minute 4K cached-read test with 10 fio processes pinned to a single
socket of a 4-socket (10 threads per socket) NUMA system, contention on
rrl->rr_lock was reduced from 508799 events to 43085.
Ported-by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #3555
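
To make the contention source concrete, the two macros below paraphrase the
ZPL entry/exit bracketing that the porting notes refer to. This is an
illustration of the pattern only; the real ZFS_ENTER/ZFS_EXIT definitions
live in the ZoL headers, which are not part of this diff. Every ZPL
operation, including a fully cached read, takes z_teardown_lock as reader on
entry and drops it on exit, so with a single rrwlock every CPU hammered the
same rrl->rr_lock.

    /*
     * Illustrative paraphrase only -- not part of this diff.  Each ZPL
     * operation is bracketed like this, so before the change every
     * cached read serialized on the one rr_lock inside z_teardown_lock.
     */
    #define ZFS_ENTER(zsb)  rrm_enter_read(&(zsb)->z_teardown_lock, FTAG)
    #define ZFS_EXIT(zsb)   rrm_exit(&(zsb)->z_teardown_lock, FTAG)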
Diffstat (limited to 'module/zfs')
-rw-r--r--   module/zfs/rrwlock.c    | 88
-rw-r--r--   module/zfs/zfs_ioctl.c  |  6
-rw-r--r--   module/zfs/zfs_vfsops.c | 14
3 files changed, 98 insertions(+), 10 deletions(-)
diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c
index 29a22534e..51394c01c 100644
--- a/module/zfs/rrwlock.c
+++ b/module/zfs/rrwlock.c
@@ -305,3 +305,91 @@ rrw_tsd_destroy(void *arg)
 	    (void *)curthread, (void *)rn->rn_rrl);
 	}
 }
+
+/*
+ * A reader-mostly lock implementation, layered above the reader-writer
+ * locks for highly parallel read acquisitions, while pessimizing writes.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader can lock only one of them for read, depending on the result
+ * of a simple hash function.  That proportionally reduces lock congestion.
+ * The writer meanwhile has to sequentially acquire write on all the locks.
+ * That makes write acquisition proportionally slower, but in places where
+ * it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around the functions above.
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+	if (rw == RW_READER)
+		rrm_enter_read(rrl, tag);
+	else
+		rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock.  Note that the lock
+ * must be released by the same thread that acquired it.  We do this
+ * mapping by taking the thread pointer mod a prime number.  We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define	RRM_TD_LOCK()	(((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+	rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+	int i;
+
+	if (rrl->locks[0].rr_writer == curthread) {
+		for (i = 0; i < RRM_NUM_LOCKS; i++)
+			rrw_exit(&rrl->locks[i], tag);
+	} else {
+		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+	}
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+	if (rw == RW_WRITER) {
+		return (rrw_held(&rrl->locks[0], rw));
+	} else {
+		return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+	}
+}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index c44927036..d997616ae 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1451,7 +1451,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
 	if (get_zfs_sb(name, zsbp) != 0)
 		error = zfs_sb_create(name, zsbp);
 	if (error == 0) {
-		rrw_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
+		rrm_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
 		    RW_READER, tag);
 		if ((*zsbp)->z_unmounted) {
 			/*
@@ -1459,7 +1459,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
 			 * thread should be just about to disassociate the
 			 * objset from the zsb.
 			 */
-			rrw_exit(&(*zsbp)->z_teardown_lock, tag);
+			rrm_exit(&(*zsbp)->z_teardown_lock, tag);
 			return (SET_ERROR(EBUSY));
 		}
 	}
@@ -1469,7 +1469,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
 static void
 zfs_sb_rele(zfs_sb_t *zsb, void *tag)
 {
-	rrw_exit(&zsb->z_teardown_lock, tag);
+	rrm_exit(&zsb->z_teardown_lock, tag);
 
 	if (zsb->z_sb) {
 		deactivate_super(zsb->z_sb);
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index ae1bc324b..a7005a2a1 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -771,7 +771,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
 	mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zsb->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
-	rrw_init(&zsb->z_teardown_lock, B_FALSE);
+	rrm_init(&zsb->z_teardown_lock, B_FALSE);
 	rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 
@@ -890,7 +890,7 @@ zfs_sb_free(zfs_sb_t *zsb)
 	mutex_destroy(&zsb->z_znodes_lock);
 	mutex_destroy(&zsb->z_lock);
 	list_destroy(&zsb->z_all_znodes);
-	rrw_destroy(&zsb->z_teardown_lock);
+	rrm_destroy(&zsb->z_teardown_lock);
 	rw_destroy(&zsb->z_teardown_inactive_lock);
 	rw_destroy(&zsb->z_fuid_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1221,7 +1221,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 		}
 	}
 
-	rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
+	rrm_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
@@ -1252,7 +1252,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 	 */
 	if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) {
 		rw_exit(&zsb->z_teardown_inactive_lock);
-		rrw_exit(&zsb->z_teardown_lock, FTAG);
+		rrm_exit(&zsb->z_teardown_lock, FTAG);
 		return (SET_ERROR(EIO));
 	}
 
@@ -1280,7 +1280,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 	 */
 	if (unmounting) {
 		zsb->z_unmounted = B_TRUE;
-		rrw_exit(&zsb->z_teardown_lock, FTAG);
+		rrm_exit(&zsb->z_teardown_lock, FTAG);
 		rw_exit(&zsb->z_teardown_inactive_lock);
 	}
 
@@ -1599,7 +1599,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
 	znode_t *zp;
 	uint64_t sa_obj = 0;
 
-	ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
+	ASSERT(RRM_WRITE_HELD(&zsb->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
 
 	/*
@@ -1663,7 +1663,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
 bail:
 	/* release the VFS ops */
 	rw_exit(&zsb->z_teardown_inactive_lock);
-	rrw_exit(&zsb->z_teardown_lock, FTAG);
+	rrm_exit(&zsb->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
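
For readers who want to experiment with the technique outside the kernel,
here is a minimal user-space sketch of the same striped reader-mostly lock,
using POSIX rwlocks in place of rrwlock_t. The names mirror the patch, but
the stripe count (17 is assumed here; the real RRM_NUM_LOCKS lives in a
header this diff does not touch) and everything else are illustrative
assumptions, not the kernel code.

    /*
     * Minimal user-space sketch of a striped reader-mostly lock, under
     * the assumptions stated above.  Build with: cc -c -pthread sketch.c
     */
    #include <pthread.h>
    #include <stdint.h>

    #define RRM_NUM_LOCKS 17    /* assumed prime stripe count */

    typedef struct rrmlock {
            pthread_rwlock_t locks[RRM_NUM_LOCKS];
    } rrmlock_t;

    /*
     * Map the calling thread to one stripe.  As in the patch, the lock
     * must be released by the thread that acquired it, because the
     * stripe is recomputed from the thread id on exit.  (Assumes
     * pthread_t is an integral or pointer type, as on Linux.)
     */
    static unsigned
    rrm_td_lock(void)
    {
            return ((uint32_t)(uintptr_t)pthread_self() % RRM_NUM_LOCKS);
    }

    static void
    rrm_init(rrmlock_t *rrl)
    {
            for (int i = 0; i < RRM_NUM_LOCKS; i++)
                    pthread_rwlock_init(&rrl->locks[i], NULL);
    }

    static void
    rrm_destroy(rrmlock_t *rrl)
    {
            for (int i = 0; i < RRM_NUM_LOCKS; i++)
                    pthread_rwlock_destroy(&rrl->locks[i]);
    }

    /* A reader touches exactly one stripe, cutting contention ~N-fold. */
    static void
    rrm_enter_read(rrmlock_t *rrl)
    {
            pthread_rwlock_rdlock(&rrl->locks[rrm_td_lock()]);
    }

    static void
    rrm_exit_read(rrmlock_t *rrl)
    {
            pthread_rwlock_unlock(&rrl->locks[rrm_td_lock()]);
    }

    /* The writer sweeps every stripe in order, excluding all readers;
     * the fixed order also prevents writer-writer deadlock. */
    static void
    rrm_enter_write(rrmlock_t *rrl)
    {
            for (int i = 0; i < RRM_NUM_LOCKS; i++)
                    pthread_rwlock_wrlock(&rrl->locks[i]);
    }

    static void
    rrm_exit_write(rrmlock_t *rrl)
    {
            for (int i = 0; i < RRM_NUM_LOCKS; i++)
                    pthread_rwlock_unlock(&rrl->locks[i]);
    }

One design difference is worth noting: the kernel's rrm_exit() can tell a
writer from a reader by checking locks[0].rr_writer == curthread, so a single
exit routine suffices there. POSIX rwlocks do not expose the owning thread,
hence the separate read and write exit paths in this sketch. The prime stripe
count keeps the thread-pointer hash well distributed across stripes.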