diff options
author | Brian Behlendorf <[email protected]> | 2021-12-17 09:52:13 -0800 |
---|---|---|
committer | Tony Hutter <[email protected]> | 2021-12-22 09:31:58 -0800 |
commit | a4831fa023f07999570f1f200b1c856300315977 (patch) | |
tree | fdc49fd6b7666f050449f4de34e7596deef1bcef | |
parent | b327131a1fe5340cf3c2e8b46ff4d80029c636b7 (diff) |
Fix zvol_open() lock inversion
When restructuring the zvol_open() logic for the Linux 5.13 kernel
a lock inversion was accidentally introduced. In the updated code
the spa_namespace_lock is now taken before the zv_suspend_lock
allowing the following scenario to occur:
down_read <=== waiting for zv_suspend_lock
zvol_open <=== holds spa_namespace_lock
__blkdev_get
blkdev_get_by_dev
blkdev_open
...
mutex_lock <== waiting for spa_namespace_lock
spa_open_common
spa_open
dsl_pool_hold
dmu_objset_hold_flags
dmu_objset_hold
dsl_prop_get
dsl_prop_get_integer
zvol_create_minor
dmu_recv_end
zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
zfs_ioc_recv
...
This commit resolves the issue by moving the acquisition of the
spa_namespace_lock back to after the zv_suspend_lock which restores
the original ordering.
Additionally, as part of this change the error exit paths were
simplified where possible.
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Rich Ercolani <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #12863
-rw-r--r-- | module/os/linux/zfs/zvol_os.c | 121 |
1 files changed, 58 insertions, 63 deletions
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 25732f5e1..5f20d092a 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -461,8 +461,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0; - boolean_t drop_suspend = B_TRUE; - boolean_t drop_namespace = B_FALSE; + boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); @@ -482,7 +481,36 @@ retry: return (SET_ERROR(-ENXIO)); } - if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + mutex_enter(&zv->zv_state_lock); + /* + * Make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + } else { + drop_suspend = B_TRUE; + } + } else { + drop_suspend = B_TRUE; + } + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) @@ -507,84 +535,51 @@ retry: * the kernel so the only option is to return the error for * the caller to handle it. */ - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); #ifdef HAVE_BLKDEV_GET_ERESTARTSYS - schedule(); - return (SET_ERROR(-ERESTARTSYS)); -#else - if ((gethrtime() - start) > timeout) + schedule(); return (SET_ERROR(-ERESTARTSYS)); +#else + if ((gethrtime() - start) > timeout) + return (SET_ERROR(-ERESTARTSYS)); - schedule_timeout(MSEC_TO_TICK(10)); - goto retry; + schedule_timeout(MSEC_TO_TICK(10)); + goto retry; #endif - } else { - drop_namespace = B_TRUE; - } - } - - mutex_enter(&zv->zv_state_lock); - /* - * make sure zvol is not suspended during first open - * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock - */ - if (zv->zv_open_count == 0) { - if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { - mutex_exit(&zv->zv_state_lock); - rw_enter(&zv->zv_suspend_lock, RW_READER); - mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 0) { - rw_exit(&zv->zv_suspend_lock); - drop_suspend = B_FALSE; + } else { + drop_namespace = B_TRUE; } } - } else { - drop_suspend = B_FALSE; - } - rw_exit(&zvol_state_lock); - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - if (zv->zv_open_count == 0) { - ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); - if (error) - goto out_mutex; - } - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - error = -EROFS; - goto out_open_count; + if (drop_namespace) + mutex_exit(&spa_namespace_lock); } - zv->zv_open_count++; - - mutex_exit(&zv->zv_state_lock); - if (drop_namespace) - mutex_exit(&spa_namespace_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - - zfs_check_media_change(bdev); - - return (0); + if (error == 0) { + if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if (zv->zv_open_count == 0) + zvol_last_close(zv); -out_open_count: - if (zv->zv_open_count == 0) - zvol_last_close(zv); + error = SET_ERROR(-EROFS); + } else { + zv->zv_open_count++; + } + } -out_mutex: mutex_exit(&zv->zv_state_lock); - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(error)); + if (error == 0) + zfs_check_media_change(bdev); + + return (error); } static void |