diff options
author | Brian Behlendorf <[email protected]> | 2021-12-01 16:07:12 -0800 |
---|---|---|
committer | GitHub <[email protected]> | 2021-12-01 17:07:12 -0700 |
commit | 77e2756de08ccd4e8ce755f78bdcda1e3e0c55e5 (patch) | |
tree | 230b1de25c7af90496defdd1c5914f75223a8c38 /module/os/linux/zfs/zvol_os.c | |
parent | 31d2f42b2a55485b017f428b8f7c1836750ecde9 (diff) |
Linux 5.13 compat: retry zvol_open() when contended
Due to a possible lock inversion the zvol open call path on Linux
needs to be able to retry in the case where the spa_namespace_lock
cannot be acquired.
For Linux 5.12 an older kernel this was accomplished by returning
-ERESTARTSYS from zvol_open() to request that blkdev_get() drop
the bdev->bd_mutex lock, reaquire it, then call the open callback
again. However, as of the 5.13 kernel this behavior was removed.
Therefore, for 5.12 and older kernels we preserved the existing
retry logic, but for 5.13 and newer kernels we retry internally in
zvol_open(). This should always succeed except in the case where
a pool's vdev are layed on zvols, in which case it may fail. To
handle this case vdev_disk_open() has been updated to retry when
opening a device when -ERESTARTSYS is returned.
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Tony Nguyen <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #12301
Closes #12759
Diffstat (limited to 'module/os/linux/zfs/zvol_os.c')
-rw-r--r-- | module/os/linux/zfs/zvol_os.c | 59 |
1 files changed, 55 insertions, 4 deletions
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c17423426..75a31b38d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; unsigned int zvol_threads = 32; +unsigned int zvol_open_timeout_ms = 1000; struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ @@ -490,7 +491,13 @@ zvol_open(struct block_device *bdev, fmode_t flag) zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_TRUE; + boolean_t drop_namespace = B_FALSE; +#ifndef HAVE_BLKDEV_GET_ERESTARTSYS + hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); + hrtime_t start = gethrtime(); +retry: +#endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make @@ -504,6 +511,49 @@ zvol_open(struct block_device *bdev, fmode_t flag) return (SET_ERROR(-ENXIO)); } + if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + /* + * In all other call paths the spa_namespace_lock is taken + * before the bdev->bd_mutex lock. However, on open(2) + * the __blkdev_get() function calls fops->open() with the + * bdev->bd_mutex lock held. This can result in a deadlock + * when zvols from one pool are used as vdevs in another. + * + * To prevent a lock inversion deadlock we preemptively + * take the spa_namespace_lock. Normally the lock will not + * be contended and this is safe because spa_open_common() + * handles the case where the caller already holds the + * spa_namespace_lock. + * + * When the lock cannot be aquired after multiple retries + * this must be the vdev on zvol deadlock case and we have + * no choice but to return an error. For 5.12 and older + * kernels returning -ERESTARTSYS will result in the + * bdev->bd_mutex being dropped, then reacquired, and + * fops->open() being called again. This process can be + * repeated safely until both locks are acquired. For 5.13 + * and newer the -ERESTARTSYS retry logic was removed from + * the kernel so the only option is to return the error for + * the caller to handle it. + */ + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + +#ifdef HAVE_BLKDEV_GET_ERESTARTSYS + schedule(); + return (SET_ERROR(-ERESTARTSYS)); +#else + if ((gethrtime() - start) > timeout) + return (SET_ERROR(-ERESTARTSYS)); + + schedule_timeout(MSEC_TO_TICK(10)); + goto retry; +#endif + } else { + drop_namespace = B_TRUE; + } + } + mutex_enter(&zv->zv_state_lock); /* * make sure zvol is not suspended during first open @@ -543,6 +593,8 @@ zvol_open(struct block_device *bdev, fmode_t flag) zv->zv_open_count++; mutex_exit(&zv->zv_state_lock); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); @@ -556,12 +608,11 @@ out_open_count: out_mutex: mutex_exit(&zv->zv_state_lock); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - if (error == -EINTR) { - error = -ERESTARTSYS; - schedule(); - } + return (SET_ERROR(error)); } |