-rw-r--r--   config/kernel-blkdev.m4                   22
-rw-r--r--   module/os/linux/zfs/vdev_disk.c            8
-rw-r--r--   module/os/linux/zfs/zvol_os.c             59
-rw-r--r--   module/zfs/zvol.c                         38
-rwxr-xr-x   tests/test-runner/bin/zts-report.py.in     1
5 files changed, 90 insertions(+), 38 deletions(-)
diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 61e66421f..9c60e5dd4 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -294,6 +294,27 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [
])
])
+dnl #
+dnl # 5.13 API change
+dnl # blkdev_get_by_path() no longer handles ERESTARTSYS
+dnl #
+dnl # Unfortunately we're forced to rely solely on the kernel version
+dnl # number in order to determine the expected behavior. This was an
+dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
+ AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS])
+ AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1,
+ [blkdev_get_by_path() handles ERESTARTSYS])
+ ],[
+ AC_MSG_RESULT(no)
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
ZFS_AC_KERNEL_SRC_BLKDEV_PUT
@@ -318,4 +339,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE
ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
+ ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
])
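
Because the 5.13 change was internal to blkdev_get_by_dev() and altered no
exported signature, there is nothing a configure compile-test could probe;
the version compare above is the only reliable check. A minimal sketch of
how the resulting define might be consumed on the C side (the variable name
below is hypothetical, for illustration only):

/*
 * HAVE_BLKDEV_GET_ERESTARTSYS comes from the generated zfs_config.h and
 * is defined only when $LINUX_VERSION compares less than 5.13.0.
 */
#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
static const int zvol_kernel_retries_open = 1;	/* __blkdev_get() retries */
#else
static const int zvol_kernel_retries_open = 0;	/* we must retry ourselves */
#endif
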
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index fe9752a67..bcec4c6ba 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -265,6 +265,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
* an ENOENT failure at this point is highly likely to be transient
* and it is reasonable to sleep and retry before giving up. In
* practice delays have been observed to be on the order of 100ms.
+ *
+ * When ERESTARTSYS is returned it indicates the block device is
+ * a zvol which could not be opened due to the deadlock detection
+ * logic in zvol_open(). Extend the timeout and retry the open;
+ * subsequent attempts are expected to eventually succeed.
*/
hrtime_t start = gethrtime();
bdev = ERR_PTR(-ENXIO);
@@ -273,6 +278,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
zfs_vdev_holder);
if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
schedule_timeout(MSEC_TO_TICK(10));
+ } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
+ timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
+ continue;
} else if (IS_ERR(bdev)) {
break;
}
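
Taken together with the extended comment, vdev_disk_open() now runs a
deadline loop that treats the two soft errors differently: ENOENT sleeps
briefly and retries within the original window, while ERESTARTSYS (a
contended zvol) widens the window to ten times the base timeout. A
user-space sketch of that shape, assuming a hypothetical try_open()
helper in place of the real blkdev_get_by_path() wrapper:

#include <errno.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

#ifndef ERESTARTSYS
#define	ERESTARTSYS	512	/* kernel-internal errno; not in userspace headers */
#endif

/* Hypothetical stand-in for the real open; returns 0 or -errno. */
static int
try_open(const char *path)
{
	(void) path;
	return (0);	/* stub: pretend the open succeeds */
}

static int64_t
now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

int
open_with_deadline(const char *path, int64_t base_timeout_ns)
{
	int64_t start = now_ns();
	int64_t timeout_ns = base_timeout_ns;
	int rc;

	for (;;) {
		rc = try_open(path);
		if (rc == 0 || now_ns() - start >= timeout_ns)
			break;
		if (rc == -ENOENT) {
			usleep(10 * 1000);	/* transient; brief sleep */
		} else if (rc == -ERESTARTSYS) {
			/* contended zvol: widen deadline, then retry */
			timeout_ns = base_timeout_ns * 10;
		} else {
			break;			/* hard failure */
		}
	}
	return (rc);
}
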
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index c17423426..75a31b38d 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;
+unsigned int zvol_open_timeout_ms = 1000;
struct zvol_state_os {
struct gendisk *zvo_disk; /* generic disk */
@@ -490,7 +491,13 @@ zvol_open(struct block_device *bdev, fmode_t flag)
zvol_state_t *zv;
int error = 0;
boolean_t drop_suspend = B_TRUE;
+ boolean_t drop_namespace = B_FALSE;
+#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+ hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+ hrtime_t start = gethrtime();
+retry:
+#endif
rw_enter(&zvol_state_lock, RW_READER);
/*
* Obtain a copy of private_data under the zvol_state_lock to make
@@ -504,6 +511,49 @@ zvol_open(struct block_device *bdev, fmode_t flag)
return (SET_ERROR(-ENXIO));
}
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * In all other call paths the spa_namespace_lock is taken
+ * before the bdev->bd_mutex lock. However, on open(2)
+ * the __blkdev_get() function calls fops->open() with the
+ * bdev->bd_mutex lock held. This can result in a deadlock
+ * when zvols from one pool are used as vdevs in another.
+ *
+ * To prevent a lock inversion deadlock we preemptively
+ * take the spa_namespace_lock. Normally the lock will not
+ * be contended and this is safe because spa_open_common()
+ * handles the case where the caller already holds the
+ * spa_namespace_lock.
+ *
+ * When the lock cannot be acquired after multiple retries
+ * this must be the vdev on zvol deadlock case and we have
+ * no choice but to return an error. For 5.12 and older
+ * kernels returning -ERESTARTSYS will result in the
+ * bdev->bd_mutex being dropped, then reacquired, and
+ * fops->open() being called again. This process can be
+ * repeated safely until both locks are acquired. For 5.13
+ * and newer the -ERESTARTSYS retry logic was removed from
+ * the kernel, so the only option is to return the error for
+ * the caller to handle.
+ */
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+
+#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+ schedule();
+ return (SET_ERROR(-ERESTARTSYS));
+#else
+ if ((gethrtime() - start) > timeout)
+ return (SET_ERROR(-ERESTARTSYS));
+
+ schedule_timeout(MSEC_TO_TICK(10));
+ goto retry;
+#endif
+ } else {
+ drop_namespace = B_TRUE;
+ }
+ }
+
mutex_enter(&zv->zv_state_lock);
/*
* make sure zvol is not suspended during first open
@@ -543,6 +593,8 @@ zvol_open(struct block_device *bdev, fmode_t flag)
zv->zv_open_count++;
mutex_exit(&zv->zv_state_lock);
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
@@ -556,12 +608,11 @@ out_open_count:
out_mutex:
mutex_exit(&zv->zv_state_lock);
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- if (error == -EINTR) {
- error = -ERESTARTSYS;
- schedule();
- }
+
return (SET_ERROR(error));
}
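
The hunk above implements both halves of that comment: with
HAVE_BLKDEV_GET_ERESTARTSYS (pre-5.13) the contended case returns
-ERESTARTSYS and lets __blkdev_get() re-invoke fops->open(), while on
5.13+ the function loops locally until zvol_open_timeout_ms expires. A
pthread-based user-space sketch of that trylock-or-back-off shape
(illustrative names, not ZFS code):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <unistd.h>

#ifndef ERESTARTSYS
#define	ERESTARTSYS	512	/* kernel-internal errno value */
#endif

/* Stand-in for spa_namespace_lock. */
static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;

int
open_locked(bool kernel_retries, int timeout_ms)
{
	int waited_ms = 0;

	while (pthread_mutex_trylock(&namespace_lock) != 0) {
		if (kernel_retries)
			return (-ERESTARTSYS);	/* caller re-invokes open */
		if (waited_ms >= timeout_ms)
			return (-ERESTARTSYS);	/* assume lock inversion */
		usleep(10 * 1000);		/* back off, then retry */
		waited_ms += 10;
	}

	/* ... first-open work runs here with the lock held ... */

	pthread_mutex_unlock(&namespace_lock);
	return (0);
}
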
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index e4b641989..721a7b4b8 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -909,54 +909,26 @@ int
zvol_first_open(zvol_state_t *zv, boolean_t readonly)
{
objset_t *os;
- int error, locked = 0;
- boolean_t ro;
+ int error;
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(mutex_owned(&spa_namespace_lock));
- /*
- * In all other cases the spa_namespace_lock is taken before the
- * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
- * function calls fops->open() with the bdev->bd_mutex lock held.
- * This deadlock can be easily observed with zvols used as vdevs.
- *
- * To avoid a potential lock inversion deadlock we preemptively
- * try to take the spa_namespace_lock(). Normally it will not
- * be contended and this is safe because spa_open_common() handles
- * the case where the caller already holds the spa_namespace_lock.
- *
- * When it is contended we risk a lock inversion if we were to
- * block waiting for the lock. Luckily, the __blkdev_get()
- * function allows us to return -ERESTARTSYS which will result in
- * bdev->bd_mutex being dropped, reacquired, and fops->open() being
- * called again. This process can be repeated safely until both
- * locks are acquired.
- */
- if (!mutex_owned(&spa_namespace_lock)) {
- locked = mutex_tryenter(&spa_namespace_lock);
- if (!locked)
- return (SET_ERROR(EINTR));
- }
-
- ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+ boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
if (error)
- goto out_mutex;
+ return (SET_ERROR(error));
zv->zv_objset = os;
error = zvol_setup_zv(zv);
-
if (error) {
dmu_objset_disown(os, 1, zv);
zv->zv_objset = NULL;
}
-out_mutex:
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(error));
+ return (error);
}
void
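
The net effect of this hunk is an ownership transfer: zvol_open() now
always enters the common code with the spa_namespace_lock held, so
zvol_first_open() can assert that precondition instead of conditionally
taking and dropping the lock itself, and the EINTR/-ERESTARTSYS
translation moves to the platform layer. A condensed sketch of the
resulting contract (pthread-based, illustrative names only):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

/* Illustrative stand-ins for spa_namespace_lock and mutex_owned(). */
static pthread_mutex_t ns_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ns_owner;
static bool ns_held;

static void
ns_enter(void)
{
	pthread_mutex_lock(&ns_lock);
	ns_owner = pthread_self();
	ns_held = true;
}

static void
ns_exit(void)
{
	ns_held = false;
	pthread_mutex_unlock(&ns_lock);
}

static bool
ns_owned(void)
{
	return (ns_held && pthread_equal(ns_owner, pthread_self()));
}

/* The platform open path calls ns_enter() first, so the common
 * first-open code asserts its precondition and returns plain errors. */
int
first_open(void)
{
	assert(ns_owned());
	/* ... dmu_objset_own() and zvol_setup_zv() equivalents ... */
	return (0);
}
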
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index cbbcb9641..e4ee177cc 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -285,7 +285,6 @@ elif sys.platform.startswith('linux'):
'alloc_class/alloc_class_011_neg': ['FAIL', known_reason],
'alloc_class/alloc_class_012_pos': ['FAIL', known_reason],
'alloc_class/alloc_class_013_pos': ['FAIL', '11888'],
- 'cli_root/zfs_copies/zfs_copies_003_pos': ['FAIL', '12301'],
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason],
'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason],