-rw-r--r--   config/kernel-blkdev.m4                   22
-rw-r--r--   module/os/linux/zfs/vdev_disk.c            8
-rw-r--r--   module/os/linux/zfs/zvol_os.c             59
-rw-r--r--   module/zfs/zvol.c                         38
-rwxr-xr-x   tests/test-runner/bin/zts-report.py.in     1
5 files changed, 90 insertions(+), 38 deletions(-)
diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 61e66421f..9c60e5dd4 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -294,6 +294,27 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [
])
])
+dnl #
+dnl # 5.13 API change
+dnl # blkdev_get_by_path() no longer handles ERESTARTSYS
+dnl #
+dnl # Unfortunately we're forced to rely solely on the kernel version
+dnl # number in order to determine the expected behavior. This was an
+dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
+ AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS])
+ AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1,
+ [blkdev_get_by_path() handles ERESTARTSYS])
+ ],[
+ AC_MSG_RESULT(no)
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
ZFS_AC_KERNEL_SRC_BLKDEV_PUT
@@ -318,4 +339,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE
ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
+ ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
])
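
Because the 5.13 change was internal to blkdev_get_by_dev() and altered no
exported signature, there is nothing a configure compile-test could probe;
the version compare above is the only reliable check. A minimal sketch of
how the resulting define might be consumed on the C side (the variable name
below is hypothetical, for illustration only):

/*
 * HAVE_BLKDEV_GET_ERESTARTSYS comes from the generated zfs_config.h and
 * is defined only when $LINUX_VERSION compares less than 5.13.0.
 */
#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
static const int zvol_kernel_retries_open = 1;	/* __blkdev_get() retries */
#else
static const int zvol_kernel_retries_open = 0;	/* we must retry ourselves */
#endif
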
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index fe9752a67..bcec4c6ba 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -265,6 +265,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
* an ENOENT failure at this point is highly likely to be transient
* and it is reasonable to sleep and retry before giving up. In
* practice delays have been observed to be on the order of 100ms.
+ *
+ * When ERESTARTSYS is returned it indicates the block device is
+ * a zvol which could not be opened due to the deadlock detection
+ * logic in zvol_open(). Extend the timeout and retry the open;
+ * subsequent attempts are expected to eventually succeed.
*/
hrtime_t start = gethrtime();
bdev = ERR_PTR(-ENXIO);
@@ -273,6 +278,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
zfs_vdev_holder);
if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
schedule_timeout(MSEC_TO_TICK(10));
+ } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
+ timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
+ continue;
} else if (IS_ERR(bdev)) {
break;
}
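
Taken together with the extended comment, vdev_disk_open() now runs a
deadline loop that treats the two soft errors differently: ENOENT sleeps
briefly and retries within the original window, while ERESTARTSYS (a
contended zvol) widens the window to ten times the base timeout. A
user-space sketch of that shape, assuming a hypothetical try_open()
helper in place of the real blkdev_get_by_path() wrapper:

#include <errno.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

#ifndef ERESTARTSYS
#define	ERESTARTSYS	512	/* kernel-internal errno; not in userspace headers */
#endif

/* Hypothetical stand-in for the real open; returns 0 or -errno. */
static int
try_open(const char *path)
{
	(void) path;
	return (0);	/* stub: pretend the open succeeds */
}

static int64_t
now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

int
open_with_deadline(const char *path, int64_t base_timeout_ns)
{
	int64_t start = now_ns();
	int64_t timeout_ns = base_timeout_ns;
	int rc;

	for (;;) {
		rc = try_open(path);
		if (rc == 0 || now_ns() - start >= timeout_ns)
			break;
		if (rc == -ENOENT) {
			usleep(10 * 1000);	/* transient; brief sleep */
		} else if (rc == -ERESTARTSYS) {
			/* contended zvol: widen deadline, then retry */
			timeout_ns = base_timeout_ns * 10;
		} else {
			break;			/* hard failure */
		}
	}
	return (rc);
}
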
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index c17423426..75a31b38d 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;
+unsigned int zvol_open_timeout_ms = 1000;
struct zvol_state_os {
struct gendisk *zvo_disk; /* generic disk */
@@ -490,7 +491,13 @@ zvol_open(struct block_device *bdev, fmode_t flag)
zvol_state_t *zv;
int error = 0;
boolean_t drop_suspend = B_TRUE;
+ boolean_t drop_namespace = B_FALSE;
+#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+ hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+ hrtime_t start = gethrtime();
+retry:
+#endif
rw_enter(&zvol_state_lock, RW_READER);
/*
* Obtain a copy of private_data under the zvol_state_lock to make
@@ -504,6 +511,49 @@ zvol_open(struct block_device *bdev, fmode_t flag)
return (SET_ERROR(-ENXIO));
}
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * In all other call paths the spa_namespace_lock is taken
+ * before the bdev->bd_mutex lock. However, on open(2)
+ * the __blkdev_get() function calls fops->open() with the
+ * bdev->bd_mutex lock held. This can result in a deadlock
+ * when zvols from one pool are used as vdevs in another.
+ *
+ * To prevent a lock inversion deadlock we preemptively
+ * take the spa_namespace_lock. Normally the lock will not
+ * be contended and this is safe because spa_open_common()
+ * handles the case where the caller already holds the
+ * spa_namespace_lock.
+ *
+ * When the lock cannot be acquired after multiple retries
+ * this must be the vdev on zvol deadlock case and we have
+ * no choice but to return an error. For 5.12 and older
+ * kernels returning -ERESTARTSYS will result in the
+ * bdev->bd_mutex being dropped, then reacquired, and
+ * fops->open() being called again. This process can be
+ * repeated safely until both locks are acquired. For 5.13
+ * and newer the -ERESTARTSYS retry logic was removed from
+ * the kernel, so the only option is to return the error for
+ * the caller to handle.
+ */
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+
+#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+ schedule();
+ return (SET_ERROR(-ERESTARTSYS));
+#else
+ if ((gethrtime() - start) > timeout)
+ return (SET_ERROR(-ERESTARTSYS));
+
+ schedule_timeout(MSEC_TO_TICK(10));
+ goto retry;
+#endif
+ } else {
+ drop_namespace = B_TRUE;
+ }
+ }
+
mutex_enter(&zv->zv_state_lock);
/*
* make sure zvol is not suspended during first open
@@ -543,6 +593,8 @@ zvol_open(struct block_device *bdev, fmode_t flag)
zv->zv_open_count++;
mutex_exit(&zv->zv_state_lock);
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
@@ -556,12 +608,11 @@ out_open_count:
out_mutex:
mutex_exit(&zv->zv_state_lock);
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- if (error == -EINTR) {
- error = -ERESTARTSYS;
- schedule();
- }
+
return (SET_ERROR(error));
}
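
The hunk above implements both halves of that comment: with
HAVE_BLKDEV_GET_ERESTARTSYS (pre-5.13) the contended case returns
-ERESTARTSYS and lets __blkdev_get() re-invoke fops->open(), while on
5.13+ the function loops locally until zvol_open_timeout_ms expires. A
pthread-based user-space sketch of that trylock-or-back-off shape
(illustrative names, not ZFS code):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <unistd.h>

#ifndef ERESTARTSYS
#define	ERESTARTSYS	512	/* kernel-internal errno value */
#endif

/* Stand-in for spa_namespace_lock. */
static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;

int
open_locked(bool kernel_retries, int timeout_ms)
{
	int waited_ms = 0;

	while (pthread_mutex_trylock(&namespace_lock) != 0) {
		if (kernel_retries)
			return (-ERESTARTSYS);	/* caller re-invokes open */
		if (waited_ms >= timeout_ms)
			return (-ERESTARTSYS);	/* assume lock inversion */
		usleep(10 * 1000);		/* back off, then retry */
		waited_ms += 10;
	}

	/* ... first-open work runs here with the lock held ... */

	pthread_mutex_unlock(&namespace_lock);
	return (0);
}
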
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index e4b641989..721a7b4b8 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -909,54 +909,26 @@ int
zvol_first_open(zvol_state_t *zv, boolean_t readonly)
{
objset_t *os;
- int error, locked = 0;
- boolean_t ro;
+ int error;
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(mutex_owned(&spa_namespace_lock));
- /*
- * In all other cases the spa_namespace_lock is taken before the
- * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
- * function calls fops->open() with the bdev->bd_mutex lock held.
- * This deadlock can be easily observed with zvols used as vdevs.
- *
- * To avoid a potential lock inversion deadlock we preemptively
- * try to take the spa_namespace_lock(). Normally it will not
- * be contended and this is safe because spa_open_common() handles
- * the case where the caller already holds the spa_namespace_lock.
- *
- * When it is contended we risk a lock inversion if we were to
- * block waiting for the lock. Luckily, the __blkdev_get()
- * function allows us to return -ERESTARTSYS which will result in
- * bdev->bd_mutex being dropped, reacquired, and fops->open() being
- * called again. This process can be repeated safely until both
- * locks are acquired.
- */
- if (!mutex_owned(&spa_namespace_lock)) {
- locked = mutex_tryenter(&spa_namespace_lock);
- if (!locked)
- return (SET_ERROR(EINTR));
- }
-
- ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+ boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
if (error)
- goto out_mutex;
+ return (SET_ERROR(error));
zv->zv_objset = os;
error = zvol_setup_zv(zv);
-
if (error) {
dmu_objset_disown(os, 1, zv);
zv->zv_objset = NULL;
}
-out_mutex:
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(error));
+ return (error);
}
void
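
The net effect of this hunk is an ownership transfer: zvol_open() now
always enters the common code with the spa_namespace_lock held, so
zvol_first_open() can assert that precondition instead of conditionally
taking and dropping the lock itself, and the EINTR/-ERESTARTSYS
translation moves to the platform layer. A condensed sketch of the
resulting contract (pthread-based, illustrative names only):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

/* Illustrative stand-ins for spa_namespace_lock and mutex_owned(). */
static pthread_mutex_t ns_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ns_owner;
static bool ns_held;

static void
ns_enter(void)
{
	pthread_mutex_lock(&ns_lock);
	ns_owner = pthread_self();
	ns_held = true;
}

static void
ns_exit(void)
{
	ns_held = false;
	pthread_mutex_unlock(&ns_lock);
}

static bool
ns_owned(void)
{
	return (ns_held && pthread_equal(ns_owner, pthread_self()));
}

/* The platform open path calls ns_enter() first, so the common
 * first-open code asserts its precondition and returns plain errors. */
int
first_open(void)
{
	assert(ns_owned());
	/* ... dmu_objset_own() and zvol_setup_zv() equivalents ... */
	return (0);
}
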
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index cbbcb9641..e4ee177cc 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -285,7 +285,6 @@ elif sys.platform.startswith('linux'):
'alloc_class/alloc_class_011_neg': ['FAIL', known_reason],
'alloc_class/alloc_class_012_pos': ['FAIL', known_reason],
'alloc_class/alloc_class_013_pos': ['FAIL', '11888'],
- 'cli_root/zfs_copies/zfs_copies_003_pos': ['FAIL', '12301'],
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason],
'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason],