aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTony Hutter <[email protected]>2024-06-28 09:52:03 -0700
committerTony Hutter <[email protected]>2024-07-16 15:33:23 -0700
commitc24a039042ef846940adae8e1dd1fc33cbb1f147 (patch)
treebe229f9dc1f936fd5ab570363d8d7e0fc085b326
parentf4e2aed42a704e2f238886b0dac50c06671b2d70 (diff)
Linux 6.9: Call add_disk() from workqueue to fix zfs_allow_010_pos (#16282)
The 6.9 kernel behaves differently in how it releases block devices. In the common case it will async release the device only after the return to userspace. This is different from the 6.8 and older kernels which release the block devices synchronously. To get around this, call add_disk() from a workqueue so that the kernel uses a different codepath to release our zvols in the way we expect. This stops zfs_allow_010_pos from hanging. Fixes: #16089 Signed-off-by: Tony Hutter <[email protected]> Reviewed-by: Tino Reichardt <[email protected]> Reviewed-by: Rob Norris <[email protected]>
-rw-r--r--module/os/linux/zfs/zvol_os.c102
1 files changed, 97 insertions, 5 deletions
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 41a9b1c53..85d2bebc4 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -41,6 +41,7 @@
#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/workqueue.h>
#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
@@ -1337,6 +1338,101 @@ zvol_wait_close(zvol_state_t *zv)
{
}
+struct add_disk_work {
+ struct delayed_work work;
+ struct gendisk *disk;
+ int error;
+};
+
+static int
+__zvol_os_add_disk(struct gendisk *disk)
+{
+ int error = 0;
+#ifdef HAVE_ADD_DISK_RET
+ error = add_disk(disk);
+#else
+ add_disk(disk);
+#endif
+ return (error);
+}
+
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+static void
+zvol_os_add_disk_work(struct work_struct *work)
+{
+ struct add_disk_work *add_disk_work;
+ add_disk_work = container_of(work, struct add_disk_work, work.work);
+ add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
+}
+#endif
+
+/*
+ * SPECIAL CASE:
+ *
+ * This function basically calls add_disk() from a workqueue. You may be
+ * thinking: why not just call add_disk() directly?
+ *
+ * When you call add_disk(), the zvol appears to the world. When this happens,
+ * the kernel calls disk_scan_partitions() on the zvol, which behaves
+ * differently on the 6.9+ kernels:
+ *
+ * - 6.8 and older kernels -
+ * disk_scan_partitions()
+ * handle = bdev_open_by_dev(
+ * zvol_open()
+ * bdev_release(handle);
+ * zvol_release()
+ *
+ *
+ * - 6.9+ kernels -
+ * disk_scan_partitions()
+ * file = bdev_file_open_by_dev()
+ * zvol_open()
+ * fput(file)
+ * < wait for return to userspace >
+ * zvol_release()
+ *
+ * The difference is that the bdev_release() from the 6.8 kernel is synchronous
+ * while the fput() from the 6.9 kernel is async. Or more specifically it's
+ * async that has to wait until we return to userspace (since it adds the fput
+ * into the caller's work queue with the TWA_RESUME flag set). This is not the
+ * behavior we want, since we want do things like create+destroy a zvol within
+ * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
+ * reference to the zvol while we're in the IOCTL, which can't wait until we
+ * return to userspace.
+ *
+ * We can get around this since fput() has a special codepath for when it's
+ * running in a kernel thread or interrupt. In those cases, it just puts the
+ * fput into the system workqueue, which we can force to run with
+ * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
+ * run from a kernel thread and "tricks" the fput() codepaths.
+ *
+ * Note that __flush_workqueue() is slowly getting deprecated. This may be ok
+ * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
+ * fput) to happen, which it eventually, naturally, will from the system_wq
+ * without us explicitly calling __flush_workqueue().
+ */
+static int
+zvol_os_add_disk(struct gendisk *disk)
+{
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */
+ struct add_disk_work add_disk_work;
+
+ INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
+ add_disk_work.disk = disk;
+ add_disk_work.error = 0;
+
+ /* Use *_delayed_work functions since they're not GPL'd */
+ schedule_delayed_work(&add_disk_work.work, 0);
+ flush_delayed_work(&add_disk_work.work);
+
+ __flush_workqueue(system_wq);
+ return (add_disk_work.error);
+#else /* <= 6.8 kernel */
+ return (__zvol_os_add_disk(disk));
+#endif
+}
+
/*
* Create a block device minor node and setup the linkage between it
* and the specified volume. Once this function returns the block
@@ -1541,11 +1637,7 @@ out_doi:
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
-#ifdef HAVE_ADD_DISK_RET
- error = add_disk(zv->zv_zso->zvo_disk);
-#else
- add_disk(zv->zv_zso->zvo_disk);
-#endif
+ error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
} else {
ida_simple_remove(&zvol_ida, idx);
}