aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/sys/vdev_disk.h3
-rw-r--r--module/zfs/vdev_disk.c46
2 files changed, 49 insertions, 0 deletions
diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
index 021e66d1a..03e7048ac 100644
--- a/include/sys/vdev_disk.h
+++ b/include/sys/vdev_disk.h
@@ -81,6 +81,9 @@ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev)
#endif
+/* Default Linux IO Scheduler */
+#define VDEV_SCHEDULER "noop"
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 3e59bd226..aba3c4ab5 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
#include <sys/zio.h>
#include <sys/sunldi.h>
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
/*
* Virtual device vector for disks.
*/
@@ -102,6 +104,43 @@ vdev_disk_error(zio_t *zio)
#endif
}
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices. This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization. While allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device. This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer. This means we have
+ * to use the sysfs interface and a user space upcall. Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+ char sh_path[] = "/bin/sh";
+ char sh_cmd[128];
+ char *argv[] = { sh_path, "-c", sh_cmd };
+ char *envp[] = { NULL };
+ int error;
+
+ if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+ return (0);
+
+ sprintf(sh_cmd, "%s \"%s\" >/sys/block/%s/queue/scheduler",
+ "/bin/echo", elevator, device);
+
+ error = call_usermodehelper(sh_path, argv, envp, 1);
+ if (error)
+ printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+ elevator, v->vdev_path, device, error);
+
+ return (error);
+}
+
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
{
@@ -167,6 +206,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+ /* Try to set the io scheduler elevator algorithm */
+ (void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+ bdev->bd_disk->disk_name);
+
return 0;
}
@@ -702,3 +745,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
return 0;
}
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");