aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2011-02-07 13:54:59 -0800
committerBrian Behlendorf <[email protected]>2011-02-10 09:27:22 -0800
commit6839eed23e3c9d85cf0de767be32af0759e5bf2d (patch)
treefa4f2992b08736faec1179d68a7000c467297f3f
parent4db77a74a6a26c57a04f98c4a23b9fda9319ba9f (diff)
Use 'noop' IO Scheduler
Initial testing has shown that the right IO scheduler to use under Linux is noop. This strikes the ideal balance by allowing the zfs elevator to do all request ordering and prioritization, while allowing the Linux elevator to do the maximum front/back merging allowed by the physical device. This yields the largest possible requests for the device with the lowest total overhead. While 'noop' should be right for your system, you can choose a different IO scheduler with the 'zfs_vdev_scheduler' option. You may set this value to any of the standard Linux schedulers: noop, cfq, deadline, anticipatory. In addition, if you choose 'none', zfs will not attempt to change the IO scheduler for the block device.
-rw-r--r--include/sys/vdev_disk.h3
-rw-r--r--module/zfs/vdev_disk.c46
2 files changed, 49 insertions, 0 deletions
diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
index 021e66d1a..03e7048ac 100644
--- a/include/sys/vdev_disk.h
+++ b/include/sys/vdev_disk.h
@@ -81,6 +81,9 @@ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev)
#endif
+/* Default Linux IO Scheduler */
+#define VDEV_SCHEDULER "noop"
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 3e59bd226..aba3c4ab5 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
#include <sys/zio.h>
#include <sys/sunldi.h>
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
/*
* Virtual device vector for disks.
*/
@@ -102,6 +104,43 @@ vdev_disk_error(zio_t *zio)
#endif
}
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer.  This means we have
+ * to use the sysfs interface and a user space upcall.  Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ *
+ * Arguments:
+ *   v        - vdev being opened; only v->vdev_path is read, for logging.
+ *   elevator - name of the scheduler to request.  The exact string "none"
+ *              means "do not touch the device's scheduler".
+ *   device   - block device name as it appears under /sys/block.
+ *
+ * Returns 0 when the scheduler was set or when elevator is "none",
+ * -ENAMETOOLONG when the shell command does not fit in the local buffer,
+ * otherwise the non-zero value returned by call_usermodehelper().
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	/* call_usermodehelper() walks argv until it finds a NULL entry. */
+	char *argv[] = { sh_path, "-c", sh_cmd, NULL };
+	char *envp[] = { NULL };
+	int len;
+	int error;
+
+	/* "none" explicitly requests that the scheduler be left alone. */
+	if (strcmp(elevator, "none") == 0)
+		return (0);
+
+	/*
+	 * Both the scheduler name (a module option) and the device name are
+	 * variable length, so bound the write and refuse to run a truncated
+	 * command rather than overflow the stack buffer.
+	 */
+	len = snprintf(sh_cmd, sizeof (sh_cmd),
+	    "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+	if (len < 0 || (size_t)len >= sizeof (sh_cmd)) {
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): "
+		    "command too long\n", elevator, v->vdev_path, device);
+		return (-ENAMETOOLONG);
+	}
+
+	/*
+	 * Final argument is the helper wait mode, unchanged from the
+	 * original call (presumably "wait for the process to exit" on
+	 * kernels of this era -- confirm against the target kernel).
+	 */
+	error = call_usermodehelper(sh_path, argv, envp, 1);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
+
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
{
@@ -167,6 +206,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+ /* Try to set the io scheduler elevator algorithm */
+ (void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+ bdev->bd_disk->disk_name);
+
return 0;
}
@@ -702,3 +745,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
return 0;
}
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");