diff options
author | Brian Behlendorf <[email protected]> | 2011-02-07 13:54:59 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2011-02-10 09:27:22 -0800 |
commit | 6839eed23e3c9d85cf0de767be32af0759e5bf2d (patch) | |
tree | fa4f2992b08736faec1179d68a7000c467297f3f | |
parent | 4db77a74a6a26c57a04f98c4a23b9fda9319ba9f (diff) |
Use 'noop' IO Scheduler
Initial testing has shown that the right IO scheduler to use under Linux
is noop. This strikes the ideal balance by allowing the zfs elevator
to do all request ordering and prioritization, while allowing the
Linux elevator to do the maximum front/back merging allowed by the
physical device. This yields the largest possible requests for the
device with the lowest total overhead.
While 'noop' should be right for your system, you can choose a different
IO scheduler with the 'zfs_vdev_scheduler' option. You may set this
value to any of the standard Linux schedulers: noop, cfq, deadline, or
anticipatory. In addition, if you choose 'none', zfs will not attempt
to change the IO scheduler for the block device.
-rw-r--r-- | include/sys/vdev_disk.h | 3 | ||||
-rw-r--r-- | module/zfs/vdev_disk.c | 46 |
2 files changed, 49 insertions, 0 deletions
diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
index 021e66d1a..03e7048ac 100644
--- a/include/sys/vdev_disk.h
+++ b/include/sys/vdev_disk.h
@@ -81,6 +81,9 @@ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 # define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev)
 #endif
 
+/* Default Linux IO Scheduler */
+#define VDEV_SCHEDULER "noop"
+
 #endif /* _KERNEL */
 
 #ifdef __cplusplus
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 3e59bd226..aba3c4ab5 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -102,6 +104,43 @@ vdev_disk_error(zio_t *zio)
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices. This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization. While allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device. This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer. This means we have
+ * to use the sysfs interface and a user space upcall. Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	char *argv[] = { sh_path, "-c", sh_cmd };
+	char *envp[] = { NULL };
+	int error;
+
+	if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+		return (0);
+
+	sprintf(sh_cmd, "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+
+	error = call_usermodehelper(sh_path, argv, envp, 1);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
@@ -167,6 +206,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+	    bdev->bd_disk->disk_name);
+
 	return 0;
 }
 
@@ -702,3 +745,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");