diff options
author | smh <[email protected]> | 2016-02-12 20:47:22 -0500 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-02-26 11:24:35 -0800 |
commit | 9f500936c82137ef3a57c53013894f622dcec14e (patch) | |
tree | e7594e8f711c255d4bfe699e3fe74bab3acb12aa | |
parent | a77f29f93c8d016f17d9b77f39662e311774aaae (diff) |
FreeBSD r256956: Improve ZFS N-way mirror read performance by using load and locality information.
The existing algorithm selects a preferred leaf vdev based on offset of the zio
request modulo the number of members in the mirror. It assumes the devices are
of equal performance and that spreading the requests randomly over both drives
will be sufficient to saturate them. In practice this results in the leaf vdevs
being underutilized.
The new algorithm takes into account the following additional factors:
* Load of the vdevs (number of outstanding I/O requests)
* The locality of last queued I/O vs the new I/O request.
Within the locality calculation additional knowledge about the underlying vdev
is considered, such as whether the device backing the vdev is a rotating media device.
This results in performance increases across the board as well as significant
increases for predominantly streaming loads and for configurations which don't
have evenly performing devices.
The following are results from a setup with 3 Way Mirror with 2 x HD's and
1 x SSD from a basic test running multiple parallel dd's.
With pre-fetch disabled (vfs.zfs.prefetch_disable=1):
== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 161 seconds @ 95 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 297 seconds @ 51 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 54 seconds @ 284 MB/s
With pre-fetch enabled (vfs.zfs.prefetch_disable=0):
== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 91 seconds @ 168 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 108 seconds @ 142 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 48 seconds @ 320 MB/s
In addition to the performance changes the code was also restructured, with
the help of Justin Gibbs, to provide a more logical flow which also ensures
vdev loads are only calculated from the set of valid candidates.
The following additional sysctls were added to allow the administrator
to tune the behaviour of the load algorithm:
* vfs.zfs.vdev.mirror.rotating_inc
* vfs.zfs.vdev.mirror.rotating_seek_inc
* vfs.zfs.vdev.mirror.rotating_seek_offset
* vfs.zfs.vdev.mirror.non_rotating_inc
* vfs.zfs.vdev.mirror.non_rotating_seek_inc
These changes were based on work started by the zfsonlinux developers:
https://github.com/zfsonlinux/zfs/pull/1487
Reviewed by: gibbs, mav, will
MFC after: 2 weeks
Sponsored by: Multiplay
References:
https://github.com/freebsd/freebsd@5c7a6f5d
https://github.com/freebsd/freebsd@31b7f68d
https://github.com/freebsd/freebsd@e186f564
Performance Testing:
https://github.com/zfsonlinux/zfs/pull/4334#issuecomment-189057141
Porting notes:
- The tunables were adjusted to have ZoL-style names.
- The code was modified to use ZoL's vd_nonrot.
- Fixes were done to make cstyle.pl happy
- Merge conflicts were handled manually
- freebsd/freebsd@e186f564bc946f82c76e0b34c2f0370ed9aea022 by my
colleague Andriy Gapon has been included. It applied perfectly, but
added a cstyle regression.
- This replaces 556011dbec2d10579819078559a77630fc559112 entirely.
- A typo "IO'a" has been corrected to say "IO's"
- Descriptions of new tunables were added to man/man5/zfs-module-parameters.5.
Ported-by: Richard Yao <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #4334
-rw-r--r-- | include/sys/vdev.h | 4 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 1 | ||||
-rw-r--r-- | man/man5/zfs-module-parameters.5 | 65 | ||||
-rw-r--r-- | module/zfs/vdev_mirror.c | 325 | ||||
-rw-r--r-- | module/zfs/vdev_queue.c | 26 |
5 files changed, 307 insertions, 114 deletions
diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 7d64cf6bc..8a2afd49f 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -119,6 +119,10 @@ extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); +extern int vdev_queue_length(vdev_t *vd); +extern uint64_t vdev_queue_lastoffset(vdev_t *vd); +extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio); + extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 1371a3f03..e1706d603 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -120,6 +120,7 @@ struct vdev_queue { hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ kmutex_t vq_lock; + uint64_t vq_lastoffset; }; /* diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 6cba7f02c..2d565dc19 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1523,12 +1523,71 @@ Default value: \fB0\fR. .sp .ne 2 .na -\fBzfs_vdev_mirror_switch_us\fR (int) +\fBzfs_vdev_mirror_rotating_inc\fR (int) .ad .RS 12n -Switch mirrors every N usecs +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O immediately +follows its predecessor on rotational vdevs for the purpose of making decisions +based on load. .sp -Default value: \fB10,000\fR. +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_rotating_seek_inc\fR (int) +.ad +.RS 12n +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O lacks +locality as defined by the zfs_vdev_mirror_rotating_seek_offset. 
I/Os within +this that are not immediately following the previous I/O are incremented by +half. +.sp +Default value: \fB5\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_rotating_seek_offset\fR (int) +.ad +.RS 12n +The maximum distance for the last queued I/O in which the balancing algorithm +considers an I/O to have locality. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1048576\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_non_rotating_inc\fR (int) +.ad +.RS 12n +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member on non-rotational vdevs +when I/Os do not immediately follow one another. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int) +.ad +.RS 12n +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O lacks +locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within +this that are not immediately following the previous I/O are incremented by +half. +.sp +Default value: \fB1\fR. .RE .sp diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 6b699e883..d3dbdca79 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -41,44 +41,70 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; - int mc_pending; + int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; int mm_children; - int mm_replacing; - int mm_preferred; - int mm_root; - mirror_child_t mm_child[1]; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; } mirror_map_t; +static int vdev_mirror_shift = 21; + /* - * When the children are equally busy queue incoming requests to a single - * child for N microseconds. 
This is done to maximize the likelihood that - * the Linux elevator will be able to merge requests while it is plugged. - * Otherwise, requests are queued to the least busy device. - * - * For rotational disks the Linux elevator will plug for 10ms which is - * why zfs_vdev_mirror_switch_us is set to 10ms by default. For non- - * rotational disks the elevator will not plug, but 10ms is still a small - * enough value that the requests will get spread over all the children. + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. * - * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us - * significantly to bound the worst case latencies. It would probably be - * ideal to calculate a decaying average of the last observed latencies and - * use that to dynamically adjust the zfs_vdev_mirror_switch_us time. + * If there is a mixture of rotating and non-rotating media, setting + * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results + * as it will direct more reads to the non-rotating vdevs which are more likely + * to have a higher performance. */ -int zfs_vdev_mirror_switch_us = 10000; + +/* Rotating media load calculation configuration. */ +static int zfs_vdev_mirror_rotating_inc = 0; +static int zfs_vdev_mirror_rotating_seek_inc = 5; +static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024; + +/* Non-rotating media load calculation configuration. 
*/ +static int zfs_vdev_mirror_non_rotating_inc = 0; +static int zfs_vdev_mirror_non_rotating_seek_inc = 1; + +static inline size_t +vdev_mirror_map_size(int children) +{ + return (offsetof(mirror_map_t, mm_child[children]) + + sizeof (int) * children); +} + +static inline mirror_map_t * +vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) +{ + mirror_map_t *mm; + + mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); + mm->mm_children = children; + mm->mm_replacing = replacing; + mm->mm_root = root; + mm->mm_preferred = (int *)((uintptr_t)mm + + offsetof(mirror_map_t, mm_child[children])); + + return (mm); +} static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); + kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { @@ -87,9 +113,54 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = { }; static int -vdev_mirror_pending(vdev_t *vd) +vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + uint64_t lastoffset; + int load; + + /* All DVAs have equal weight at the root. */ + if (mm->mm_root) + return (INT_MAX); + + /* + * We don't return INT_MAX if the device is resilvering i.e. + * vdev_resilver_txg != 0 as when tested performance was slightly + * worse overall when resilvering with compared to without. + */ + + /* Standard load based on pending queue length. */ + load = vdev_queue_length(vd); + lastoffset = vdev_queue_lastoffset(vd); + + if (vd->vdev_nonrot) { + /* Non-rotating media. */ + if (lastoffset == zio_offset) + return (load + zfs_vdev_mirror_non_rotating_inc); + + /* + * Apply a seek penalty even for non-rotating devices as + * sequential I/O's can be aggregated into fewer operations on + * the device, thus avoiding unnecessary per-command overhead + * and boosting performance. 
+ */ + return (load + zfs_vdev_mirror_non_rotating_seek_inc); + } + + /* Rotating media I/O's which directly follow the last I/O. */ + if (lastoffset == zio_offset) + return (load + zfs_vdev_mirror_rotating_inc); + + /* + * Apply half the seek increment to I/O's within seek offset + * of the last I/O queued to this vdev as they should incure less + * of a seek increment. + */ + if (ABS(lastoffset - zio_offset) < + zfs_vdev_mirror_rotating_seek_offset) + return (load + (zfs_vdev_mirror_rotating_seek_inc / 2)); + + /* Apply the full seek increment to all other I/O's. */ + return (load + zfs_vdev_mirror_rotating_seek_inc); } /* @@ -97,38 +168,19 @@ vdev_mirror_pending(vdev_t *vd) * is this functions only caller, as small as possible on the stack. */ noinline static mirror_map_t * -vdev_mirror_map_alloc(zio_t *zio) +vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; - int c, d; + int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; - c = BP_GET_NDVAS(zio->io_bp); - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), - KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = B_FALSE; - mm->mm_preferred = spa_get_random(c); - mm->mm_root = B_TRUE; - - /* - * Check the other, lower-index DVAs to see if they're on - * the same vdev as the child we picked. If they are, use - * them since they are likely to have been allocated from - * the primary metaslab in use at the time, and hence are - * more likely to have locality with single-copy data. 
- */ - for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { - if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) - mm->mm_preferred = d; - } - + mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, + B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -136,56 +188,13 @@ vdev_mirror_map_alloc(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { - int lowest_pending = INT_MAX; - int lowest_nr = 1; - - c = vd->vdev_children; - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), - KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = 0; - mm->mm_root = B_FALSE; - + mm = vdev_mirror_map_alloc(vd->vdev_children, + (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops), B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; - - if (mm->mm_replacing) - continue; - - if (!vdev_readable(mc->mc_vd)) { - mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; - mc->mc_skipped = 1; - mc->mc_pending = INT_MAX; - continue; - } - - mc->mc_pending = vdev_mirror_pending(mc->mc_vd); - if (mc->mc_pending < lowest_pending) { - lowest_pending = mc->mc_pending; - lowest_nr = 1; - } else if (mc->mc_pending == lowest_pending) { - lowest_nr++; - } - } - - d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us); - d = (d % lowest_nr) + 1; - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - - if (mm->mm_child[c].mc_pending == lowest_pending) { - if (--d == 0) { - mm->mm_preferred = c; - break; - } - } } } @@ -276,6 +285,54 @@ vdev_mirror_scrub_done(zio_t *zio) } /* + * Check the other, lower-index DVAs to see if they're on the same + * vdev as the child we picked. 
If they are, use them since they + * are likely to have been allocated from the primary metaslab in + * use at the time, and hence are more likely to have locality with + * single-copy data. + */ +static int +vdev_mirror_dva_select(zio_t *zio, int p) +{ + dva_t *dva = zio->io_bp->blk_dva; + mirror_map_t *mm = zio->io_vsd; + int preferred; + int c; + + preferred = mm->mm_preferred[p]; + for (p--; p >= 0; p--) { + c = mm->mm_preferred[p]; + if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) + preferred = c; + } + return (preferred); +} + +static int +vdev_mirror_preferred_child_randomize(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + int p; + + if (mm->mm_root) { + p = spa_get_random(mm->mm_preferred_cnt); + return (vdev_mirror_dva_select(zio, p)); + } + + /* + * To ensure we don't always favour the first matching vdev, + * which could lead to wear leveling issues on SSD's, we + * use the I/O offset as a pseudo random seed into the vdevs + * which have the lowest load. + */ + p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; + return (mm->mm_preferred[p]); +} + +/* + * Try to find a vdev whose DTL doesn't contain the block we want to read + * prefering vdevs based on determined load. + * * Try to find a child whose DTL doesn't contain the block we want to read. * If we can't, try the read on any vdev we haven't already tried. */ @@ -283,43 +340,70 @@ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; uint64_t txg = zio->io_txg; - int i, c; + int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - /* - * Try to find a child whose DTL doesn't contain the block to read. - * If a child is known to be completely inaccessible (indicated by - * vdev_readable() returning B_FALSE), don't even try. 
- */ - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { - if (c >= mm->mm_children) - c = 0; + lowest_load = INT_MAX; + mm->mm_preferred_cnt = 0; + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc; + mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; + if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) - return (c); - mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; - mc->mc_speculative = 1; + + if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + mc->mc_error = SET_ERROR(ESTALE); + mc->mc_skipped = 1; + mc->mc_speculative = 1; + continue; + } + + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); + if (mc->mc_load > lowest_load) + continue; + + if (mc->mc_load < lowest_load) { + lowest_load = mc->mc_load; + mm->mm_preferred_cnt = 0; + } + mm->mm_preferred[mm->mm_preferred_cnt] = c; + mm->mm_preferred_cnt++; + } + + if (mm->mm_preferred_cnt == 1) { + vdev_queue_register_lastoffset( + mm->mm_child[mm->mm_preferred[0]].mc_vd, zio); + return (mm->mm_preferred[0]); + } + + if (mm->mm_preferred_cnt > 1) { + int c = vdev_mirror_preferred_child_randomize(zio); + + vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); + return (c); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ - for (c = 0; c < mm->mm_children; c++) - if (!mm->mm_child[c].mc_tried) + for (c = 0; c < mm->mm_children; c++) { + if (!mm->mm_child[c].mc_tried) { + vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, + zio); return (c); + } + } /* * Every child failed. There's no place left to look. 
@@ -334,7 +418,7 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_alloc(zio); + mm = vdev_mirror_map_init(zio); if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { @@ -559,6 +643,25 @@ vdev_ops_t vdev_spare_ops = { }; #if defined(_KERNEL) && defined(HAVE_SPL) -module_param(zfs_vdev_mirror_switch_us, int, 0644); -MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs"); +module_param(zfs_vdev_mirror_rotating_inc, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_rotating_inc, + "Rotating media load increment for non-seeking I/O's"); + +module_param(zfs_vdev_mirror_rotating_seek_inc, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_inc, + "Rotating media load increment for seeking I/O's"); + +module_param(zfs_vdev_mirror_rotating_seek_offset, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_offset, + "Offset in bytes from the last I/O which " + "triggers a reduced rotating media seek increment"); + +module_param(zfs_vdev_mirror_non_rotating_inc, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_inc, + "Non-rotating media load increment for non-seeking I/O's"); + +module_param(zfs_vdev_mirror_non_rotating_seek_inc, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_seek_inc, + "Non-rotating media load increment for seeking I/O's"); + #endif diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e828ce917..af8af67de 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -374,6 +374,8 @@ vdev_queue_init(vdev_t *vd) avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + + vq->vq_lastoffset = 0; } void @@ -776,6 +778,30 @@ vdev_queue_io_done(zio_t *zio) mutex_exit(&vq->vq_lock); } +/* + * As these three methods are only used for load calculations we're not + * concerned if we get an incorrect value on 32bit platforms due to lack of + * vq_lock 
mutex use here, instead we prefer to keep it lock free for + * performance. + */ +int +vdev_queue_length(vdev_t *vd) +{ + return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); +} + +uint64_t +vdev_queue_lastoffset(vdev_t *vd) +{ + return (vd->vdev_queue.vq_lastoffset); +} + +void +vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) +{ + vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; +} + #if defined(_KERNEL) && defined(HAVE_SPL) module_param(zfs_vdev_aggregation_limit, int, 0644); MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size"); |