-rw-r--r--   include/sys/vdev.h                 |   4
-rw-r--r--   include/sys/vdev_impl.h            |   1
-rw-r--r--   man/man5/zfs-module-parameters.5   |  65
-rw-r--r--   module/zfs/vdev_mirror.c           | 325
-rw-r--r--   module/zfs/vdev_queue.c            |  26
5 files changed, 307 insertions, 114 deletions
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7d64cf6bc..8a2afd49f 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -119,6 +119,10 @@ extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
+
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
 extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 1371a3f03..e1706d603 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -120,6 +120,7 @@ struct vdev_queue {
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
 	kmutex_t	vq_lock;
+	uint64_t	vq_lastoffset;
 };
 
 /*
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 6cba7f02c..2d565dc19 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1523,12 +1523,71 @@ Default value: \fB0\fR.
 .sp
 .ne 2
 .na
-\fBzfs_vdev_mirror_switch_us\fR (int)
+\fBzfs_vdev_mirror_rotating_inc\fR (int)
 .ad
 .RS 12n
-Switch mirrors every N usecs
+A number by which the balancing algorithm increments the load calculation,
+for the purpose of selecting the least busy mirror member, when an I/O
+immediately follows its predecessor on a rotational vdev (that is, when it
+requires no seek).
 .sp
-Default value: \fB10,000\fR.
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_mirror_rotating_seek_inc\fR (int)
+.ad
+.RS 12n
+A number by which the balancing algorithm increments the load calculation,
+for the purpose of selecting the least busy mirror member, when an I/O on a
+rotational vdev lacks locality as defined by
+zfs_vdev_mirror_rotating_seek_offset. I/Os within this offset that do not
+immediately follow the previous I/O receive only half of this increment.
+.sp
+Default value: \fB5\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_mirror_rotating_seek_offset\fR (int)
+.ad
+.RS 12n
+The maximum distance from the last queued I/O within which the balancing
+algorithm considers an I/O to have locality.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1048576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_mirror_non_rotating_inc\fR (int)
+.ad
+.RS 12n
+A number by which the balancing algorithm increments the load calculation,
+for the purpose of selecting the least busy mirror member, when an I/O
+immediately follows its predecessor on a non-rotational vdev.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int)
+.ad
+.RS 12n
+A number by which the balancing algorithm increments the load calculation,
+for the purpose of selecting the least busy mirror member, when an I/O does
+not immediately follow its predecessor on a non-rotational vdev. Because
+aggregation opportunities are lost when I/Os are not sequential, a small
+penalty is still applied even though no physical seek takes place.
+.sp
+Default value: \fB1\fR.
 .RE
 
 .sp
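Taken together, the five parameters above produce a single integer load score per mirror child. The standalone sketch below models that calculation with the documented defaults; the names (mirror_load_model() and the plain variables standing in for the module parameters and queue state) are illustrative only, and the authoritative logic is the vdev_mirror_load() function added to module/zfs/vdev_mirror.c further down.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the module parameters documented above. */
static int rotating_inc = 0;
static int rotating_seek_inc = 5;
static uint64_t rotating_seek_offset = 1 * 1024 * 1024;
static int non_rotating_inc = 0;
static int non_rotating_seek_inc = 1;

/*
 * Simplified model of the per-child load score: the pending queue depth
 * plus a locality increment.  last_offset is where the previous I/O to
 * this child ended, offset is where the candidate I/O starts.
 */
static int
mirror_load_model(int nonrot, int queue_depth, uint64_t last_offset,
    uint64_t offset)
{
	uint64_t dist;
	int load = queue_depth;

	if (nonrot) {
		/* SSDs: only aggregation, not head movement, is at stake. */
		return (load + (last_offset == offset ?
		    non_rotating_inc : non_rotating_seek_inc));
	}

	if (last_offset == offset)
		return (load + rotating_inc);		/* no seek */

	dist = (last_offset > offset) ? last_offset - offset :
	    offset - last_offset;
	if (dist < rotating_seek_offset)
		return (load + rotating_seek_inc / 2);	/* short seek */

	return (load + rotating_seek_inc);		/* full seek */
}

int
main(void)
{
	/* Two idle rotational children; one is sequential, one must seek. */
	printf("sequential child: %d\n", mirror_load_model(0, 0, 4096, 4096));
	printf("seeking child:    %d\n",
	    mirror_load_model(0, 0, 4096, 512 * 1024 * 1024));
	return (0);
}

With the defaults this prints 0 for the sequential child and 5 for the seeking one, so the read is steered to the child that can continue its current stream.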
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 6b699e883..d3dbdca79 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -41,44 +41,70 @@ typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
-	int		mc_pending;
+	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
 } mirror_child_t;
 
 typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
 	int		mm_children;
-	int		mm_replacing;
-	int		mm_preferred;
-	int		mm_root;
-	mirror_child_t	mm_child[1];
+	boolean_t	mm_replacing;
+	boolean_t	mm_root;
+	mirror_child_t	mm_child[];
 } mirror_map_t;
 
+static int vdev_mirror_shift = 21;
+
 /*
- * When the children are equally busy queue incoming requests to a single
- * child for N microseconds. This is done to maximize the likelihood that
- * the Linux elevator will be able to merge requests while it is plugged.
- * Otherwise, requests are queued to the least busy device.
- *
- * For rotational disks the Linux elevator will plug for 10ms which is
- * why zfs_vdev_mirror_switch_us is set to 10ms by default. For non-
- * rotational disks the elevator will not plug, but 10ms is still a small
- * enough value that the requests will get spread over all the children.
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
  *
- * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us
- * significantly to bound the worst case latencies. It would probably be
- * ideal to calculate a decaying average of the last observed latencies and
- * use that to dynamically adjust the zfs_vdev_mirror_switch_us time.
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs, which are likely
+ * to offer higher performance.
  */
-int zfs_vdev_mirror_switch_us = 10000;
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+	return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root)
+{
+	mirror_map_t *mm;
+
+	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+	mm->mm_children = children;
+	mm->mm_replacing = replacing;
+	mm->mm_root = root;
+	mm->mm_preferred = (int *)((uintptr_t)mm +
+	    offsetof(mirror_map_t, mm_child[children]));
+
+	return (mm);
+}
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
-	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
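The vdev_mirror_map_size()/vdev_mirror_map_alloc() pair above packs the mirror_child_t flexible array and the mm_preferred index array into a single allocation, so one kmem_free() in vdev_mirror_map_free() releases everything. The userspace sketch below reproduces that layout under hypothetical names (map_t, child_t, map_alloc()); it uses the constant-offset form of the size calculation, which is equivalent to the offsetof(mirror_map_t, mm_child[children]) expression in the patch.

#include <stdint.h>
#include <stdlib.h>
#include <stddef.h>

/* Cut-down, hypothetical stand-ins for mirror_child_t and mirror_map_t. */
typedef struct child {
	uint64_t	c_offset;
	int		c_load;
} child_t;

typedef struct map {
	int		*m_preferred;	/* points into the same allocation */
	int		m_preferred_cnt;
	int		m_children;
	child_t		m_child[];	/* flexible array member */
} map_t;

static size_t
map_size(int children)
{
	/* children child entries, then children ints for the preferred list */
	return (offsetof(map_t, m_child) + children * sizeof (child_t) +
	    children * sizeof (int));
}

static map_t *
map_alloc(int children)
{
	map_t *m = calloc(1, map_size(children));

	if (m == NULL)
		return (NULL);
	m->m_children = children;
	/* The preferred-index array lives immediately after m_child[]. */
	m->m_preferred = (int *)((char *)m + offsetof(map_t, m_child) +
	    children * sizeof (child_t));
	return (m);
}

int
main(void)
{
	map_t *m = map_alloc(3);

	if (m != NULL) {
		m->m_preferred[m->m_preferred_cnt++] = 2;
		free(m);	/* one allocation, one free */
	}
	return (0);
}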
@@ -87,9 +113,54 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
 };
 
 static int
-vdev_mirror_pending(vdev_t *vd)
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
 {
-	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+	uint64_t lastoffset;
+	int load;
+
+	/* All DVAs have equal weight at the root. */
+	if (mm->mm_root)
+		return (INT_MAX);
+
+	/*
+	 * We don't return INT_MAX if the device is resilvering (i.e.
+	 * vdev_resilver_txg != 0) because, when tested, overall performance
+	 * was slightly worse with that special case than without it.
+	 */
+
+	/* Standard load based on pending queue length. */
+	load = vdev_queue_length(vd);
+	lastoffset = vdev_queue_lastoffset(vd);
+
+	if (vd->vdev_nonrot) {
+		/* Non-rotating media. */
+		if (lastoffset == zio_offset)
+			return (load + zfs_vdev_mirror_non_rotating_inc);
+
+		/*
+		 * Apply a seek penalty even for non-rotating devices as
+		 * sequential I/O's can be aggregated into fewer operations on
+		 * the device, thus avoiding unnecessary per-command overhead
+		 * and boosting performance.
+		 */
+		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+	}
+
+	/* Rotating media I/O's which directly follow the last I/O. */
+	if (lastoffset == zio_offset)
+		return (load + zfs_vdev_mirror_rotating_inc);
+
+	/*
+	 * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O queued to this vdev, as they should incur less
+	 * actual seek time.
+	 */
+	if (ABS(lastoffset - zio_offset) <
+	    zfs_vdev_mirror_rotating_seek_offset)
+		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+
+	/* Apply the full seek increment to all other I/O's. */
+	return (load + zfs_vdev_mirror_rotating_seek_inc);
 }
 
 /*
@@ -97,38 +168,19 @@ vdev_mirror_pending(vdev_t *vd)
  * is this functions only caller, as small as possible on the stack.
  */
 noinline static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
-	int c, d;
+	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 
-		c = BP_GET_NDVAS(zio->io_bp);
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
-		    KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = B_FALSE;
-		mm->mm_preferred = spa_get_random(c);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Check the other, lower-index DVAs to see if they're on
-		 * the same vdev as the child we picked. If they are, use
-		 * them since they are likely to have been allocated from
-		 * the primary metaslab in use at the time, and hence are
-		 * more likely to have locality with single-copy data.
-		 */
-		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
-			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
-				mm->mm_preferred = d;
-		}
-
+		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
+		    B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 
@@ -136,56 +188,13 @@ vdev_mirror_map_alloc(zio_t *zio)
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
-		int lowest_pending = INT_MAX;
-		int lowest_nr = 1;
-
-		c = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
-		    KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = 0;
-		mm->mm_root = B_FALSE;
-
+		mm = vdev_mirror_map_alloc(vd->vdev_children,
+		    (vd->vdev_ops == &vdev_replacing_ops ||
+		    vd->vdev_ops == &vdev_spare_ops), B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
-
-			if (mm->mm_replacing)
-				continue;
-
-			if (!vdev_readable(mc->mc_vd)) {
-				mc->mc_error = SET_ERROR(ENXIO);
-				mc->mc_tried = 1;
-				mc->mc_skipped = 1;
-				mc->mc_pending = INT_MAX;
-				continue;
-			}
-
-			mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
-			if (mc->mc_pending < lowest_pending) {
-				lowest_pending = mc->mc_pending;
-				lowest_nr = 1;
-			} else if (mc->mc_pending == lowest_pending) {
-				lowest_nr++;
-			}
-		}
-
-		d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us);
-		d = (d % lowest_nr) + 1;
-
-		for (c = 0; c < mm->mm_children; c++) {
-			mc = &mm->mm_child[c];
-
-			if (mm->mm_child[c].mc_pending == lowest_pending) {
-				if (--d == 0) {
-					mm->mm_preferred = c;
-					break;
-				}
-			}
 		}
 	}
 
@@ -276,6 +285,54 @@ vdev_mirror_scrub_done(zio_t *zio)
 }
 
 /*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int preferred;
+	int c;
+
+	preferred = mm->mm_preferred[p];
+	for (p--; p >= 0; p--) {
+		c = mm->mm_preferred[p];
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, p));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on determined load.
+ *
  * Try to find a child whose DTL doesn't contain the block we want to read.
  * If we can't, try the read on any vdev we haven't already tried.
  */
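The comment in vdev_mirror_preferred_child_randomize() above explains that, for non-root reads, the child is picked by hashing the I/O offset rather than by calling spa_get_random(). A small illustration of that arithmetic, assuming the default vdev_mirror_shift of 21 (2 MiB regions) and hypothetical helper names: reads that fall inside the same 2 MiB region land on the same child, while different regions spread across the tied children.

#include <stdint.h>
#include <stdio.h>

/* Same default as the patch: 1 << 21 = 2 MiB regions. */
static const int mirror_shift = 21;

/* Pick among 'cnt' equally loaded children using the I/O offset. */
static int
pick_child(uint64_t io_offset, int cnt)
{
	return ((int)((io_offset >> mirror_shift) % cnt));
}

int
main(void)
{
	uint64_t offsets[] = { 0, 1 << 20, 2 << 20, 3 << 20, 8 << 20 };
	int i;

	for (i = 0; i < 5; i++)
		printf("offset %llu MiB -> child %d of 2\n",
		    (unsigned long long)(offsets[i] >> 20),
		    pick_child(offsets[i], 2));
	/*
	 * 0 MiB and 1 MiB fall in the same 2 MiB region and map to child 0;
	 * 2 MiB and 3 MiB map to child 1; 8 MiB maps back to child 0.
	 */
	return (0);
}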
@@ -283,43 +340,70 @@ static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
-	int i, c;
+	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
-	/*
-	 * Try to find a child whose DTL doesn't contain the block to read.
-	 * If a child is known to be completely inaccessible (indicated by
-	 * vdev_readable() returning B_FALSE), don't even try.
-	 */
-	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
-		if (c >= mm->mm_children)
-			c = 0;
+	lowest_load = INT_MAX;
+	mm->mm_preferred_cnt = 0;
+	for (c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc;
+
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
+
 		if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
-			return (c);
-		mc->mc_error = SET_ERROR(ESTALE);
-		mc->mc_skipped = 1;
-		mc->mc_speculative = 1;
+
+		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			mc->mc_error = SET_ERROR(ESTALE);
+			mc->mc_skipped = 1;
+			mc->mc_speculative = 1;
+			continue;
+		}
+
+		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+		if (mc->mc_load > lowest_load)
+			continue;
+
+		if (mc->mc_load < lowest_load) {
+			lowest_load = mc->mc_load;
+			mm->mm_preferred_cnt = 0;
+		}
+		mm->mm_preferred[mm->mm_preferred_cnt] = c;
+		mm->mm_preferred_cnt++;
+	}
+
+	if (mm->mm_preferred_cnt == 1) {
+		vdev_queue_register_lastoffset(
+		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+		return (mm->mm_preferred[0]);
+	}
+
+	if (mm->mm_preferred_cnt > 1) {
+		int c = vdev_mirror_preferred_child_randomize(zio);
+
+		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+		return (c);
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
-	for (c = 0; c < mm->mm_children; c++)
-		if (!mm->mm_child[c].mc_tried)
+	for (c = 0; c < mm->mm_children; c++) {
+		if (!mm->mm_child[c].mc_tried) {
+			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+			    zio);
 			return (c);
+		}
+	}
 
 	/*
 	 * Every child failed. There's no place left to look.
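The rewritten vdev_mirror_child_select() above no longer returns the first acceptable child; it gathers every child whose load ties for the minimum into mm_preferred[] and only then randomizes among them. The sketch below isolates that collect-the-ties pattern under hypothetical names; it is not the ZFS function itself.

#include <assert.h>
#include <limits.h>

/*
 * Collect the indices of every entry sharing the minimum load, in the same
 * single pass vdev_mirror_child_select() uses above: a new minimum resets
 * the candidate list, an equal value appends to it.  Returns the count.
 */
static int
collect_lowest(const int *load, int n, int *preferred)
{
	int lowest = INT_MAX;
	int cnt = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (load[i] > lowest)
			continue;
		if (load[i] < lowest) {
			lowest = load[i];
			cnt = 0;	/* better candidate found: start over */
		}
		preferred[cnt++] = i;	/* new minimum or a tie */
	}
	return (cnt);
}

int
main(void)
{
	int load[] = { 3, 1, 1, 4 };
	int preferred[4];

	/* Children 1 and 2 tie at load 1; the offset-based pick runs next. */
	assert(collect_lowest(load, 4, preferred) == 2);
	assert(preferred[0] == 1 && preferred[1] == 2);
	return (0);
}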
@@ -334,7 +418,7 @@ vdev_mirror_io_start(zio_t *zio)
 	mirror_child_t *mc;
 	int c, children;
 
-	mm = vdev_mirror_map_alloc(zio);
+	mm = vdev_mirror_map_init(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
@@ -559,6 +643,25 @@ vdev_ops_t vdev_spare_ops = {
 };
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
-module_param(zfs_vdev_mirror_switch_us, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs");
+module_param(zfs_vdev_mirror_rotating_inc, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_rotating_inc,
+	"Rotating media load increment for non-seeking I/O's");
+
+module_param(zfs_vdev_mirror_rotating_seek_inc, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_inc,
+	"Rotating media load increment for seeking I/O's");
+
+module_param(zfs_vdev_mirror_rotating_seek_offset, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_offset,
+	"Offset in bytes from the last I/O which "
+	"triggers a reduced rotating media seek increment");
+
+module_param(zfs_vdev_mirror_non_rotating_inc, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_inc,
+	"Non-rotating media load increment for non-seeking I/O's");
+
+module_param(zfs_vdev_mirror_non_rotating_seek_inc, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_seek_inc,
+	"Non-rotating media load increment for seeking I/O's");
+
 #endif
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index e828ce917..af8af67de 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -374,6 +374,8 @@ vdev_queue_init(vdev_t *vd)
 		avl_create(vdev_queue_class_tree(vq, p), compfn,
 		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
+
+	vq->vq_lastoffset = 0;
 }
 
 void
@@ -776,6 +778,30 @@ vdev_queue_io_done(zio_t *zio)
 	mutex_exit(&vq->vq_lock);
 }
 
+/*
+ * As these three functions are only used for load calculations we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the lack
+ * of vq_lock mutex use here; instead we prefer to keep it lock free for
+ * performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+	return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 module_param(zfs_vdev_aggregation_limit, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
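The three vdev_queue accessors above deliberately read and update vq_lastoffset without taking vq_lock: the value only steers a heuristic, so an occasionally stale or torn read merely skews one load estimate. The toy example below, using hypothetical toy_queue_t/toy_io_t types, shows the semantics being relied on: an I/O counts as sequential when it starts exactly where the previously registered I/O ended (io_offset + io_size).

#include <stdint.h>
#include <assert.h>

/* Minimal stand-ins; the real code keeps this value in vdev_queue_t. */
typedef struct toy_queue {
	uint64_t	q_lastoffset;
} toy_queue_t;

typedef struct toy_io {
	uint64_t	io_offset;
	uint64_t	io_size;
} toy_io_t;

/* Mirrors vdev_queue_register_lastoffset(): remember where this I/O ends. */
static void
register_lastoffset(toy_queue_t *q, const toy_io_t *io)
{
	q->q_lastoffset = io->io_offset + io->io_size;
}

/* An I/O is "sequential" for load purposes when it starts exactly there. */
static int
is_sequential(const toy_queue_t *q, const toy_io_t *io)
{
	return (q->q_lastoffset == io->io_offset);
}

int
main(void)
{
	toy_queue_t q = { 0 };
	toy_io_t a = { .io_offset = 0, .io_size = 131072 };
	toy_io_t b = { .io_offset = 131072, .io_size = 131072 };
	toy_io_t c = { .io_offset = 1048576, .io_size = 131072 };

	register_lastoffset(&q, &a);
	assert(is_sequential(&q, &b));	/* b starts where a ended */
	assert(!is_sequential(&q, &c));	/* c seeks away, pays the increment */
	return (0);
}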