aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorAlexander Motin <[email protected]>2022-09-08 13:30:53 -0400
committerGitHub <[email protected]>2022-09-08 10:30:53 -0700
commit37f6845c6f86b1d04593e55d94318326006f4b5d (patch)
treed435ddb262caa5c5315597341196b42f3b55b5eb /module
parent320f0c6022e1c9bdc9063f849c6b2e4fa3b93995 (diff)
Improve too large physical ashift handling
When iterating through children's physical ashifts for a vdev, prefer ones above the maximum logical ashift, that we can actually use, but within the administrator defined maximum. When selecting the top-level vdev ashift, do not set it to the defined maximum in case the physical ashift is even higher, but just ignore it. Using the maximum does not prevent misaligned writes, but reduces space efficiency. Since ZFS tries to write data sequentially and aggregates the writes, in many cases large misaligned writes may not be as bad as the space penalty otherwise. Allow internal physical ashifts for vdevs higher than ASHIFT_MAX. Maybe one day the allocator or aggregation could benefit from that. Reduce the zfs_vdev_max_auto_ashift default from 16 (64KB) to 14 (16KB), so that ZFS may still use bigger ashifts up to ASHIFT_MAX (64KB), but only if it really has to or is explicitly told to, and not as an "optimization". There are some read-intensive NVMe SSDs that report a Preferred Write Alignment of 64KB, and an attempt to build a RAIDZ2 of those leads to a space inefficiency that can't be justified. Instead these changes make ZFS fall back to a logical ashift of 12 (4KB) by default and only warn the user that it may be suboptimal for performance. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes #13798
Diffstat (limited to 'module')
-rw-r--r--module/os/freebsd/zfs/vdev_geom.c3
-rw-r--r--module/zfs/vdev.c36
-rw-r--r--module/zfs/vdev_draid.c10
-rw-r--r--module/zfs/vdev_mirror.c10
-rw-r--r--module/zfs/vdev_raidz.c10
5 files changed, 58 insertions, 11 deletions
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index f3b4846f4..fef6a1b88 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -955,8 +955,7 @@ skip_open:
*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
*physical_ashift = 0;
if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
- ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
- pp->stripeoffset == 0)
+ ISP2(pp->stripesize) && pp->stripeoffset == 0)
*physical_ashift = highbit(pp->stripesize) - 1;
/*
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index ea0245610..048616c25 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -136,7 +136,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17);
*/
int zfs_nocacheflush = 0;
-uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
+/*
+ * Maximum and minimum ashift values that can be automatically set based on
+ * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX
+ * is higher than the maximum value, it is intentionally limited here to not
+ * excessively impact pool space efficiency. Higher ashift values may still
+ * be forced by vdev logical ashift or by user via ashift property, but won't
+ * be set automatically as a performance optimization.
+ */
+uint64_t zfs_vdev_max_auto_ashift = 14;
uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
void
@@ -1846,6 +1854,24 @@ vdev_set_deflate_ratio(vdev_t *vd)
}
/*
+ * Choose the best of two ashifts, preferring one between logical ashift
+ * (absolute minimum) and administrator defined maximum, otherwise take
+ * the biggest of the two.
+ */
+uint64_t
+vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
+{
+ if (a > logical && a <= zfs_vdev_max_auto_ashift) {
+ if (b <= logical || b > zfs_vdev_max_auto_ashift)
+ return (a);
+ else
+ return (MAX(a, b));
+ } else if (b <= logical || b > zfs_vdev_max_auto_ashift)
+ return (MAX(a, b));
+ return (b);
+}
+
+/*
* Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining
* administrator defined limits and ensuring it doesn't go below the
@@ -1856,7 +1882,8 @@ vdev_ashift_optimize(vdev_t *vd)
{
ASSERT(vd == vd->vdev_top);
- if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ if (vd->vdev_ashift < vd->vdev_physical_ashift &&
+ vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
vd->vdev_ashift = MIN(
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_vdev_min_auto_ashift,
@@ -4463,7 +4490,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift;
- vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ if (vd->vdev_physical_ashift <= ASHIFT_MAX)
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ else
+ vs->vs_physical_ashift = 0;
/*
* Report fragmentation and rebuild progress for top-level,
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 24034d9d9..24ea5d2cb 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
- physical_ashift = MAX(physical_ashift,
- cvd->vdev_physical_ashift);
+ }
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ continue;
+ physical_ashift = vdev_best_ashift(logical_ashift,
+ physical_ashift, cvd->vdev_physical_ashift);
}
*asizep = asize;
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 3879de680..f9a01c9f5 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
- *physical_ashift = MAX(*physical_ashift,
- cvd->vdev_physical_ashift);
+ }
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error)
+ continue;
+ *physical_ashift = vdev_best_ashift(*logical_ashift,
+ *physical_ashift, cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index b4daf642e..5a44983e5 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1527,8 +1527,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
- *physical_ashift = MAX(*physical_ashift,
- cvd->vdev_physical_ashift);
+ }
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error != 0)
+ continue;
+ *physical_ashift = vdev_best_ashift(*logical_ashift,
+ *physical_ashift, cvd->vdev_physical_ashift);
}
*asize *= vd->vdev_children;