about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- cmd/zpool/zpool_main.c 17
-rw-r--r-- include/libzfs.h 1
-rw-r--r-- include/os/freebsd/spl/sys/mod_os.h 6
-rw-r--r-- include/sys/fs/zfs.h 10
-rw-r--r-- include/sys/vdev.h 1
-rw-r--r-- include/sys/vdev_impl.h 29
-rw-r--r-- lib/libzfs/libzfs_status.c 91
-rw-r--r-- man/man5/zfs-module-parameters.5 23
-rw-r--r-- man/man8/zpool.8 6
-rw-r--r-- module/os/freebsd/zfs/sysctl_os.c 54
-rw-r--r-- module/os/freebsd/zfs/vdev_file.c 5
-rw-r--r-- module/os/freebsd/zfs/vdev_geom.c 11
-rw-r--r-- module/os/linux/zfs/vdev_disk.c 53
-rw-r--r-- module/os/linux/zfs/vdev_file.c 5
-rw-r--r-- module/zfs/arc.c 2
-rw-r--r-- module/zfs/spa.c 1
-rw-r--r-- module/zfs/spa_config.c 4
-rw-r--r-- module/zfs/vdev.c 83
-rw-r--r-- module/zfs/vdev_indirect.c 5
-rw-r--r-- module/zfs/vdev_mirror.c 6
-rw-r--r-- module/zfs/vdev_missing.c 3
-rw-r--r-- module/zfs/vdev_raidz.c 6
-rw-r--r-- module/zfs/vdev_root.c 3
23 files changed, 326 insertions, 99 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index de689afa7..f3756a5d9 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -2082,7 +2082,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs)
{
nvlist_t **child, *root;
- uint_t c, i, children;
+ uint_t c, i, vsc, children;
pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
char rbuf[6], wbuf[6], cbuf[6];
@@ -2099,7 +2099,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
children = 0;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
+ (uint64_t **)&vs, &vsc) == 0);
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
@@ -2199,6 +2199,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
(void) printf(gettext("unsupported feature(s)"));
break;
+ case VDEV_AUX_ASHIFT_TOO_BIG:
+ (void) printf(gettext("unsupported minimum blocksize"));
+ break;
+
case VDEV_AUX_SPARED:
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&spare_cb.cb_guid) == 0);
@@ -8105,6 +8109,15 @@ status_callback(zpool_handle_t *zhp, void *data)
"'zpool clear'.\n"));
break;
+ case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
+ (void) printf(gettext("status: One or more devices are "
+ "configured to use a non-native block size.\n"
+ "\tExpect reduced performance.\n"));
+ (void) printf(gettext("action: Replace affected devices with "
+ "devices that support the\n\tconfigured block size, or "
+ "migrate data to a properly configured\n\tpool.\n"));
+ break;
+
case ZPOOL_STATUS_HOSTID_MISMATCH:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid"
diff --git a/include/libzfs.h b/include/libzfs.h
index b405ad1e1..4e6336180 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -390,6 +390,7 @@ typedef enum {
ZPOOL_STATUS_REMOVED_DEV, /* removed device */
ZPOOL_STATUS_REBUILDING, /* device being rebuilt */
ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */
+ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */
/*
* Finally, the following indicates a healthy pool.
diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h
index 9a3b29e1e..ec1da1a46 100644
--- a/include/os/freebsd/spl/sys/mod_os.h
+++ b/include/os/freebsd/spl/sys/mod_os.h
@@ -78,6 +78,12 @@
#define param_set_slop_shift_args(var) \
CTLTYPE_INT, &var, 0, param_set_slop_shift, "I"
+#define param_set_min_auto_ashift_args(var) \
+ CTLTYPE_U64, &var, 0, param_set_min_auto_ashift, "QU"
+
+#define param_set_max_auto_ashift_args(var) \
+ CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU"
+
#include <sys/kernel.h>
#define module_init(fn) \
static void \
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 001893b71..d3acd674a 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -873,6 +873,7 @@ typedef enum vdev_aux {
VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */
VDEV_AUX_ACTIVE, /* vdev active on a different host */
VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */
+ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */
} vdev_aux_t;
/*
@@ -1068,8 +1069,17 @@ typedef struct vdev_stat {
uint64_t vs_trim_state; /* vdev_trim_state_t */
uint64_t vs_trim_action_time; /* time_t */
uint64_t vs_rebuild_processed; /* bytes rebuilt */
+ uint64_t vs_configured_ashift; /* TLV vdev_ashift */
+ uint64_t vs_logical_ashift; /* vdev_logical_ashift */
+ uint64_t vs_physical_ashift; /* vdev_physical_ashift */
} vdev_stat_t;
+/*
+ * VDEV_STAT_VALID(field, uint64_t_field_count)
+ *
+ * Evaluates to non-zero when a vdev_stat_t delivered as an array of
+ * uint64_t_field_count uint64_t entries is long enough to contain all of
+ * `field', i.e. the array's byte length reaches the end of that member.
+ * The count argument is parenthesized so that an expression (for example
+ * `c + 1') is multiplied as a whole.
+ */
+/* BEGIN CSTYLED */
+#define VDEV_STAT_VALID(field, uint64_t_field_count) \
+ (((uint64_t_field_count) * sizeof (uint64_t)) >= \
+ (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field)))
+/* END CSTYLED */
+
/*
* Extended stats
*
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index a7e880636..797065fdd 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -94,6 +94,7 @@ extern void vdev_rele(vdev_t *);
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_ashift_optimize(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index b9298c62d..90d607746 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -69,7 +69,7 @@ extern uint32_t zfs_vdev_async_write_max_active;
* Virtual device operations
*/
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
- uint64_t *ashift);
+ uint64_t *ashift, uint64_t *pshift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef void vdev_io_start_func_t(zio_t *zio);
@@ -216,6 +216,25 @@ struct vdev {
uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_max_asize; /* max acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */
+
+ /*
+ * Logical block alignment shift
+ *
+ * The smallest sized/aligned I/O supported by the device.
+ */
+ uint64_t vdev_logical_ashift;
+ /*
+ * Physical block alignment shift
+ *
+ * The device supports logical I/Os with vdev_logical_ashift
+ * size/alignment, but optimum performance will be achieved by
+ * aligning/sizing requests to vdev_physical_ashift. Smaller
+ * requests may be inflated or incur device level read-modify-write
+ * operations.
+ *
+ * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
+ */
+ uint64_t vdev_physical_ashift;
uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */
vdev_ops_t *vdev_ops; /* vdev operations */
@@ -586,6 +605,14 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
*/
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
+/*
+ * Vdev ashift optimization tunables
+ */
+extern uint64_t zfs_vdev_min_auto_ashift;
+extern uint64_t zfs_vdev_max_auto_ashift;
+int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS);
+int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
index 67b8ea33e..435937041 100644
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@@ -43,6 +43,7 @@
#include <libzfs.h>
#include <libzutil.h>
+#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/systeminfo.h>
@@ -94,57 +95,69 @@ static char *zfs_msgid_table[] = {
/* ARGSUSED */
static int
-vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_missing(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_CANT_OPEN &&
- aux == VDEV_AUX_OPEN_FAILED);
+ return (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+ vs->vs_aux == VDEV_AUX_OPEN_FAILED);
}
/* ARGSUSED */
static int
-vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_faulted(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_FAULTED);
+ return (vs->vs_state == VDEV_STATE_FAULTED);
}
/* ARGSUSED */
static int
-vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_errors(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_DEGRADED || errs != 0);
+ return (vs->vs_state == VDEV_STATE_DEGRADED ||
+ vs->vs_read_errors != 0 || vs->vs_write_errors != 0 ||
+ vs->vs_checksum_errors != 0);
}
/* ARGSUSED */
static int
-vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_broken(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_CANT_OPEN);
+ return (vs->vs_state == VDEV_STATE_CANT_OPEN);
}
/* ARGSUSED */
static int
-vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_offlined(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_OFFLINE);
+ return (vs->vs_state == VDEV_STATE_OFFLINE);
}
/* ARGSUSED */
static int
-vdev_removed(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_removed(vdev_stat_t *vs, uint_t vsc)
{
- return (state == VDEV_STATE_REMOVED);
+ return (vs->vs_state == VDEV_STATE_REMOVED);
+}
+
+/*
+ * find_vdev_problem() predicate: returns non-zero when the vdev's configured
+ * ashift is smaller than its physical ashift.
+ *
+ * vs  - vdev statistics
+ * vsc - number of uint64_t entries backing vs
+ *
+ * Always returns 0 when ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE is present in
+ * the environment, or when the stats array is too short to contain
+ * vs_physical_ashift.
+ */
+static int
+vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
+{
+	const char *ignore = getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE");
+
+	/* Only the presence of the variable matters, not its value. */
+	if (ignore != NULL)
+		return (0);
+
+	/* Stats array too short: the ashift fields cannot be read. */
+	if (!VDEV_STAT_VALID(vs_physical_ashift, vsc))
+		return (0);
+
+	return (vs->vs_configured_ashift < vs->vs_physical_ashift);
+}
/*
* Detect if any leaf devices that have seen errors or could not be opened.
*/
static boolean_t
-find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
+find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
+ boolean_t ignore_replacing)
{
nvlist_t **child;
vdev_stat_t *vs;
- uint_t c, children;
- char *type;
+ uint_t c, vsc, children;
/*
* Ignore problems within a 'replacing' vdev, since we're presumably in
@@ -152,23 +165,25 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
* out again. We'll pick up the fact that a resilver is happening
* later.
*/
- verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
- if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
- return (B_FALSE);
+ if (ignore_replacing == B_TRUE) {
+ char *type;
+
+ verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE,
+ &type) == 0);
+ if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
+ return (B_FALSE);
+ }
if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
&children) == 0) {
for (c = 0; c < children; c++)
- if (find_vdev_problem(child[c], func))
+ if (find_vdev_problem(child[c], func, ignore_replacing))
return (B_TRUE);
} else {
verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t **)&vs, &c) == 0);
+ (uint64_t **)&vs, &vsc) == 0);
- if (func(vs->vs_state, vs->vs_aux,
- vs->vs_read_errors +
- vs->vs_write_errors +
- vs->vs_checksum_errors))
+ if (func(vs, vsc) != 0)
return (B_TRUE);
}
@@ -178,7 +193,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child,
&children) == 0) {
for (c = 0; c < children; c++)
- if (find_vdev_problem(child[c], func))
+ if (find_vdev_problem(child[c], func, ignore_replacing))
return (B_TRUE);
}
@@ -362,15 +377,15 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
* Bad devices in non-replicated config.
*/
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_faulted))
+ find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
return (ZPOOL_STATUS_FAULTED_DEV_NR);
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_missing))
+ find_vdev_problem(nvroot, vdev_missing, B_TRUE))
return (ZPOOL_STATUS_MISSING_DEV_NR);
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- find_vdev_problem(nvroot, vdev_broken))
+ find_vdev_problem(nvroot, vdev_broken, B_TRUE))
return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
/*
@@ -392,32 +407,38 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
/*
* Missing devices in a replicated config.
*/
- if (find_vdev_problem(nvroot, vdev_faulted))
+ if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
return (ZPOOL_STATUS_FAULTED_DEV_R);
- if (find_vdev_problem(nvroot, vdev_missing))
+ if (find_vdev_problem(nvroot, vdev_missing, B_TRUE))
return (ZPOOL_STATUS_MISSING_DEV_R);
- if (find_vdev_problem(nvroot, vdev_broken))
+ if (find_vdev_problem(nvroot, vdev_broken, B_TRUE))
return (ZPOOL_STATUS_CORRUPT_LABEL_R);
/*
* Devices with errors
*/
- if (!isimport && find_vdev_problem(nvroot, vdev_errors))
+ if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE))
return (ZPOOL_STATUS_FAILING_DEV);
/*
* Offlined devices
*/
- if (find_vdev_problem(nvroot, vdev_offlined))
+ if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE))
return (ZPOOL_STATUS_OFFLINE_DEV);
/*
* Removed device
*/
- if (find_vdev_problem(nvroot, vdev_removed))
+ if (find_vdev_problem(nvroot, vdev_removed, B_TRUE))
return (ZPOOL_STATUS_REMOVED_DEV);
/*
+ * Suboptimal, but usable, ashift configuration.
+ */
+ if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE))
+ return (ZPOOL_STATUS_NON_NATIVE_ASHIFT);
+
+ /*
* Informational errata available.
*/
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRATA, &errata);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 463e51acc..853e8fc94 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -489,6 +489,29 @@ Default value: \fB29\fR [meaning (1 << 29) = 512MB].
.sp
.ne 2
.na
+\fBzfs_vdev_max_auto_ashift\fR (ulong)
+.ad
+.RS 12n
+Maximum ashift used when optimizing for logical -> physical sector size on new
+top-level vdevs.
+.sp
+Default value: \fBASHIFT_MAX\fR (16).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_min_auto_ashift\fR (ulong)
+.ad
+.RS 12n
+Minimum ashift used when creating new top-level vdevs.
+.sp
+Default value: \fBASHIFT_MIN\fR (9).
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_vdev_min_ms_count\fR (int)
.ad
.RS 12n
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index c23fa0591..0fe6866f3 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -470,6 +470,12 @@ The maximum time in milliseconds that
.Nm zpool import
will wait for an expected device to be available.
.El
+.Bl -tag -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE"
+.It Ev ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE
+If set, suppress warning about non-native vdev ashift in
+.Nm zpool status .
+The value is not used, only the presence or absence of the variable matters.
+.El
.Bl -tag -width "ZPOOL_VDEV_NAME_GUID"
.It Ev ZPOOL_VDEV_NAME_GUID
Cause
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index d76b49de9..200bbf43d 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#include <sys/dmu.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
@@ -518,56 +519,53 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
/* vdev.c */
-#ifdef notyet
-extern uint64_t zfs_max_auto_ashift;
-extern uint64_t zfs_min_auto_ashift;
-
-static int
-sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+int
+param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
- val = zfs_max_auto_ashift;
+ val = zfs_vdev_min_auto_ashift;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
- return (err);
+ return (SET_ERROR(err));
- if (val > ASHIFT_MAX || val < zfs_min_auto_ashift)
- return (EINVAL);
+ if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+ return (SET_ERROR(EINVAL));
- zfs_max_auto_ashift = val;
+ zfs_vdev_min_auto_ashift = val;
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t),
- sysctl_vfs_zfs_max_auto_ashift, "QU",
- "Max ashift used when optimising for logical -> physical sectors size on "
- "new top-level vdevs.");
-static int
-sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+
+int
+param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
{
uint64_t val;
int err;
- val = zfs_min_auto_ashift;
+ val = zfs_vdev_max_auto_ashift;
err = sysctl_handle_64(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
- return (err);
+ return (SET_ERROR(err));
- if (val < ASHIFT_MIN || val > zfs_max_auto_ashift)
- return (EINVAL);
+ if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+ return (SET_ERROR(EINVAL));
- zfs_min_auto_ashift = val;
+ zfs_vdev_max_auto_ashift = val;
return (0);
}
-SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
- CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t),
- sysctl_vfs_zfs_min_auto_ashift, "QU",
- "Min ashift used when creating new top-level vdevs.");
-#endif
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN,
+ &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+ param_set_min_auto_ashift, "QU",
+ "Min ashift used when creating new top-level vdev. (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN,
+ &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+ param_set_max_auto_ashift, "QU",
+ "Max ashift used when optimizing for logical -> physical sector size on "
+ "new top-level vdevs. (LEGACY)");
/*
* Since the DTL space map of a vdev is not expected to have a lot of
diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c
index cca6bffd9..4d27751c8 100644
--- a/module/os/freebsd/zfs/vdev_file.c
+++ b/module/os/freebsd/zfs/vdev_file.c
@@ -83,7 +83,7 @@ vdev_file_open_mode(spa_mode_t spa_mode)
static int
vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_file_t *vf;
zfs_file_t *fp;
@@ -167,7 +167,8 @@ skip_open:
}
*max_psize = *psize = zfa.zfa_size;
- *ashift = SPA_MINBLOCKSHIFT;
+ *logical_ashift = SPA_MINBLOCKSHIFT;
+ *physical_ashift = SPA_MINBLOCKSHIFT;
return (0);
}
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 8462755f8..bf06f6919 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -802,7 +802,7 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid)
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *logical_ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
struct g_provider *pp;
struct g_consumer *cp;
@@ -949,11 +949,12 @@ skip_open:
* transfer size.
*/
*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
-#ifdef notyet
- if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
- pp->stripesize <= (1 << ASHIFT_MAX) && pp->stripeoffset == 0)
+ *physical_ashift = 0;
+ if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
+ ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
+ pp->stripeoffset == 0)
*physical_ashift = highbit(pp->stripesize) - 1;
-#endif
+
/*
* Clear the nowritecache settings, so that on a vdev_reopen()
* we will try again.
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 5869b474d..5a2245436 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -159,7 +159,7 @@ vdev_disk_error(zio_t *zio)
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
struct block_device *bdev;
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
@@ -270,7 +270,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
struct request_queue *q = bdev_get_queue(vd->vd_bdev);
/* Determine the physical block size */
- int block_size = bdev_physical_block_size(vd->vd_bdev);
+ int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
+
+ /* Determine the logical block size */
+ int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
v->vdev_nowritecache = B_FALSE;
@@ -291,7 +294,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
/* Based on the minimum sector size set the block size */
- *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+ *physical_ashift = highbit64(MAX(physical_block_size,
+ SPA_MINBLOCKSIZE)) - 1;
+
+ *logical_ashift = highbit64(MAX(logical_block_size,
+ SPA_MINBLOCKSIZE)) - 1;
return (0);
}
@@ -824,3 +831,43 @@ char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
+
+/*
+ * Module parameter setter for zfs_vdev_min_auto_ashift.
+ *
+ * The string in buf is first parsed with kstrtoull() (base 0) so the value
+ * can be range checked: it must lie within
+ * [ASHIFT_MIN, zfs_vdev_max_auto_ashift], otherwise -EINVAL is returned.
+ * A value that passes is then handed, still as a string, to
+ * param_set_ulong() together with kp.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+	uint64_t ashift;
+	int rc;
+
+	rc = kstrtoull(buf, 0, &ashift);
+	if (rc < 0)
+		return (SET_ERROR(rc));
+
+	if (ashift > zfs_vdev_max_auto_ashift || ashift < ASHIFT_MIN)
+		return (SET_ERROR(-EINVAL));
+
+	rc = param_set_ulong(buf, kp);
+	if (rc >= 0)
+		return (0);
+
+	return (SET_ERROR(rc));
+}
+
+/*
+ * Module parameter setter for zfs_vdev_max_auto_ashift.
+ *
+ * The string in buf is first parsed with kstrtoull() (base 0) so the value
+ * can be range checked: it must lie within
+ * [zfs_vdev_min_auto_ashift, ASHIFT_MAX], otherwise -EINVAL is returned.
+ * A value that passes is then handed, still as a string, to
+ * param_set_ulong() together with kp.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+	uint64_t ashift;
+	int rc;
+
+	rc = kstrtoull(buf, 0, &ashift);
+	if (rc < 0)
+		return (SET_ERROR(rc));
+
+	if (ashift < zfs_vdev_min_auto_ashift || ashift > ASHIFT_MAX)
+		return (SET_ERROR(-EINVAL));
+
+	rc = param_set_ulong(buf, kp);
+	if (rc >= 0)
+		return (0);
+
+	return (SET_ERROR(rc));
+}
diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
index 592ba2b4a..a4e71ca40 100644
--- a/module/os/linux/zfs/vdev_file.c
+++ b/module/os/linux/zfs/vdev_file.c
@@ -75,7 +75,7 @@ vdev_file_open_mode(spa_mode_t spa_mode)
static int
vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_file_t *vf;
zfs_file_t *fp;
@@ -159,7 +159,8 @@ skip_open:
}
*max_psize = *psize = zfa.zfa_size;
- *ashift = SPA_MINBLOCKSHIFT;
+ *logical_ashift = SPA_MINBLOCKSHIFT;
+ *physical_ashift = SPA_MINBLOCKSHIFT;
return (0);
}
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index fc62af7c7..bd1a993dc 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -9283,6 +9283,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
ASSERT(!l2arc_vdev_present(vd));
+ vdev_ashift_optimize(vd);
+
/*
* Create a new l2arc device entry.
*/
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index e358404db..1e3728d93 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -5747,6 +5747,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
+ vdev_ashift_optimize(vd);
vdev_metaslab_set_size(vd);
vdev_expand(vd, txg);
}
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 95dd19844..cc65a00d9 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -576,8 +576,10 @@ spa_config_update(spa_t *spa, int what)
(tvd->vdev_islog && tvd->vdev_removing))
continue;
- if (tvd->vdev_ms_array == 0)
+ if (tvd->vdev_ms_array == 0) {
+ vdev_ashift_optimize(tvd);
vdev_metaslab_set_size(tvd);
+ }
vdev_expand(tvd, txg);
}
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index a51e427f8..1844a5653 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -110,6 +110,9 @@ int zfs_vdev_standard_sm_blksz = (1 << 17);
*/
int zfs_nocacheflush = 0;
+uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
+uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
@@ -1176,6 +1179,8 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd->vdev_max_asize = cvd->vdev_max_asize;
mvd->vdev_psize = cvd->vdev_psize;
mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+ mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
mvd->vdev_state = cvd->vdev_state;
mvd->vdev_crtxg = cvd->vdev_crtxg;
@@ -1207,7 +1212,8 @@ vdev_remove_parent(vdev_t *cvd)
mvd->vdev_ops == &vdev_replacing_ops ||
mvd->vdev_ops == &vdev_spare_ops);
cvd->vdev_ashift = mvd->vdev_ashift;
-
+ cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+ cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
vdev_remove_child(mvd, cvd);
vdev_remove_child(pvd, mvd);
@@ -1677,7 +1683,8 @@ vdev_open(vdev_t *vd)
uint64_t osize = 0;
uint64_t max_osize = 0;
uint64_t asize, max_asize, psize;
- uint64_t ashift = 0;
+ uint64_t logical_ashift = 0;
+ uint64_t physical_ashift = 0;
ASSERT(vd->vdev_open_thread == curthread ||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@@ -1707,8 +1714,8 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(ENXIO));
}
- error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
-
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+ &logical_ashift, &physical_ashift);
/*
* Physical volume size should never be larger than its max size, unless
* the disk has shrunk while we were reading it or the device is buggy
@@ -1823,6 +1830,17 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(EINVAL));
}
+ vd->vdev_physical_ashift =
+ MAX(physical_ashift, vd->vdev_physical_ashift);
+ vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
+ vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
+
+ if (vd->vdev_logical_ashift > ASHIFT_MAX) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (SET_ERROR(EDOM));
+ }
+
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
@@ -1830,9 +1848,6 @@ vdev_open(vdev_t *vd)
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
- if (vd->vdev_ashift == 0) {
- vd->vdev_ashift = ashift; /* use detected value */
- }
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1841,16 +1856,17 @@ vdev_open(vdev_t *vd)
}
} else {
/*
- * Detect if the alignment requirement has increased.
- * We don't want to make the pool unavailable, just
- * post an event instead.
+ * Make sure the alignment required hasn't increased.
*/
- if (ashift > vd->vdev_top->vdev_ashift &&
+ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, NULL, 0, 0);
- }
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (SET_ERROR(EDOM));
+ }
vd->vdev_max_asize = max_asize;
}
@@ -2428,6 +2444,35 @@ vdev_metaslab_set_size(vdev_t *vd)
ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}
+/*
+ * Tune the configured ashift of a top-level vdev.
+ *
+ * When the current ashift is below the physical ashift it is raised towards
+ * the physical value, bounded by the zfs_vdev_min_auto_ashift and
+ * zfs_vdev_max_auto_ashift tunables. The result is never lower than the
+ * ashift the vdev already had. Vdevs that are not their own vdev_top are
+ * left unchanged.
+ */
+void
+vdev_ashift_optimize(vdev_t *vd)
+{
+	uint64_t ceiling, target;
+
+	if (vd != vd->vdev_top)
+		return;
+
+	if (vd->vdev_ashift >= vd->vdev_physical_ashift) {
+		/*
+		 * Unusual case: the current (logical) ashift is already at
+		 * or above the physical ashift. Capping it with the max
+		 * tunable would cause failures, so only the min tunable is
+		 * applied here.
+		 */
+		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+		    vd->vdev_ashift);
+		return;
+	}
+
+	/* Upper bound: the max tunable, but never below the current ashift. */
+	ceiling = MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift);
+	/* Desired value: the physical ashift, raised to the min tunable. */
+	target = MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift);
+
+	vd->vdev_ashift = MIN(ceiling, target);
+}
+
+
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
@@ -4083,6 +4128,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
1ULL << tvd->vdev_ms_shift);
}
+ vs->vs_configured_ashift = vd->vdev_top != NULL
+ ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+ vs->vs_logical_ashift = vd->vdev_logical_ashift;
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+
/*
* Report fragmentation and rebuild progress for top-level,
* non-auxiliary, concrete devices.
@@ -5028,4 +5078,13 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
"Disable cache flushes");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
+ param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
+ "Minimum ashift used when creating new top-level vdevs");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
+ param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
+ "Maximum ashift used when optimizing for logical -> physical sector "
+ "size on new top-level vdevs");
/* END CSTYLED */
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 4cd83d79e..6a944f4e8 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -950,11 +950,12 @@ vdev_indirect_close(vdev_t *vd)
/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
*psize = *max_psize = vd->vdev_asize +
VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
- *ashift = vd->vdev_ashift;
+ *logical_ashift = vd->vdev_ashift;
+ *physical_ashift = vd->vdev_physical_ashift;
return (0);
}
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 094530e9b..5e1060f12 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -366,7 +366,7 @@ vdev_mirror_map_init(zio_t *zio)
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
int numerrors = 0;
int lasterror = 0;
@@ -389,7 +389,9 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
+ /* Fold in this child's (cvd) ashifts; the mirror takes the largest. */
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c
index 205b23eba..ce90df6e8 100644
--- a/module/zfs/vdev_missing.c
+++ b/module/zfs/vdev_missing.c
@@ -45,7 +45,7 @@
/* ARGSUSED */
static int
vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *ashift, uint64_t *pshift)
{
/*
* Really this should just fail. But then the root vdev will be in the
@@ -56,6 +56,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
*psize = 0;
*max_psize = 0;
*ashift = 0;
+ *pshift = 0;
return (0);
}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index be3466673..8d4962805 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1554,7 +1554,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_t *cvd;
uint64_t nparity = vd->vdev_nparity;
@@ -1583,7 +1583,9 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
}
*asize *= vd->vdev_children;
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index ce79f7c73..9e8aac7d0 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -82,7 +82,7 @@ too_many_errors(vdev_t *vd, uint64_t numerrors)
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *ashift)
+ uint64_t *ashift, uint64_t *pshift)
{
spa_t *spa = vd->vdev_spa;
int lasterror = 0;
@@ -116,6 +116,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = 0;
*max_asize = 0;
*ashift = 0;
+ *pshift = 0;
return (0);
}