diff options
author | Brian Behlendorf <[email protected]> | 2012-11-15 11:43:54 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2012-11-15 11:51:23 -0800 |
commit | 54602c37718eca0dbeb668321edf5dfc41dcbe93 (patch) | |
tree | 1dcd5fc3ccef64e881c660dd68fb752ac56b4147 | |
parent | 3997bc74351d608e1a09f8ba8d58fb1c12fff331 (diff) | |
parent | df83110856950c8e7b16a7e94cdf42b8531b9cc8 (diff) |
Merge branch 'ashift'
This branch adds some overdue ashift improvements.
* Add '-o ashift' to 'zpool add' and 'zpool attach'
* Improve AF hard disk detection
* Allow 'zpool import' to handle increases in ashift
Signed-off-by: Brian Behlendorf <[email protected]>
-rw-r--r-- | cmd/zpool/zpool_main.c | 53 | ||||
-rw-r--r-- | config/kernel-bdev-physical-size.m4 | 39 | ||||
-rw-r--r-- | config/kernel.m4 | 1 | ||||
-rw-r--r-- | include/linux/blkdev_compat.h | 24 | ||||
-rw-r--r-- | include/sys/fm/fs/zfs.h | 2 | ||||
-rw-r--r-- | man/man8/zpool.8 | 31 | ||||
-rw-r--r-- | module/zfs/vdev.c | 13 | ||||
-rw-r--r-- | module/zfs/zfs_fm.c | 4 |
8 files changed, 144 insertions, 23 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 56617323f..a684f3bbb 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. + * Copyright (c) 2012 by Cyril Plisko. All rights reserved. */ #include <assert.h> @@ -199,10 +200,11 @@ static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fn] <pool> <vdev> ...\n")); + return (gettext("\tadd [-fn] [-o property=value] " + "<pool> <vdev> ...\n")); case HELP_ATTACH: - return (gettext("\tattach [-f] <pool> <device> " - "<new-device>\n")); + return (gettext("\tattach [-f] [-o property=value] " + "<pool> <device> <new-device>\n")); case HELP_CLEAR: return (gettext("\tclear [-nF] <pool> [device]\n")); case HELP_CREATE: @@ -436,11 +438,12 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, } /* - * zpool add [-fn] <pool> <vdev> ... + * zpool add [-fn] [-o property=value] <pool> <vdev> ... * * -f Force addition of devices, even if they appear in use * -n Do not add the devices, but display the resulting layout if * they were to be added. + * -o Set property=value. * * Adds the given vdevs to 'pool'. 
As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to @@ -457,9 +460,11 @@ zpool_do_add(int argc, char **argv) int ret; zpool_handle_t *zhp; nvlist_t *config; + nvlist_t *props = NULL; + char *propval; /* check options */ - while ((c = getopt(argc, argv, "fn")) != -1) { + while ((c = getopt(argc, argv, "fno:")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -467,6 +472,19 @@ zpool_do_add(int argc, char **argv) case 'n': dryrun = B_TRUE; break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -503,7 +521,7 @@ zpool_do_add(int argc, char **argv) } /* pass off to get_vdev_spec for processing */ - nvroot = make_root_vdev(zhp, NULL, force, !force, B_FALSE, dryrun, + nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); @@ -536,6 +554,7 @@ zpool_do_add(int argc, char **argv) ret = (zpool_add(zhp, nvroot) != 0); } + nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); @@ -2865,6 +2884,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; + nvlist_t *props = NULL; + char *propval; int ret; /* check options */ @@ -2873,6 +2894,19 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) case 'f': force = B_TRUE; break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, 
&props, B_TRUE))) + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -2929,7 +2963,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } - nvroot = make_root_vdev(zhp, NULL, force, B_FALSE, replacing, B_FALSE, + nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); @@ -2959,9 +2993,10 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-f] <pool> <device> <new_device> + * zpool attach [-f] [-o property=value] <pool> <device> <new_device> * * -f Force attach, even if <new_device> appears to be in use. + * -o Set property=value. * * Attach <new_device> to the mirror containing <device>. If <device> is not * part of a mirror, then <device> will be transformed into a mirror of @@ -3736,7 +3771,7 @@ print_dedup_stats(nvlist_t *config) /* * If the pool was faulted then we may not have been able to - * obtain the config. Otherwise, if have anything in the dedup + * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, diff --git a/config/kernel-bdev-physical-size.m4 b/config/kernel-bdev-physical-size.m4 new file mode 100644 index 000000000..0a1fe8e26 --- /dev/null +++ b/config/kernel-bdev-physical-size.m4 @@ -0,0 +1,39 @@ +dnl # +dnl # 2.6.30 API change +dnl # +dnl # The bdev_physical_block_size() interface was added to provide a way +dnl # to determine the smallest write which can be performed without a +dnl # read-modify-write operation. From the kernel documentation: +dnl # +dnl # What: /sys/block/<disk>/queue/physical_block_size +dnl # Date: May 2009 +dnl # Contact: Martin K. Petersen <[email protected]> +dnl # Description: +dnl # This is the smallest unit the storage device can write +dnl # without resorting to read-modify-write operation. 
It is +dnl # usually the same as the logical block size but may be +dnl # bigger. One example is SATA drives with 4KB sectors +dnl # that expose a 512-byte logical block size to the +dnl # operating system. +dnl # +dnl # Unfortunately, this interface isn't entirely reliable because +dnl # drives are sometimes known to misreport this value. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Wno-unused-but-set-variable" + ZFS_LINUX_TRY_COMPILE([ + #include <linux/blkdev.h> + ],[ + struct block_device *bdev = NULL; + bdev_physical_block_size(bdev); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_PHYSICAL_BLOCK_SIZE, 1, + [bdev_physical_block_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 13238d8ac..34969c316 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -14,6 +14,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BIO_EMPTY_BARRIER ZFS_AC_KERNEL_BIO_FAILFAST ZFS_AC_KERNEL_BIO_FAILFAST_DTD diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h index a5294ceba..1ff8eeaf3 100644 --- a/include/linux/blkdev_compat.h +++ b/include/linux/blkdev_compat.h @@ -394,13 +394,27 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) /* * 2.6.30 API change - * Change to make it explicit there this is the logical block size. + * To ensure good performance preferentially use the physical block size + * for proper alignment. The physical size is supposed to be the internal + * sector size used by the device. This is often 4096 bytes for AF devices, + * while a smaller 512 byte logical size is supported for compatibility.
+ * + * Unfortunately, many drives still misreport their physical sector size. + * For devices which are known to lie you may need to manually set this + * at pool creation time with 'zpool create -o ashift=12 ...'. + * + * When the physical block size interface isn't available, we fall back to + * the logical block size interface and then the older hard sector size. */ -#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE -# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) +#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE +# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) #else -# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) -#endif +# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE +# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) +# else +# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) +# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ +#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ /* * 2.6.37 API change diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 15803c034..d5c6004c2 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -47,6 +47,7 @@ extern "C" { #define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" #define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" #define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" +#define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift" #define FM_EREPORT_ZFS_DEVICE_REMOVE "vdev.remove" #define FM_EREPORT_ZFS_DEVICE_CLEAR "vdev.clear" #define FM_EREPORT_ZFS_DEVICE_CHECK "vdev.check" @@ -71,6 +72,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 
1ac30507b..826a6e788 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -2,6 +2,7 @@ .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Nexenta Systems, Inc. All rights reserved. .\" Copyright (c) 2012 by Delphix. All Rights Reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] @@ -16,12 +17,12 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ... +\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ... .fi .LP .nf -\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR +\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR .fi .LP @@ -711,7 +712,7 @@ Displays a help message. .ne 2 .mk .na -\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR +\fB\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ...\fR .ad .sp .6 .RS 4n @@ -738,6 +739,17 @@ Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting r Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing. 
.RE +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift". +.RE + Do not add a disk that is currently configured as a quorum device to a zpool. After a disk is in the pool, that disk can then be configured as a quorum device. .RE @@ -745,7 +757,7 @@ Do not add a disk that is currently configured as a quorum device to a zpool. Af .ne 2 .mk .na -\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR +\fB\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR .ad .sp .6 .RS 4n @@ -761,6 +773,17 @@ Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift". +.RE + .RE .sp diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7d6d5278a..e0d82e673 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1271,13 +1271,16 @@ vdev_open(vdev_t *vd) vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* - * Make sure the alignment requirement hasn't increased. + * Detect if the alignment requirement has increased. + * We don't want to make the pool unavailable, just + * post an event instead.
*/ - if (ashift > vd->vdev_top->vdev_ashift) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); + if (ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { + zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, + spa, vd, NULL, 0, 0); } + vd->vdev_max_asize = max_asize; } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index db6a831d2..820291bf4 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -267,6 +267,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); + if (vd->vdev_ashift) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, + DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (pvd != NULL) { fm_payload_set(ereport, |