aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2014-01-24 15:47:46 -0800
committerBrian Behlendorf <[email protected]>2014-04-02 13:10:08 -0700
commit904ea2763e6576f6971be4a684e6765aaea5221c (patch)
tree5ce537bd1bec2b7cbaa3c9fb2dfa3c5c86722f8d
parentd21705eab952e5aa1c0bcf920b76e8428384d80b (diff)
Add automatic hot spare functionality
When a vdev starts getting I/O or checksum errors it is now possible to automatically rebuild to a hot spare device. To cleanly support this functionality in a shell script some additional information was added to all zevent ereports which include a vdev. This covers both io and checksum zevents but may be used by other scripts. In the Illumos FMA solution the same information is required but it is retrieved through the libzfs library interface. Specifically the following members were added: vdev_spare_paths - List of vdev paths for all hot spares. vdev_spare_guids - List of vdev guids for all hot spares. vdev_read_errors - Read errors for the problematic vdev vdev_write_errors - Write errors for the problematic vdev vdev_cksum_errors - Checksum errors for the problematic vdev. By default the required hot spare scripts are installed but this functionality is disabled. To enable hot sparing uncomment the ZED_SPARE_ON_IO_ERRORS and ZED_SPARE_ON_CHECKSUM_ERRORS in the /etc/zfs/zed.d/zed.rc configuration file. These scripts do not add support for the autoexpand property. At a minimum this requires adding a new udev rule to detect when a new device is added to the system. It also requires that the autoexpand policy be ported from Illumos, see: https://github.com/illumos/illumos-gate/blob/master/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c Support for detecting the correct name of a vdev when it's not a whole disk was added by Turbo Fredriksson. Signed-off-by: Brian Behlendorf <[email protected]> Signed-off-by: Chris Dunlap <[email protected]> Signed-off-by: Turbo Fredriksson <[email protected]> Issue #2
-rw-r--r--cmd/zed/Makefile.am4
l---------cmd/zed/zed.d/checksum-spare.sh1
-rwxr-xr-xcmd/zed/zed.d/io-spare.sh125
-rw-r--r--cmd/zed/zed.d/zed.rc6
-rw-r--r--include/sys/fm/fs/zfs.h5
-rw-r--r--module/zfs/spa.c2
-rw-r--r--module/zfs/zfs_fm.c56
7 files changed, 190 insertions, 9 deletions
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
index 8e4efe919..f1404dea1 100644
--- a/cmd/zed/Makefile.am
+++ b/cmd/zed/Makefile.am
@@ -39,17 +39,21 @@ dist_zedexec_SCRIPTS = \
$(top_srcdir)/cmd/zed/zed.d/all-debug.sh \
$(top_srcdir)/cmd/zed/zed.d/all-syslog.sh \
$(top_srcdir)/cmd/zed/zed.d/checksum-email.sh \
+ $(top_srcdir)/cmd/zed/zed.d/checksum-spare.sh \
$(top_srcdir)/cmd/zed/zed.d/data-email.sh \
$(top_srcdir)/cmd/zed/zed.d/generic-email.sh \
$(top_srcdir)/cmd/zed/zed.d/io-email.sh \
+ $(top_srcdir)/cmd/zed/zed.d/io-spare.sh \
$(top_srcdir)/cmd/zed/zed.d/resilver.finish-email.sh \
$(top_srcdir)/cmd/zed/zed.d/scrub.finish-email.sh
zedconfdefaults = \
all-syslog.sh \
checksum-email.sh \
+ checksum-spare.sh \
data-email.sh \
io-email.sh \
+ io-spare.sh \
resilver.finish-email.sh \
scrub.finish-email.sh
diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh
new file mode 120000
index 000000000..f564f9322
--- /dev/null
+++ b/cmd/zed/zed.d/checksum-spare.sh
@@ -0,0 +1 @@
+io-spare.sh \ No newline at end of file
diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh
new file mode 100755
index 000000000..dd5bf4e0f
--- /dev/null
+++ b/cmd/zed/zed.d/io-spare.sh
@@ -0,0 +1,125 @@
+#!/bin/sh
+#
+# Replace a device with a hot spare in response to IO or checksum errors.
+# The following actions will be performed automatically when the number
+# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or
+# ZED_SPARE_ON_CHECKSUM_ERRORS.
+#
+# 1) FAULT the device on IO errors, no further IO will be attempted.
+# DEGRADE the device on checksum errors, the device is still
+# functional and can be used to service IO requests.
+# 2) Set the SES fault beacon for the device.
+# 3) Replace the device with a hot spare if any are available.
+#
+# Once the hot sparing operation is complete either the failed device or
+# the hot spare must be manually retired using the 'zpool detach' command.
+# The 'autoreplace' functionality which would normally take care of this
+# under Illumos has not yet been implemented.
+#
+# Full support for autoreplace is planned, but it requires that the full
+# ZFS Diagnosis Engine be ported. In the meantime this script provides
+# the majority of the expected hot spare functionality.
+#
+# Exit codes:
+# 0: replaced by hot spare
+# 1: no hot spare device available
+# 2: hot sparing disabled
+# 3: already faulted or degraded
+# 4: unsupported event class
+# 5: internal error
+#
+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc"
+
+test -n "${ZEVENT_POOL}" || exit 5
+test -n "${ZEVENT_SUBCLASS}" || exit 5
+test -n "${ZEVENT_VDEV_PATH}" || exit 5
+test -n "${ZEVENT_VDEV_GUID}" || exit 5
+
+# Defaults to disabled, enable in the zed.rc file.
+ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
+ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
+
+if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
+ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
+ exit 2
+fi
+
+# A lock file is used to serialize execution.
+ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
+LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
+
+exec 8> "${LOCKFILE}"
+flock -x 8
+
+# Given a <pool> and <device> return the status, (ONLINE, FAULTED, etc...).
+vdev_status() {
+ local POOL=$1
+ local VDEV=`basename $2`
+
+ ${ZPOOL} status ${POOL} | \
+ awk -v pat="${VDEV}|${VDEV/-part?}" '$0 ~ pat { print $1" "$2 }'
+ return 0
+}
+
+# Fault devices after N I/O errors.
+if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
+ ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
+
+ if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
+ ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
+ ACTION="fault"
+ fi
+# Degrade devices after N checksum errors.
+elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
+ ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS}
+
+ if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
+ ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
+ ACTION="degrade"
+ fi
+else
+ ACTION=
+fi
+
+if [ -n "${ACTION}" ]; then
+
+ # Device is already FAULTED or DEGRADED
+ set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
+ ZEVENT_VDEV_PATH_FOUND=$1
+ STATUS=$2
+ if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
+ exit 3
+ fi
+
+ # Step 1) FAULT or DEGRADE the device
+ #
+ ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
+
+ # Step 2) Set the SES fault beacon.
+ #
+ # XXX: Set the 'fault' or 'ident' beacon for the device. This can
+ # be done through the sg_ses utility, the only hard part is to map
+ # the sd device to its corresponding enclosure and slot. We may
+ # be able to leverage the existing vdev_id scripts for this.
+ #
+ # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
+ # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
+
+ # Step 3) Replace the device with a hot spare.
+ #
+ # Round robin through the spares selecting those which are available.
+ #
+ for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
+ set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
+ SPARE_VDEV_FOUND=$1
+ STATUS=$2
+ if [ "${STATUS}" = "AVAIL" ]; then
+ ${ZPOOL} replace ${ZEVENT_POOL} \
+ ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0
+ fi
+ done
+
+ exit 1
+fi
+
+exit 4
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index 57c969c89..69989f953 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -26,3 +26,9 @@
# The syslog tag for marking zed events.
#ZED_SYSLOG_TAG="zed"
+
+# Replace a device with a hot spare after N I/O errors are detected.
+#ZED_SPARE_ON_IO_ERRORS=1
+
+# Replace a device with a hot spare after N checksum errors are detected.
+#ZED_SPARE_ON_CHECKSUM_ERRORS=10
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index d9122ac5f..d541b07a3 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -75,6 +75,11 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS "vdev_spare_paths"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS "vdev_spare_guids"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 9e7a7b785..af93b7ce5 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1377,7 +1377,7 @@ spa_load_spares(spa_t *spa)
* validate each vdev on the spare list. If the vdev also exists in the
* active configuration, then we also mark this vdev as an active spare.
*/
- spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
+ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
KM_PUSHPAGE);
for (i = 0; i < spa->spa_spares.sav_count; i++) {
VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index df47d99cf..05ee84c19 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -251,6 +251,11 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
if (vd != NULL) {
vdev_t *pvd = vd->vdev_parent;
vdev_queue_t *vq = &vd->vdev_queue;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_t *spare_vd;
+ uint64_t *spare_guids;
+ char **spare_paths;
+ int i, spare_count;
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
DATA_TYPE_UINT64, vd->vdev_guid,
@@ -282,6 +287,16 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
}
+ if (vs != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_read_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_write_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL);
+ }
+
if (pvd != NULL) {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
@@ -298,6 +313,28 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
DATA_TYPE_STRING, pvd->vdev_devid, NULL);
}
+
+ spare_count = spa->spa_spares.sav_count;
+ spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
+ KM_PUSHPAGE);
+ spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
+ KM_PUSHPAGE);
+
+ for (i = 0; i < spare_count; i++) {
+ spare_vd = spa->spa_spares.sav_vdevs[i];
+ if (spare_vd) {
+ spare_paths[i] = spare_vd->vdev_path;
+ spare_guids[i] = spare_vd->vdev_guid;
+ }
+ }
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
+ DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
+ DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
+
+ kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
+ kmem_free(spare_paths, sizeof (char *) * spare_count);
}
if (zio != NULL) {
@@ -834,15 +871,18 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
ZFS_ERROR_CLASS, name);
- VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
- VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
- VERIFY(nvlist_add_uint64(resource,
- FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
+ VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
+ VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
+ VERIFY0(nvlist_add_int32(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
+
if (vd) {
- VERIFY(nvlist_add_uint64(resource,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
- VERIFY(nvlist_add_uint64(resource,
- FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0);
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
}
zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);