diff options
Diffstat (limited to 'cmd/zed')
-rw-r--r-- | cmd/zed/Makefile.am | 4 | ||||
l--------- | cmd/zed/zed.d/checksum-spare.sh | 1 | ||||
-rwxr-xr-x | cmd/zed/zed.d/io-spare.sh | 125 | ||||
-rw-r--r-- | cmd/zed/zed.d/zed.rc | 6 |
4 files changed, 136 insertions, 0 deletions
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index 8e4efe919..f1404dea1 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -39,17 +39,21 @@ dist_zedexec_SCRIPTS = \ $(top_srcdir)/cmd/zed/zed.d/all-debug.sh \ $(top_srcdir)/cmd/zed/zed.d/all-syslog.sh \ $(top_srcdir)/cmd/zed/zed.d/checksum-email.sh \ + $(top_srcdir)/cmd/zed/zed.d/checksum-spare.sh \ $(top_srcdir)/cmd/zed/zed.d/data-email.sh \ $(top_srcdir)/cmd/zed/zed.d/generic-email.sh \ $(top_srcdir)/cmd/zed/zed.d/io-email.sh \ + $(top_srcdir)/cmd/zed/zed.d/io-spare.sh \ $(top_srcdir)/cmd/zed/zed.d/resilver.finish-email.sh \ $(top_srcdir)/cmd/zed/zed.d/scrub.finish-email.sh zedconfdefaults = \ all-syslog.sh \ checksum-email.sh \ + checksum-spare.sh \ data-email.sh \ io-email.sh \ + io-spare.sh \ resilver.finish-email.sh \ scrub.finish-email.sh diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh new file mode 120000 index 000000000..f564f9322 --- /dev/null +++ b/cmd/zed/zed.d/checksum-spare.sh @@ -0,0 +1 @@ +io-spare.sh
\ No newline at end of file
diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh
new file mode 100755
index 000000000..dd5bf4e0f
--- /dev/null
+++ b/cmd/zed/zed.d/io-spare.sh
@@ -0,0 +1,125 @@
+#!/bin/sh
+#
+# Replace a device with a hot spare in response to IO or checksum errors.
+# The following actions will be performed automatically when the number
+# of errors exceeds the limit set by ZED_SPARE_ON_IO_ERRORS or
+# ZED_SPARE_ON_CHECKSUM_ERRORS.
+#
+# 1) FAULT the device on IO errors, no further IO will be attempted.
+#    DEGRADE the device on checksum errors, the device is still
+#    functional and can be used to service IO requests.
+# 2) Set the SES fault beacon for the device.
+# 3) Replace the device with a hot spare if any are available.
+#
+# Once the hot sparing operation is complete either the failed device or
+# the hot spare must be manually retired using the 'zpool detach' command.
+# The 'autoreplace' functionality which would normally take care of this
+# under Illumos has not yet been implemented.
+#
+# Full support for autoreplace is planned, but it requires that the full
+# ZFS Diagnosis Engine be ported.  In the meanwhile this script provides
+# the majority of the expected hot spare functionality.
+#
+# Exit codes:
+#  0: replaced by hot spare
+#  1: no hot spare device available
+#  2: hot sparing disabled
+#  3: already faulted or degraded
+#  4: unsupported event class or error limit not reached
+#  5: internal error
+#
+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc"
+
+test -n "${ZEVENT_POOL}" || exit 5
+test -n "${ZEVENT_SUBCLASS}" || exit 5
+test -n "${ZEVENT_VDEV_PATH}" || exit 5
+test -n "${ZEVENT_VDEV_GUID}" || exit 5
+
+# Defaults to disabled, enable in the zed.rc file.
+ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
+ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
+
+if [ "${ZED_SPARE_ON_IO_ERRORS}" -eq 0 ] &&
+   [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -eq 0 ]; then
+    exit 2
+fi
+
+# A lock file is used to serialize execution.
+ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
+LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
+
+exec 8> "${LOCKFILE}"
+flock -x 8
+
+# Given a <pool> and <device> print "<name> <state>" (ONLINE, FAULTED, ...).
+vdev_status() {
+    local POOL="$1" VDEV
+    VDEV=$(basename "$2")
+    # POSIX suffix strip; the bash-only ${VDEV/-part?} fails under dash.
+    "${ZPOOL}" status "${POOL}" | \
+        awk -v pat="${VDEV}|${VDEV%-part[0-9]*}" '$0 ~ pat { print $1" "$2 }'
+    return 0
+}
+
+# Ignore any ACTION inherited from the environment.
+ACTION=
+# Fault devices after N I/O errors.
+if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
+    ERRORS=$((${ZEVENT_VDEV_READ_ERRORS:-0} + ${ZEVENT_VDEV_WRITE_ERRORS:-0}))
+
+    if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ] &&
+       [ "${ERRORS}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ]; then
+        ACTION="fault"
+    fi
+# Degrade devices after N checksum errors.
+elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
+    ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS:-0}
+
+    if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ] &&
+       [ "${ERRORS}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ]; then
+        ACTION="degrade"
+    fi
+fi
+
+if [ -n "${ACTION}" ]; then
+    # vdev_status output is deliberately word-split into $1 (name), $2 (state).
+    # Nothing to do when the device is already FAULTED or DEGRADED.
+    set -- $(vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}")
+    ZEVENT_VDEV_PATH_FOUND=$1
+    STATUS=$2
+    if [ "${STATUS}" = "FAULTED" ] || [ "${STATUS}" = "DEGRADED" ]; then
+        exit 3
+    fi
+
+    # Step 1) FAULT or DEGRADE the device
+    #
+    "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${ACTION}" "${ZEVENT_POOL}"
+
+    # Step 2) Set the SES fault beacon.
+    #
+    # XXX: Set the 'fault' or 'ident' beacon for the device.  This can
+    # be done through the sg_ses utility, the only hard part is to map
+    # the sd device to its corresponding enclosure and slot.  We may
+    # be able to leverage the existing vdev_id scripts for this.
+    #
+    # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
+    # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
+
+    # Step 3) Replace the device with a hot spare.
+    #
+    # Round robin through the spares selecting those which are available.
+    #
+    for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
+        set -- $(vdev_status "${ZEVENT_POOL}" "${SPARE}")
+        SPARE_VDEV_FOUND=$1
+        STATUS=$2
+        if [ "${STATUS}" = "AVAIL" ]; then
+            "${ZPOOL}" replace "${ZEVENT_POOL}" \
+                "${ZEVENT_VDEV_GUID}" "${SPARE_VDEV_FOUND}" && exit 0
+        fi
+    done
+
+    exit 1
+fi
+
+exit 4
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index 57c969c89..69989f953 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -26,3 +26,9 @@
 
 # The syslog tag for marking zed events.
 #ZED_SYSLOG_TAG="zed"
+
+# Replace a device with a hot spare after N I/O errors are detected.
+#ZED_SPARE_ON_IO_ERRORS=1
+
+# Replace a device with a hot spare after N checksum errors are detected.
+#ZED_SPARE_ON_CHECKSUM_ERRORS=10