summaryrefslogtreecommitdiffstats
path: root/cmd/zed
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/zed')
-rw-r--r--cmd/zed/Makefile.am4
l---------cmd/zed/zed.d/checksum-spare.sh1
-rwxr-xr-xcmd/zed/zed.d/io-spare.sh125
-rw-r--r--cmd/zed/zed.d/zed.rc6
4 files changed, 136 insertions, 0 deletions
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
index 8e4efe919..f1404dea1 100644
--- a/cmd/zed/Makefile.am
+++ b/cmd/zed/Makefile.am
@@ -39,17 +39,21 @@ dist_zedexec_SCRIPTS = \
$(top_srcdir)/cmd/zed/zed.d/all-debug.sh \
$(top_srcdir)/cmd/zed/zed.d/all-syslog.sh \
$(top_srcdir)/cmd/zed/zed.d/checksum-email.sh \
+ $(top_srcdir)/cmd/zed/zed.d/checksum-spare.sh \
$(top_srcdir)/cmd/zed/zed.d/data-email.sh \
$(top_srcdir)/cmd/zed/zed.d/generic-email.sh \
$(top_srcdir)/cmd/zed/zed.d/io-email.sh \
+ $(top_srcdir)/cmd/zed/zed.d/io-spare.sh \
$(top_srcdir)/cmd/zed/zed.d/resilver.finish-email.sh \
$(top_srcdir)/cmd/zed/zed.d/scrub.finish-email.sh
zedconfdefaults = \
all-syslog.sh \
checksum-email.sh \
+ checksum-spare.sh \
data-email.sh \
io-email.sh \
+ io-spare.sh \
resilver.finish-email.sh \
scrub.finish-email.sh
diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh
new file mode 120000
index 000000000..f564f9322
--- /dev/null
+++ b/cmd/zed/zed.d/checksum-spare.sh
@@ -0,0 +1 @@
+io-spare.sh \ No newline at end of file
diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh
new file mode 100755
index 000000000..dd5bf4e0f
--- /dev/null
+++ b/cmd/zed/zed.d/io-spare.sh
@@ -0,0 +1,125 @@
+#!/bin/sh
+#
+# Replace a device with a hot spare in response to IO or checksum errors.
+# The following actions will be performed automatically when the number
+# of errors exceeds the limit set by ZED_SPARE_ON_IO_ERRORS or
+# ZED_SPARE_ON_CHECKSUM_ERRORS.
+#
+# 1) FAULT the device on IO errors, no further IO will be attempted.
+# DEGRADE the device on checksum errors, the device is still
+# functional and can be used to service IO requests.
+# 2) Set the SES fault beacon for the device.
+# 3) Replace the device with a hot spare if any are available.
+#
+# Once the hot sparing operation is complete either the failed device or
+# the hot spare must be manually retired using the 'zpool detach' command.
+# The 'autoreplace' functionality which would normally take care of this
+# under Illumos has not yet been implemented.
+#
+# Full support for autoreplace is planned, but it requires that the full
+# ZFS Diagnosis Engine be ported. In the meantime this script provides
+# the majority of the expected hot spare functionality.
+#
+# Exit codes:
+# 0: replaced by hot spare
+# 1: no hot spare device available
+# 2: hot sparing disabled
+# 3: already faulted or degraded
+# 4: unsupported event class or error threshold not reached
+# 5: internal error
+#
+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc"
+
+test -n "${ZEVENT_POOL}" || exit 5
+test -n "${ZEVENT_SUBCLASS}" || exit 5
+test -n "${ZEVENT_VDEV_PATH}" || exit 5
+test -n "${ZEVENT_VDEV_GUID}" || exit 5
+
+# Defaults to disabled, enable in the zed.rc file.
+ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
+ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
+
+if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
+ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
+ exit 2
+fi
+
+# A lock file is used to serialize execution.
+ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
+LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
+
+exec 8> "${LOCKFILE}"
+flock -x 8
+
+# Given a <pool> and <device> print "<name> <state>" (ONLINE, FAULTED, etc...)
+# for each 'zpool status' line matching the device, with or without "-part?".
+vdev_status() {
+	local POOL="$1"
+	local VDEV="$(basename "$2")"
+	${ZPOOL} status "${POOL}" | \
+	    awk -v pat="${VDEV}|${VDEV%-part?}" '$0 ~ pat { print $1" "$2 }'
+	return 0	# ${VDEV%-part?} is POSIX; ${VDEV/-part?} needs bash
+}
+
+# Decide which action, if any, this event calls for.  ACTION is cleared up
+# front so a stale value inherited from the environment or zed.rc cannot
+# trigger a fault or degrade when the error limit has not been reached.
+ACTION=
+# Fault devices after N I/O errors; absent counters are treated as zero.
+if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
+	ERRORS=$((${ZEVENT_VDEV_READ_ERRORS:-0} + ${ZEVENT_VDEV_WRITE_ERRORS:-0}))
+	if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ] &&
+	    [ "${ERRORS}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ]; then
+		ACTION="fault"
+	fi
+# Degrade devices after N checksum errors.
+elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
+	ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS:-0}
+	if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ] &&
+	    [ "${ERRORS}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ]; then
+		ACTION="degrade"
+	fi
+fi
+
+if [ -n "${ACTION}" ]; then
+
+ # Device is already FAULTED or DEGRADED
+ set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
+ ZEVENT_VDEV_PATH_FOUND=$1
+ STATUS=$2
+ if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
+ exit 3
+ fi
+
+ # Step 1) FAULT or DEGRADE the device
+ #
+ ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
+
+ # Step 2) Set the SES fault beacon.
+ #
+ # XXX: Set the 'fault' or 'ident' beacon for the device. This can
+ # be done through the sg_ses utility, the only hard part is to map
+ # the sd device to its corresponding enclosure and slot. We may
+ # be able to leverage the existing vdev_id scripts for this.
+ #
+ # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
+ # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
+
+ # Step 3) Replace the device with a hot spare.
+ #
+ # Round robin through the spares selecting those which are available.
+ #
+ for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
+ set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
+ SPARE_VDEV_FOUND=$1
+ STATUS=$2
+ if [ "${STATUS}" = "AVAIL" ]; then
+ ${ZPOOL} replace ${ZEVENT_POOL} \
+ ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0
+ fi
+ done
+
+ exit 1
+fi
+
+exit 4
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index 57c969c89..69989f953 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -26,3 +26,9 @@
# The syslog tag for marking zed events.
#ZED_SYSLOG_TAG="zed"
+
+# Replace a device with a hot spare after N I/O errors are detected.
+#ZED_SPARE_ON_IO_ERRORS=1
+
+# Replace a device with a hot spare after N checksum errors are detected.
+#ZED_SPARE_ON_CHECKSUM_ERRORS=10