summaryrefslogtreecommitdiffstats
path: root/cmd/zed/zed.d/io-spare.sh
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/zed/zed.d/io-spare.sh')
-rwxr-xr-xcmd/zed/zed.d/io-spare.sh239
1 files changed, 0 insertions, 239 deletions
diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh
deleted file mode 100755
index 1835cb4a3..000000000
--- a/cmd/zed/zed.d/io-spare.sh
+++ /dev/null
@@ -1,239 +0,0 @@
-#!/bin/sh
-#
-# Replace a device with a hot spare in response to IO or CHECKSUM errors.
-# The following actions will be performed automatically when the number
-# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or
-# ZED_SPARE_ON_CHECKSUM_ERRORS.
-#
-# 1) FAULT the device on IO errors, no futher IO will be attempted.
-# DEGRADE the device on checksum errors, the device is still
-# functional and can be used to service IO requests.
-# 2) Set the SES fault beacon for the device.
-# 3) Replace the device with a hot spare if any are available.
-#
-# Once the hot sparing operation is complete either the failed device or
-# the hot spare must be manually retired using the 'zpool detach' command.
-# The 'autoreplace' functionality which would normally take care of this
-# under Illumos has not yet been implemented.
-#
-# Full support for autoreplace is planned, but it requires that the full
-# ZFS Diagnosis Engine be ported. In the meanwhile this script provides
-# the majority of the expected hot spare functionality.
-#
-# Exit codes:
-# 0: hot spare replacement successful
-# 1: hot spare device not available
-# 2: hot sparing disabled or threshold not reached
-# 3: device already faulted or degraded
-# 9: internal error
-
-[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
-. "${ZED_ZEDLET_DIR}/zed-functions.sh"
-
-# Disabled by default. Enable in the zed.rc file.
-: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}"
-: "${ZED_SPARE_ON_IO_ERRORS:=0}"
-
-
-# query_vdev_status (pool, vdev)
-#
-# Given a [pool] and [vdev], return the matching vdev path & status on stdout.
-#
-# Warning: This function does not handle the case of [pool] or [vdev]
-# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor.
-#
-# Arguments
-# pool: pool name
-# vdev: virtual device name
-#
-# StdOut
-# arg1: vdev pathname
-# arg2: vdev status
-#
-query_vdev_status()
-{
- local pool="$1"
- local vdev="$2"
- local t
-
- vdev="$(basename -- "${vdev}")"
- ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return
- t="$(printf '\t')"
-
- "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \
- "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \
- | tail -1
-}
-
-
-# notify (old_vdev, new_vdev, num_errors)
-#
-# Send a notification regarding the hot spare replacement.
-#
-# Arguments
-# old_vdev: path of old vdev that has failed
-# new_vdev: path of new vdev used as the hot spare replacement
-# num_errors: number of errors that triggered this replacement
-#
-notify()
-{
- local old_vdev="$1"
- local new_vdev="$2"
- local num_errors="$3"
- local note_subject
- local note_pathname
- local s
- local rv
-
- umask 077
- note_subject="ZFS hot spare replacement for ${ZEVENT_POOL} on $(hostname)"
- note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
- {
- [ "${num_errors}" -ne 1 ] 2>/dev/null && s="s"
-
- echo "ZFS has replaced a failing device with a hot spare after" \
- "${num_errors} ${ZEVENT_SUBCLASS} error${s}:"
- echo
- echo " eid: ${ZEVENT_EID}"
- echo " class: ${ZEVENT_SUBCLASS}"
- echo " host: $(hostname)"
- echo " time: ${ZEVENT_TIME_STRING}"
- echo " old: ${old_vdev}"
- echo " new: ${new_vdev}"
-
- "${ZPOOL}" status "${ZEVENT_POOL}"
-
- } > "${note_pathname}"
-
- zed_notify "${note_subject}" "${note_pathname}"; rv=$?
- rm -f "${note_pathname}"
- return "${rv}"
-}
-
-
-# main
-#
-# Arguments
-# none
-#
-# Return
-# see above
-#
-main()
-{
- local num_errors
- local action
- local lockfile
- local vdev_path
- local vdev_status
- local spare
- local spare_path
- local spare_status
- local zpool_err
- local zpool_rv
- local rv
-
- # Avoid hot-sparing a hot-spare.
- #
- # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare.
- #
- [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2
-
- [ -n "${ZEVENT_POOL}" ] || exit 9
- [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9
- [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9
-
- zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9
-
- # Fault the device after a given number of I/O errors.
- #
- if [ "${ZEVENT_SUBCLASS}" = "io" ]; then
- if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then
- num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS))
- [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \
- && action="fault"
- fi 2>/dev/null
-
- # Degrade the device after a given number of checksum errors.
- #
- elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then
- if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then
- num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}"
- [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \
- && action="degrade"
- fi 2>/dev/null
-
- else
- zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\""
- exit 9
- fi
-
- # Error threshold not reached.
- #
- if [ -z "${action}" ]; then
- exit 2
- fi
-
- lockfile="zed.spare.lock"
- zed_lock "${lockfile}"
-
- # shellcheck disable=SC2046
- set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}")
- vdev_path="$1"
- vdev_status="$2"
-
- # Device is already FAULTED or DEGRADED.
- #
- if [ "${vdev_status}" = "FAULTED" ] \
- || [ "${vdev_status}" = "DEGRADED" ]; then
- rv=3
-
- else
- rv=1
-
- # 1) FAULT or DEGRADE the device.
- #
- "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}"
-
- # 2) Set the SES fault beacon.
- #
- # TODO: Set the 'fault' or 'ident' beacon for the device. This can
- # be done through the sg_ses utility. The only hard part is to map
- # the sd device to its corresponding enclosure and slot. We may
- # be able to leverage the existing vdev_id scripts for this.
- #
- # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
- # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
-
- # 3) Replace the device with a hot spare.
- #
- # Round-robin through the spares trying those that are available.
- #
- for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do
-
- # shellcheck disable=SC2046
- set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}")
- spare_path="$1"
- spare_status="$2"
-
- [ "${spare_status}" = "AVAIL" ] || continue
-
- zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \
- "${ZEVENT_VDEV_GUID}" "${spare_path}" 2>&1)"; zpool_rv=$?
-
- if [ "${zpool_rv}" -ne 0 ]; then
- [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}"
- else
- notify "${vdev_path}" "${spare_path}" "${num_errors}"
- rv=0
- break
- fi
- done
- fi
-
- zed_unlock "${lockfile}"
- exit "${rv}"
-}
-
-
-main "$@"