diff options
Diffstat (limited to 'cmd/zed/zed.d/io-spare.sh')
-rwxr-xr-x | cmd/zed/zed.d/io-spare.sh | 239 |
1 files changed, 0 insertions, 239 deletions
diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh deleted file mode 100755 index 1835cb4a3..000000000 --- a/cmd/zed/zed.d/io-spare.sh +++ /dev/null @@ -1,239 +0,0 @@ -#!/bin/sh -# -# Replace a device with a hot spare in response to IO or CHECKSUM errors. -# The following actions will be performed automatically when the number -# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or -# ZED_SPARE_ON_CHECKSUM_ERRORS. -# -# 1) FAULT the device on IO errors, no futher IO will be attempted. -# DEGRADE the device on checksum errors, the device is still -# functional and can be used to service IO requests. -# 2) Set the SES fault beacon for the device. -# 3) Replace the device with a hot spare if any are available. -# -# Once the hot sparing operation is complete either the failed device or -# the hot spare must be manually retired using the 'zpool detach' command. -# The 'autoreplace' functionality which would normally take care of this -# under Illumos has not yet been implemented. -# -# Full support for autoreplace is planned, but it requires that the full -# ZFS Diagnosis Engine be ported. In the meanwhile this script provides -# the majority of the expected hot spare functionality. -# -# Exit codes: -# 0: hot spare replacement successful -# 1: hot spare device not available -# 2: hot sparing disabled or threshold not reached -# 3: device already faulted or degraded -# 9: internal error - -[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" -. "${ZED_ZEDLET_DIR}/zed-functions.sh" - -# Disabled by default. Enable in the zed.rc file. -: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}" -: "${ZED_SPARE_ON_IO_ERRORS:=0}" - - -# query_vdev_status (pool, vdev) -# -# Given a [pool] and [vdev], return the matching vdev path & status on stdout. -# -# Warning: This function does not handle the case of [pool] or [vdev] -# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor. -# -# Arguments -# pool: pool name -# vdev: virtual device name -# -# StdOut -# arg1: vdev pathname -# arg2: vdev status -# -query_vdev_status() -{ - local pool="$1" - local vdev="$2" - local t - - vdev="$(basename -- "${vdev}")" - ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return - t="$(printf '\t')" - - "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \ - "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \ - | tail -1 -} - - -# notify (old_vdev, new_vdev, num_errors) -# -# Send a notification regarding the hot spare replacement. -# -# Arguments -# old_vdev: path of old vdev that has failed -# new_vdev: path of new vdev used as the hot spare replacement -# num_errors: number of errors that triggered this replacement -# -notify() -{ - local old_vdev="$1" - local new_vdev="$2" - local num_errors="$3" - local note_subject - local note_pathname - local s - local rv - - umask 077 - note_subject="ZFS hot spare replacement for ${ZEVENT_POOL} on $(hostname)" - note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" - { - [ "${num_errors}" -ne 1 ] 2>/dev/null && s="s" - - echo "ZFS has replaced a failing device with a hot spare after" \ - "${num_errors} ${ZEVENT_SUBCLASS} error${s}:" - echo - echo " eid: ${ZEVENT_EID}" - echo " class: ${ZEVENT_SUBCLASS}" - echo " host: $(hostname)" - echo " time: ${ZEVENT_TIME_STRING}" - echo " old: ${old_vdev}" - echo " new: ${new_vdev}" - - "${ZPOOL}" status "${ZEVENT_POOL}" - - } > "${note_pathname}" - - zed_notify "${note_subject}" "${note_pathname}"; rv=$? - rm -f "${note_pathname}" - return "${rv}" -} - - -# main -# -# Arguments -# none -# -# Return -# see above -# -main() -{ - local num_errors - local action - local lockfile - local vdev_path - local vdev_status - local spare - local spare_path - local spare_status - local zpool_err - local zpool_rv - local rv - - # Avoid hot-sparing a hot-spare. - # - # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare. - # - [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2 - - [ -n "${ZEVENT_POOL}" ] || exit 9 - [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9 - [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 - - zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9 - - # Fault the device after a given number of I/O errors. - # - if [ "${ZEVENT_SUBCLASS}" = "io" ]; then - if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then - num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS)) - [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \ - && action="fault" - fi 2>/dev/null - - # Degrade the device after a given number of checksum errors. - # - elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then - if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then - num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}" - [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \ - && action="degrade" - fi 2>/dev/null - - else - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 - fi - - # Error threshold not reached. - # - if [ -z "${action}" ]; then - exit 2 - fi - - lockfile="zed.spare.lock" - zed_lock "${lockfile}" - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}") - vdev_path="$1" - vdev_status="$2" - - # Device is already FAULTED or DEGRADED. - # - if [ "${vdev_status}" = "FAULTED" ] \ - || [ "${vdev_status}" = "DEGRADED" ]; then - rv=3 - - else - rv=1 - - # 1) FAULT or DEGRADE the device. - # - "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}" - - # 2) Set the SES fault beacon. - # - # TODO: Set the 'fault' or 'ident' beacon for the device. This can - # be done through the sg_ses utility. The only hard part is to map - # the sd device to its corresponding enclosure and slot. We may - # be able to leverage the existing vdev_id scripts for this. - # - # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 - # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 - - # 3) Replace the device with a hot spare. - # - # Round-robin through the spares trying those that are available. - # - for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}") - spare_path="$1" - spare_status="$2" - - [ "${spare_status}" = "AVAIL" ] || continue - - zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \ - "${ZEVENT_VDEV_GUID}" "${spare_path}" 2>&1)"; zpool_rv=$? - - if [ "${zpool_rv}" -ne 0 ]; then - [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}" - else - notify "${vdev_path}" "${spare_path}" "${num_errors}" - rv=0 - break - fi - done - fi - - zed_unlock "${lockfile}" - exit "${rv}" -} - - -main "$@" |