summaryrefslogtreecommitdiffstats
path: root/cmd/zed/zed.d/io-spare.sh
blob: b64b2a9f1160697c64e0b716d3d534796494725b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/bin/sh
#
# Replace a device with a hot spare in response to IO or checksum errors.
# The following actions will be performed automatically when the number
# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or
# ZED_SPARE_ON_CHECKSUM_ERRORS.
#
# 1) FAULT the device on IO errors, no futher IO will be attempted.
#    DEGRADE the device on checksum errors, the device is still
#    functional and can be used to service IO requests.
# 2) Set the SES fault beacon for the device.
# 3) Replace the device with a hot spare if any are available.
#
# Once the hot sparing operation is complete either the failed device or
# the hot spare must be manually retired using the 'zpool detach' command.
# The 'autoreplace' functionality which would normally take care of this
# under Illumos has not yet been implemented.
#
# Full support for autoreplace is planned, but it requires that the full
# ZFS Diagnosis Engine be ported.  In the meanwhile this script provides
# the majority of the expected hot spare functionality.
#
# Exit codes:
#  0: replaced by hot spare
#  1: no hot spare device available
#  2: hot sparing disabled
#  3: already faulted or degraded
#  4: unsupported event class
#  5: internal error
#
test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc"

test -n "${ZEVENT_POOL}" || exit 5
test -n "${ZEVENT_SUBCLASS}" || exit 5
test -n "${ZEVENT_VDEV_PATH}" || exit 5
test -n "${ZEVENT_VDEV_GUID}" || exit 5

# Defaults to disabled, enable in the zed.rc file.
ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}

if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
     ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
	exit 2
fi

# A lock file is used to serialize execution.
ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"

exec 8> "${LOCKFILE}"
flock -x 8

# Given a <pool> and <device> return the status, (ONLINE, FAULTED, etc...).
vdev_status() {
	local POOL=$1
	local VDEV=`basename $2`
	local T='	'	# tab character since '\t' isn't portable

	${ZPOOL} status ${POOL} | sed -n -e \
	    "s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p"
	return 0
}

# Fault devices after N I/O errors.
if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
	ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`

	if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
	     ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
		ACTION="fault"
	fi
# Degrade devices after N checksum errors.
elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
	ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS}

	if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
	     ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
		ACTION="degrade"
	fi
else
	ACTION=
fi

if [ -n "${ACTION}" ]; then

	# Device is already FAULTED or DEGRADED
	set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
	ZEVENT_VDEV_PATH_FOUND=$1
	STATUS=$2
	if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
		exit 3
	fi

	# Step 1) FAULT or DEGRADE the device
	#
	${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}

	# Step 2) Set the SES fault beacon.
	#
	# XXX: Set the 'fault' or 'ident' beacon for the device.  This can
	# be done through the sg_ses utility, the only hard part is to map
	# the sd device to its corresponding enclosure and slot.  We may
	# be able to leverage the existing vdev_id scripts for this.
	#
	# $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
	# $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3

	# Step 3) Replace the device with a hot spare.
	#
	# Round robin through the spares selecting those which are available.
	#
	for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
		set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
		SPARE_VDEV_FOUND=$1
		STATUS=$2
		if [ "${STATUS}" = "AVAIL" ]; then
			${ZPOOL} replace ${ZEVENT_POOL} \
			    ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0
		fi
	done

	exit 1
fi

exit 4