diff options
-rw-r--r-- | cmd/zed/Makefile.am | 14 | ||||
-rw-r--r-- | cmd/zed/agents/README.md | 112 | ||||
-rw-r--r-- | cmd/zed/agents/fmd_api.c | 760 | ||||
-rw-r--r-- | cmd/zed/agents/fmd_api.h | 246 | ||||
-rw-r--r-- | cmd/zed/agents/fmd_serd.c | 313 | ||||
-rw-r--r-- | cmd/zed/agents/fmd_serd.h | 86 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_agents.c | 368 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_agents.h | 22 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_diagnosis.c | 1009 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_mod.c | 49 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_retire.c | 613 | ||||
l--------- | cmd/zed/zed.d/checksum-spare.sh | 1 | ||||
-rwxr-xr-x | cmd/zed/zed.d/io-spare.sh | 239 | ||||
-rw-r--r-- | cmd/zed/zed_disk_event.c | 15 | ||||
-rw-r--r-- | cmd/zed/zed_event.c | 25 | ||||
-rw-r--r-- | cmd/zed/zed_exec.c | 47 | ||||
-rw-r--r-- | module/zfs/vdev.c | 11 |
17 files changed, 3592 insertions, 338 deletions
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index d35dfc428..199af1dda 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -27,10 +27,15 @@ ZED_SRC = \ zed_strings.h FMA_SRC = \ + agents/zfs_agents.c \ agents/zfs_agents.h \ agents/zfs_diagnosis.c \ agents/zfs_mod.c \ - agents/zfs_retire.c + agents/zfs_retire.c \ + agents/fmd_api.c \ + agents/fmd_api.h \ + agents/fmd_serd.c \ + agents/fmd_serd.h zed_SOURCES = $(ZED_SRC) $(FMA_SRC) @@ -38,10 +43,13 @@ zed_LDADD = \ $(top_builddir)/lib/libavl/libavl.la \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libspl/libspl.la \ + $(top_builddir)/lib/libuutil/libuutil.la \ $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs/libzfs.la \ $(top_builddir)/lib/libzfs_core/libzfs_core.la +zed_LDFLAGS = -lrt -pthread + zedconfdir = $(sysconfdir)/zfs/zed.d dist_zedconf_DATA = \ @@ -54,11 +62,9 @@ dist_zedexec_SCRIPTS = \ zed.d/all-debug.sh \ zed.d/all-syslog.sh \ zed.d/checksum-notify.sh \ - zed.d/checksum-spare.sh \ zed.d/data-notify.sh \ zed.d/generic-notify.sh \ zed.d/io-notify.sh \ - zed.d/io-spare.sh \ zed.d/resilver_finish-notify.sh \ zed.d/scrub_finish-notify.sh \ zed.d/statechange-led.sh \ @@ -67,10 +73,8 @@ dist_zedexec_SCRIPTS = \ zedconfdefaults = \ all-syslog.sh \ checksum-notify.sh \ - checksum-spare.sh \ data-notify.sh \ io-notify.sh \ - io-spare.sh \ resilver_finish-notify.sh \ scrub_finish-notify.sh \ statechange-blinkled.sh \ diff --git a/cmd/zed/agents/README.md b/cmd/zed/agents/README.md new file mode 100644 index 000000000..e35b97668 --- /dev/null +++ b/cmd/zed/agents/README.md @@ -0,0 +1,112 @@ +## Fault Management Logic for ZED ## + +The integration of Fault Management Daemon (FMD) logic from illumos +is being deployed in three phases. This logic is encapsulated in +several software modules inside ZED. + +### ZED+FM Phase 1 ### + +All the phase 1 work is in current Master branch. Phase I work includes: + +* Add new paths to the persistent VDEV label for device matching. +* Add a disk monitor for generating _disk-add_ and _disk-change_ events. +* Add support for automated VDEV auto-online, auto-replace and auto-expand. +* Expand the statechange event to include all VDEV state transitions. + +### ZED+FM Phase 2 (WIP) ### + +The phase 2 work primarily entails the _Diagnosis Engine_ and the +_Retire Agent_ modules. It also includes infrastructure to support a +crude FMD environment to host these modules. For additional +information see the **FMD Components in ZED** and **Implementation +Notes** sections below. + +### ZED+FM Phase 3 ### + +Future work will add additional functionality and will likely include: + +* Add FMD module garbage collection (periodically call `fmd_module_gc()`). +* Add real module property retrieval (currently hard-coded in accessors). +* Additional diagnosis telemetry (like latency outliers and SMART data). +* Export FMD module statistics. +* Zedlet parallel execution and resiliency (add watchdog). + +### ZFS Fault Management Overview ### + +The primary purpose with ZFS fault management is automated diagnosis +and isolation of VDEV faults. A fault is something we can associate +with an impact (e.g. loss of data redundancy) and a corrective action +(e.g. offline or replace a disk). A typical ZFS fault management stack +is comprised of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk +monitor_, a _diagnosis engine_ and _response agents_. + +After detecting a software error, the ZFS kernel module sends error +events to the ZED user daemon which in turn routes the events to its +internal FMA modules based on their event subscriptions. Likewise, if +a disk is added or changed in the system, the disk monitor sends disk +events which are consumed by a response agent. + +### FMD Components in ZED ### + +There are three FMD modules (aka agents) that are now built into ZED. + + 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`) + 2. A _Retire Agent_ module (`agents/zfs_retire.c`) + 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`) + +To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum +ereports and feeds them into a Soft Error Rate Discrimination (SERD) +algorithm which will generate a corresponding fault diagnosis when the +tracked VDEV encounters **N** events in a given **T** time window. The +initial N and T values for the SERD algorithm are estimates inherited +from illumos (10 errors in 10 minutes). + +In turn, a **Retire Agent** responds to diagnosed faults by isolating +the faulty VDEV. It will notify the ZFS kernel module of the new VDEV +state (degraded or faulted). The retire agent is also responsible for +managing hot spares across all pools. When it encounters a device fault +or a device removal it will replace the device with an appropriate +spare if available. + +Finally, a **Disk Add Agent** responds to events from a libudev disk +monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or +expand the associated VDEV. This agent is also known as the `zfs_mod` +or Sysevent Loadable Module (SLM) on the illumos platform. The added +disk is matched to a specific VDEV using its device id, physical path +or VDEV GUID. + +Note that the _auto-replace_ feature (aka hot plug) is opt-in and you +must set the pool's `autoreplace` property to enable it. The new disk +will be matched to the corresponding leaf VDEV by physical location +and labeled with a GPT partition before replacing the original VDEV +in the pool. + +### Implementation Notes ### + +* The FMD module API required for logic modules is emulated and implemented + in the `fmd_api.c` and `fmd_serd.c` source files. This support includes + module registration, memory allocation, module property accessors, basic + case management, one-shot timers and SERD engines. + For detailed information on the FMD module API, see the document -- + _"Fault Management Daemon Programmer's Reference Manual"_. + +* The event subscriptions for the modules (located in a module specific + configuration file on illumos) are currently hard-coded into the ZED + `zfs_agent_dispatch()` function. + +* The FMD modules are called one at a time from a single thread that + consumes events queued to the modules. These events are sourced from + the normal ZED events and also include events posted from the diagnosis + engine and the libudev disk event monitor. + +* The FMD code modules have minimal changes and were intentionally left + as similar as possible to their upstream source files. + +* The sysevent namespace in ZED differs from illumos. For example: + * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"` + * Linux uses `"sysevent.fs.zfs.vdev_remove"` + +* The FMD Modules port was produced by Intel Federal, LLC under award + number B609815 between the U.S. Department of Energy (DOE) and Intel + Federal, LLC. + diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c new file mode 100644 index 000000000..ae90a322c --- /dev/null +++ b/cmd/zed/agents/fmd_api.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * This file imlements the minimal FMD module API required to support the + * fault logic modules in ZED. This support includes module registration, + * memory allocation, module property accessors, basic case management, + * one-shot timers and SERD engines. + * + * In the ZED runtime, the modules are called from a single thread so no + * locking is required in this emulated FMD environment. + */ + +#include <sys/types.h> +#include <sys/fm/protocol.h> +#include <uuid/uuid.h> +#include <signal.h> +#include <strings.h> +#include <time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" + +#include "zfs_agents.h" +#include "../zed_log.h" + +typedef struct fmd_modstat { + fmd_stat_t ms_accepted; /* total events accepted by module */ + fmd_stat_t ms_caseopen; /* cases currently open */ + fmd_stat_t ms_casesolved; /* total cases solved by module */ + fmd_stat_t ms_caseclosed; /* total cases closed by module */ +} fmd_modstat_t; + +typedef struct fmd_module { + const char *mod_name; /* basename of module (ro) */ + const fmd_hdl_info_t *mod_info; /* module info registered with handle */ + void *mod_spec; /* fmd_hdl_get/setspecific data value */ + fmd_stat_t *mod_ustat; /* module specific custom stats */ + uint_t mod_ustat_cnt; /* count of ustat stats */ + fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ + fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ + char *mod_vers; /* a copy of module version string */ +} fmd_module_t; + +/* + * ZED has two FMD hardwired module instances + */ +fmd_module_t zfs_retire_module; +fmd_module_t zfs_diagnosis_module; + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +/* + * Register a module with fmd and finish module initialization. + * Returns an integer indicating whether it succeeded (zero) or + * failed (non-zero). + */ +int +fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_info = mip; + mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ + mp->mod_spec = NULL; + + /* bare minimum module stats */ + (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); + (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); + (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); + (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); + + fmd_serd_hash_create(&mp->mod_serds); + + fmd_hdl_debug(hdl, "register module"); + + return (0); +} + +void +fmd_hdl_unregister(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_modstat_t *msp = &mp->mod_stats; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + /* dump generic module stats */ + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, + msp->ms_accepted.fmds_value.ui64); + if (ops->fmdo_close != NULL) { + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, + msp->ms_caseopen.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, + msp->ms_casesolved.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, + msp->ms_caseclosed.fmds_value.ui64); + } + + /* dump module specific stats */ + if (mp->mod_ustat != NULL) { + int i; + + for (i = 0; i < mp->mod_ustat_cnt; i++) { + fmd_hdl_debug(hdl, "%s: %llu", + mp->mod_ustat[i].fmds_name, + mp->mod_ustat[i].fmds_value.ui64); + } + } + + fmd_serd_hash_destroy(&mp->mod_serds); + + fmd_hdl_debug(hdl, "unregister module"); +} + +/* + * fmd_hdl_setspecific() is used to associate a data pointer with + * the specified handle for the duration of the module's lifetime. + * This pointer can be retrieved using fmd_hdl_getspecific(). + */ +void +fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_spec = spec; +} + +/* + * Return the module-specific data pointer previously associated + * with the handle using fmd_hdl_setspecific(). + */ +void * +fmd_hdl_getspecific(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_spec); +} + +void * +fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_alloc(size, flags)); +} + +void * +fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_zalloc(size, flags)); +} + +void +fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) +{ + umem_free(data, size); +} + +/* + * Record a module debug message using the specified format. + */ +void +fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) +{ + char message[256]; + va_list vargs; + fmd_module_t *mp = (fmd_module_t *)hdl; + + va_start(vargs, format); + (void) vsnprintf(message, sizeof (message), format, vargs); + va_end(vargs); + + /* prefix message with module name */ + zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); +} + +/* Property Retrieval */ + +int32_t +fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "spare_on_remove") == 0) + return (1); + + if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) + return (10); /* N = 10 events */ + + return (0); +} + +int64_t +fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "remove_timeout") == 0) + return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ + + if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) + return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ + + return (0); +} + +/* FMD Statistics */ + +fmd_stat_t * +fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (flags == FMD_STAT_NOALLOC) { + mp->mod_ustat = statv; + mp->mod_ustat_cnt = nstats; + } + + return (statv); +} + +/* Case Management */ + +fmd_case_t * +fmd_case_open(fmd_hdl_t *hdl, void *data) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + uuid_t uuid; + + fmd_case_t *cp; + + cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); + cp->ci_mod = hdl; + cp->ci_state = FMD_CASE_UNSOLVED; + cp->ci_flags = FMD_CF_DIRTY; + cp->ci_data = data; + cp->ci_bufptr = NULL; + cp->ci_bufsiz = 0; + + uuid_generate(uuid); + uuid_unparse(uuid, cp->ci_uuid); + + fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); + mp->mod_stats.ms_caseopen.fmds_value.ui64++; + + return (cp); +} + +void +fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + /* + * For ZED, the event was already sent from fmd_case_add_suspect() + */ + + if (cp->ci_state >= FMD_CASE_SOLVED) + fmd_hdl_debug(hdl, "case is already solved or closed"); + + cp->ci_state = FMD_CASE_SOLVED; + + fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); + mp->mod_stats.ms_casesolved.fmds_value.ui64++; +} + +void +fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); + + if (ops->fmdo_close != NULL) + ops->fmdo_close(hdl, cp); + + mp->mod_stats.ms_caseopen.fmds_value.ui64--; + mp->mod_stats.ms_caseclosed.fmds_value.ui64++; + + if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) + fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz); + + fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); +} + +void +fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) +{ + fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); +} + +int +fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE); +} + +void +fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) +{ +} + +static void +zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) +{ + nvlist_t *rsrc; + char *strval; + uint64_t guid; + uint8_t byte; + + zed_log_msg(LOG_INFO, "\nzed_fault_event:"); + + if (uuid != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); + if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); + if (code != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); + if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { + if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, + strval); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, + guid); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, + guid); + } +} + +static const char * +fmd_fault_mkcode(nvlist_t *fault) +{ + char *class, *code = "-"; + + /* + * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po + */ + if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { + if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) + code = "ZFS-8000-FD"; + else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) + code = "ZFS-8000-GH"; + else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) + code = "ZFS-8000-HC"; + else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0) + code = "ZFS-8000-JQ"; + else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) + code = "ZFS-8000-K4"; + else if (strcmp(class, "fault.fs.zfs.pool") == 0) + code = "ZFS-8000-CS"; + else if (strcmp(class, "fault.fs.zfs.device") == 0) + code = "ZFS-8000-D3"; + + } + return (code); +} + +void +fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) +{ + nvlist_t *nvl; + const char *code = fmd_fault_mkcode(fault); + int64_t tod[2]; + int err = 0; + + /* + * payload derived from fmd_protocol_list() + */ + + (void) gettimeofday(&cp->ci_tv, NULL); + tod[0] = cp->ci_tv.tv_sec; + tod[1] = cp->ci_tv.tv_usec; + + nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); + err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); + err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); + err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); + err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1); + + if (err) + zed_log_die("failed to populate nvlist"); + + zed_log_fault(fault, cp->ci_uuid, code); + zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); + + nvlist_free(nvl); + nvlist_free(fault); +} + +void +fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) +{ + cp->ci_data = data; +} + +void * +fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return (cp->ci_data); +} + +void +fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr == NULL); + assert(size < (1024 * 1024)); + + cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); + cp->ci_bufsiz = size; +} + +void +fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(size <= cp->ci_bufsiz); + + bcopy(cp->ci_bufptr, buf, size); +} + +void +fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, const void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(cp->ci_bufsiz >= size); + + bcopy(buf, cp->ci_bufptr, size); +} + +/* SERD Engines */ + +void +fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { + zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " + " name already exists", name); + return; + } + + (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); +} + +void +fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_eng_delete(&mp->mod_serds, name); + + fmd_hdl_debug(hdl, "serd_destroy %s", name); +} + +int +fmd_serd_exists(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); +} + +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return; + } + + fmd_serd_eng_reset(sgp); + + fmd_hdl_debug(hdl, "serd_reset %s", name); +} + +int +fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + int err; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", + name); + return (FMD_B_FALSE); + } + err = fmd_serd_eng_record(sgp, ep->ev_hrt); + + return (err); +} + +/* FMD Timers */ + +static void +_timer_notify(union sigval sv) +{ + fmd_timer_t *ftp = sv.sival_ptr; + fmd_hdl_t *hdl = ftp->ft_hdl; + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + struct itimerspec its; + + fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + + /* disarm the timer */ + bzero(&its, sizeof (struct itimerspec)); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + /* Note that the fmdo_timeout can remove this timer */ + if (ops->fmdo_timeout != NULL) + ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); +} + +/* + * Install a new timer which will fire at least delta nanoseconds after the + * current time. After the timeout has expired, the module's fmdo_timeout + * entry point is called. + */ +fmd_timer_t * +fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) +{ + struct sigevent sev; + struct itimerspec its; + fmd_timer_t *ftp; + + ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); + ftp->ft_arg = arg; + ftp->ft_hdl = hdl; + + its.it_value.tv_sec = delta / 1000000000; + its.it_value.tv_nsec = delta % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = _timer_notify; + sev.sigev_notify_attributes = NULL; + sev.sigev_value.sival_ptr = ftp; + + timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", + (int)its.it_value.tv_sec, ftp->ft_tid); + + return (ftp); +} + +void +fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) +{ + fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); + + timer_delete(ftp->ft_tid); + + fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); +} + +/* Name-Value Pair Lists */ + +nvlist_t * +fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, + nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) +{ + nvlist_t *nvl; + int err = 0; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + zed_log_die("failed to xalloc fault nvlist"); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, class); + err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty); + + if (asru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru); + if (fru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru); + if (resource != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource); + + if (err) + zed_log_die("failed to populate nvlist: %s\n", strerror(err)); + + return (nvl); +} + +/* + * sourced from fmd_string.c + */ +static int +fmd_strmatch(const char *s, const char *p) +{ + char c; + + if (p == NULL) + return (0); + + if (s == NULL) + s = ""; /* treat NULL string as the empty string */ + + do { + if ((c = *p++) == '\0') + return (*s == '\0'); + + if (c == '*') { + while (*p == '*') + p++; /* consecutive *'s can be collapsed */ + + if (*p == '\0') + return (1); + + while (*s != '\0') { + if (fmd_strmatch(s++, p) != 0) + return (1); + } + + return (0); + } + } while (c == *s++); + + return (0); +} + +int +fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern) +{ + char *class; + + return (nvl != NULL && + nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 && + fmd_strmatch(class, pattern)); +} + +nvlist_t * +fmd_nvl_alloc(fmd_hdl_t *hdl, int flags) +{ + nvlist_t *nvl = NULL; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + return (nvl); +} + + +/* + * ZED Agent specific APIs + */ + +fmd_hdl_t * +fmd_module_hdl(const char *name) +{ + if (strcmp(name, "zfs-retire") == 0) + return ((fmd_hdl_t *)&zfs_retire_module); + if (strcmp(name, "zfs-diagnosis") == 0) + return ((fmd_hdl_t *)&zfs_diagnosis_module); + + return (NULL); +} + +boolean_t +fmd_module_initialized(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_info != NULL); +} + +/* + * fmd_module_recv is called for each event that is received by + * the fault manager that has a class that matches one of the + * module's subscriptions. + */ +void +fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + fmd_event_t faux_event = {0}; + int64_t *tv; + uint_t n; + + /* + * Will need to normalized this if we persistently store the case data + */ + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0) + faux_event.ev_hrt = tv[0] * NANOSEC + tv[1]; + else + faux_event.ev_hrt = 0; + + ops->fmdo_recv(hdl, &faux_event, nvl, class); + + mp->mod_stats.ms_accepted.fmds_value.ui64++; + + /* TBD - should we initiate fm_module_gc() periodically? */ +} diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h new file mode 100644 index 000000000..4f06fb244 --- /dev/null +++ b/cmd/zed/agents/fmd_api.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _FMD_API_H +#define _FMD_API_H + +#include <sys/types.h> +#include <sys/time.h> +#include <time.h> +#include <libnvpair.h> +#include <stdarg.h> +#include <umem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fault Management Daemon Client Interfaces + */ + +#define FMD_API_VERSION 5 + +typedef struct fmd_hdl fmd_hdl_t; + +typedef struct fmd_timer { + timer_t ft_tid; + void *ft_arg; + fmd_hdl_t *ft_hdl; +} fmd_timer_t; + +#define id_t fmd_timer_t * + + +typedef struct fmd_event { + hrtime_t ev_hrt; /* event time used by SERD engines */ +} fmd_event_t; + +typedef struct fmd_case { + char ci_uuid[48]; /* uuid string for this case */ + fmd_hdl_t *ci_mod; /* module that owns this case */ + void *ci_data; /* data from fmd_case_setspecific() */ + ushort_t ci_state; /* case state (see below) */ + ushort_t ci_flags; /* case flags (see below) */ + struct timeval ci_tv; /* time of original diagnosis */ + void *ci_bufptr; /* case data serialization buffer */ + size_t ci_bufsiz; +} fmd_case_t; + + +#define FMD_B_FALSE 0 /* false value for booleans as int */ +#define FMD_B_TRUE 1 /* true value for booleans as int */ + + +#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ +#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ +#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ +#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ +#define FMD_CASE_REPAIRED 4 /* case is repaired */ +#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ + +#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ +#define FMD_CF_SOLVED 0x02 /* case has been solved */ +#define FMD_CF_ISOLATED 0x04 /* case has been isolated */ +#define FMD_CF_REPAIRED 0x08 /* case has been repaired */ +#define FMD_CF_RESOLVED 0x10 /* case has been resolved */ + + +#define FMD_TYPE_BOOL 0 /* int */ +#define FMD_TYPE_INT32 1 /* int32_t */ +#define FMD_TYPE_UINT32 2 /* uint32_t */ +#define FMD_TYPE_INT64 3 /* int64_t */ +#define FMD_TYPE_UINT64 4 /* uint64_t */ +#define FMD_TYPE_TIME 5 /* uint64_t */ +#define FMD_TYPE_SIZE 6 /* uint64_t */ + +typedef struct fmd_prop { + const char *fmdp_name; /* property name */ + uint_t fmdp_type; /* property type (see above) */ + const char *fmdp_defv; /* default value */ +} fmd_prop_t; + +typedef struct fmd_stat { + char fmds_name[32]; /* statistic name */ + uint_t fmds_type; /* statistic type (see above) */ + char fmds_desc[64]; /* statistic description */ + union { + int bool; /* FMD_TYPE_BOOL */ + int32_t i32; /* FMD_TYPE_INT32 */ + uint32_t ui32; /* FMD_TYPE_UINT32 */ + int64_t i64; /* FMD_TYPE_INT64 */ + uint64_t ui64; /* FMD_TYPE_UINT64 */ + } fmds_value; +} fmd_stat_t; + +typedef struct fmd_hdl_ops { + void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); + void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *); + void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *); + void (*fmdo_stats)(fmd_hdl_t *); + void (*fmdo_gc)(fmd_hdl_t *); +} fmd_hdl_ops_t; + +#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */ +#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */ +#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */ + +typedef struct fmd_hdl_info { + const char *fmdi_desc; /* fmd client description string */ + const char *fmdi_vers; /* fmd client version string */ + const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */ + const fmd_prop_t *fmdi_props; /* array of configuration props */ +} fmd_hdl_info_t; + +extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *); +extern void fmd_hdl_unregister(fmd_hdl_t *); + +extern void fmd_hdl_setspecific(fmd_hdl_t *, void *); +extern void *fmd_hdl_getspecific(fmd_hdl_t *); + +#define FMD_SLEEP UMEM_NOFAIL + +extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int); +extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int); +extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t); + +extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int); +extern void fmd_hdl_strfree(fmd_hdl_t *, char *); + +extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); +extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); + +extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); +extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); + +#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ +#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ + +extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *); +extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *); +extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *); + +extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *); +extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *); + +extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *); +extern void fmd_case_uuclose(fmd_hdl_t *, const char *); +extern int fmd_case_uuclosed(fmd_hdl_t *, const char *); +extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *); +extern void fmd_case_uuresolved(fmd_hdl_t *, const char *); + +extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *); +extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *); +extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *); + +extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *); +extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *); + +extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t); +extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *, + const char *, void *, size_t); +extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *, + const char *, const void *, size_t); +extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); + +extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); +extern void fmd_serd_destroy(fmd_hdl_t *, const char *); +extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern void fmd_serd_reset(fmd_hdl_t *, const char *); +extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); +extern int fmd_serd_fired(fmd_hdl_t *, const char *); +extern int fmd_serd_empty(fmd_hdl_t *, const char *); + +extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); +extern void fmd_timer_remove(fmd_hdl_t *, id_t); + +extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *, + const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *); + +extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *); + +#define FMD_HAS_FAULT_FRU 0 +#define FMD_HAS_FAULT_ASRU 1 +#define FMD_HAS_FAULT_RESOURCE 2 + +extern void fmd_repair_fru(fmd_hdl_t *, const char *); +extern int fmd_repair_asru(fmd_hdl_t *, const char *); + +extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int); +extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int); + +/* + * ZED Specific Interfaces + */ + +extern fmd_hdl_t *fmd_module_hdl(const char *); +extern boolean_t fmd_module_initialized(fmd_hdl_t *); +extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *); + +/* ZFS FMA Retire Agent */ +extern void _zfs_retire_init(fmd_hdl_t *); +extern void _zfs_retire_fini(fmd_hdl_t *); + +/* ZFS FMA Diagnosis Engine */ +extern void _zfs_diagnosis_init(fmd_hdl_t *); +extern void _zfs_diagnosis_fini(fmd_hdl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_API_H */ diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c new file mode 100644 index 000000000..6143c5d7e --- /dev/null +++ b/cmd/zed/agents/fmd_serd.c @@ -0,0 +1,313 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include <assert.h> +#include <stddef.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/list.h> +#include <sys/time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" +#include "../zed_log.h" + + +#define FMD_STR_BUCKETS 211 + + +#ifdef SERD_ENG_DEBUG +#define serd_log_msg(fmt, ...) \ + zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) +#else +#define serd_log_msg(fmt, ...) +#endif + + +/* + * SERD Engine Backend + */ + +/* + * Compute the delta between events in nanoseconds. To account for very old + * events which are replayed, we must handle the case where time is negative. + * We convert the hrtime_t's to unsigned 64-bit integers and then handle the + * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). + */ +static hrtime_t +fmd_event_delta(hrtime_t t1, hrtime_t t2) +{ + uint64_t old = t1; + uint64_t new = t2; + + return (new >= old ? new - old : (UINT64_MAX - old) + new + 1); +} + +static fmd_serd_eng_t * +fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) +{ + fmd_serd_eng_t *sgp; + + sgp = malloc(sizeof (fmd_serd_eng_t)); + bzero(sgp, sizeof (fmd_serd_eng_t)); + + sgp->sg_name = strdup(name); + sgp->sg_flags = FMD_SERD_DIRTY; + sgp->sg_n = n; + sgp->sg_t = t; + + list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), + offsetof(fmd_serd_elem_t, se_list)); + + return (sgp); +} + +static void +fmd_serd_eng_free(fmd_serd_eng_t *sgp) +{ + fmd_serd_eng_reset(sgp); + free(sgp->sg_name); + list_destroy(&sgp->sg_list); + free(sgp); +} + +/* + * sourced from fmd_string.c + */ +static ulong_t +fmd_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h); +} + +void +fmd_serd_hash_create(fmd_serd_hash_t *shp) +{ + shp->sh_hashlen = FMD_STR_BUCKETS; + shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); + shp->sh_count = 0; +} + +void +fmd_serd_hash_destroy(fmd_serd_hash_t *shp) +{ + fmd_serd_eng_t *sgp, *ngp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { + ngp = sgp->sg_next; + fmd_serd_eng_free(sgp); + } + } + + free(shp->sh_hash); + bzero(shp, sizeof (fmd_serd_hash_t)); +} + +void +fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) +{ + fmd_serd_eng_t *sgp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next) + func(sgp, arg); + } +} + +fmd_serd_eng_t * +fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, + uint_t n, hrtime_t t) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); + + serd_log_msg(" SERD Engine: inserting %s N %d T %llu", + name, (int)n, (long long unsigned)t); + + sgp->sg_next = shp->sh_hash[h]; + shp->sh_hash[h] = sgp; + shp->sh_count++; + + return (sgp); +} + +fmd_serd_eng_t * +fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp; + + for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(name, sgp->sg_name) == 0) + return (sgp); + } + + return (NULL); +} + +void +fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; + + serd_log_msg(" SERD Engine: deleting %s", name); + + for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(sgp->sg_name, name) != 0) + pp = &sgp->sg_next; + else + break; + } + + if (sgp != NULL) { + *pp = sgp->sg_next; + fmd_serd_eng_free(sgp); + assert(shp->sh_count != 0); + shp->sh_count--; + } +} + +static void +fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) +{ + list_remove(&sgp->sg_list, sep); + sgp->sg_count--; + + serd_log_msg(" SERD Engine: discarding %s, %d remaining", + sgp->sg_name, (int)sgp->sg_count); + + free(sep); +} + +int +fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) +{ + fmd_serd_elem_t *sep, *oep; + + /* + * If the fired flag is already set, return false and discard the + * event. This means that the caller will only see the engine "fire" + * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() + * function can also be used in combination with fmd_serd_eng_record(). + */ + if (sgp->sg_flags & FMD_SERD_FIRED) { + serd_log_msg(" SERD Engine: record %s already fired!", + sgp->sg_name); + return (FMD_B_FALSE); + } + + while (sgp->sg_count >= sgp->sg_n) + fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); + + sep = malloc(sizeof (fmd_serd_elem_t)); + sep->se_hrt = hrt; + + list_insert_head(&sgp->sg_list, sep); + sgp->sg_count++; + + serd_log_msg(" SERD Engine: recording %s of %d (%llu)", + sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); + + /* + * Pick up the oldest element pointer for comparison to 'sep'. We must + * do this after adding 'sep' because 'oep' and 'sep' can be the same. + */ + oep = list_tail(&sgp->sg_list); + + if (sgp->sg_count >= sgp->sg_n && + fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { + sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; + serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); + return (FMD_B_TRUE); + } + + sgp->sg_flags |= FMD_SERD_DIRTY; + return (FMD_B_FALSE); +} + +int +fmd_serd_eng_fired(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_flags & FMD_SERD_FIRED); +} + +int +fmd_serd_eng_empty(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_count == 0); +} + +void +fmd_serd_eng_reset(fmd_serd_eng_t *sgp) +{ + serd_log_msg(" SERD Engine: reseting %s", sgp->sg_name); + + while (sgp->sg_count != 0) + fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); + + sgp->sg_flags &= ~FMD_SERD_FIRED; + sgp->sg_flags |= FMD_SERD_DIRTY; +} + +void +fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +{ + fmd_serd_elem_t *sep, *nep; + hrtime_t hrt; + + if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) + return; /* no garbage collection needed if empty or fired */ + + sep = list_head(&sgp->sg_list); + hrt = sep->se_hrt - sgp->sg_t; + + for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { + if (sep->se_hrt >= hrt) + break; /* sep and subsequent events are all within T */ + + nep = list_next(&sgp->sg_list, sep); + fmd_serd_eng_discard(sgp, sep); + sgp->sg_flags |= FMD_SERD_DIRTY; + } +} diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h new file mode 100644 index 000000000..c35c9acc7 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _FMD_SERD_H +#define _FMD_SERD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/list.h> +#include <sys/time.h> + +typedef struct fmd_serd_elem { + list_node_t se_list; /* linked list forward/back pointers */ + hrtime_t se_hrt; /* upper bound on event hrtime */ +} fmd_serd_elem_t; + +typedef struct fmd_serd_eng { + char *sg_name; /* string name for this engine */ + struct fmd_serd_eng *sg_next; /* next engine on hash chain */ + list_t sg_list; /* list of fmd_serd_elem_t's */ + uint_t sg_count; /* count of events in sg_list */ + uint_t sg_flags; /* engine flags (see below) */ + uint_t sg_n; /* engine N parameter (event count) */ + hrtime_t sg_t; /* engine T parameter (nanoseconds) */ +} fmd_serd_eng_t; + +#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */ +#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */ + +typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *); + +typedef struct fmd_serd_hash { + fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */ + uint_t sh_hashlen; /* length of hash bucket array */ + uint_t sh_count; /* count of engines in hash */ +} fmd_serd_hash_t; + +extern void fmd_serd_hash_create(fmd_serd_hash_t *); +extern void fmd_serd_hash_destroy(fmd_serd_hash_t *); +extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *); + +extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *, + const char *, uint32_t, hrtime_t); + +extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *); +extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *); + +extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t); +extern int fmd_serd_eng_fired(fmd_serd_eng_t *); +extern int fmd_serd_eng_empty(fmd_serd_eng_t *); + +extern void fmd_serd_eng_reset(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_SERD_H */ diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c new file mode 100644 index 000000000..8779d5945 --- /dev/null +++ b/cmd/zed/agents/zfs_agents.c @@ -0,0 +1,368 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#include <libnvpair.h> +#include <libzfs.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/list.h> +#include <sys/time.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> +#include <pthread.h> +#include <unistd.h> + +#include "zfs_agents.h" +#include "fmd_api.h" +#include "../zed_log.h" + +/* + * agent dispatch code + */ + +static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER; +static list_t agent_events; /* list of pending events */ +static int agent_exiting; + +typedef struct agent_event { + char ae_class[64]; + char ae_subclass[32]; + nvlist_t *ae_nvl; + list_node_t ae_node; +} agent_event_t; + +pthread_t g_agents_tid; + +libzfs_handle_t *g_zfs_hdl; + +/* guid search data */ +typedef struct guid_search { + uint64_t gs_pool_guid; + uint64_t gs_vdev_guid; + char *gs_devid; +} guid_search_t; + +static void +zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) +{ + guid_search_t *gsp = arg; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_agent_iter_vdev(zhp, child[c], gsp); + return; + } + /* + * On a devid match, grab the vdev guid + */ + if ((gsp->gs_vdev_guid == 0) && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + (strcmp(gsp->gs_devid, path) == 0)) { + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &gsp->gs_vdev_guid); + } +} + +static int +zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) +{ + guid_search_t *gsp = arg; + nvlist_t *config, *nvl; + + /* + * For each vdev in this pool, look for a match by devid + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvl) == 0) { + zfs_agent_iter_vdev(zhp, nvl, gsp); + } + } + /* + * if a match was found then grab the pool guid + */ + if (gsp->gs_vdev_guid) { + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &gsp->gs_pool_guid); + } + + zpool_close(zhp); + return (gsp->gs_vdev_guid != 0); +} + +void +zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + agent_event_t *event; + + if (subclass == NULL) + subclass = ""; + + event = malloc(sizeof (agent_event_t)); + if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) { + if (event) + free(event); + return; + } + + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + class = EC_ZFS; + subclass = ESC_ZFS_VDEV_CHECK; + } + + /* + * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED + * ereport from vdev_disk layer after a hot unplug. Fortunately we + * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * proxy so we remap it here for the benefit of the diagnosis engine. + */ + if ((strcmp(class, EC_DEV_REMOVE) == 0) && + (strcmp(subclass, ESC_DISK) == 0) && + (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) || + nvlist_exists(nvl, DEV_IDENTIFIER))) { + nvlist_t *payload = event->ae_nvl; + struct timeval tv; + int64_t tod[2]; + uint64_t pool_guid = 0, vdev_guid = 0; + + class = "resource.fs.zfs.removed"; + subclass = ""; + + (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); + + /* + * For multipath, ZFS_EV_VDEV_GUID is missing so find it. + */ + if (vdev_guid == 0) { + guid_search_t search = { 0 }; + + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid); + + (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, + &search); + pool_guid = search.gs_pool_guid; + vdev_guid = search.gs_vdev_guid; + } + + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid); + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid); + + (void) gettimeofday(&tv, NULL); + tod[0] = tv.tv_sec; + tod[1] = tv.tv_usec; + (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); + + zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'", + EC_DEV_REMOVE, class); + } + + (void) strlcpy(event->ae_class, class, sizeof (event->ae_class)); + (void) strlcpy(event->ae_subclass, subclass, + sizeof (event->ae_subclass)); + + (void) pthread_mutex_lock(&agent_lock); + list_insert_tail(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); + + (void) pthread_cond_signal(&agent_cond); +} + +static void +zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl) +{ + /* + * The diagnosis engine subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf + */ + if (strstr(class, "ereport.fs.zfs.") != NULL || + strstr(class, "resource.fs.zfs.") != NULL || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 || + strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class); + } + + /* + * The retire agent subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-retire.conf + * + * NOTE: faults events come directy from our diagnosis engine + * and will not pass through the zfs kernel module. + */ + if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || + strcmp(class, "resource.fs.zfs.removed") == 0 || + strcmp(class, "resource.fs.zfs.statechange") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class); + } + + /* + * The SLM module only consumes disk events and vdev check events + * + * NOTE: disk events come directly from disk monitor and will + * not pass through the zfs kernel module. + */ + if (strstr(class, "EC_dev_") != NULL || + strcmp(class, EC_ZFS) == 0) { + (void) zfs_slm_event(class, subclass, nvl); + } +} + +/* + * Events are consumed and dispatched from this thread + * An agent can also post an event so event list lock + * is not held when calling an agent. + * One event is consumed at a time. + */ +static void * +zfs_agent_consumer_thread(void *arg) +{ + for (;;) { + agent_event_t *event; + + (void) pthread_mutex_lock(&agent_lock); + + /* wait for an event to show up */ + while (!agent_exiting && list_is_empty(&agent_events)) + (void) pthread_cond_wait(&agent_cond, &agent_lock); + + if (agent_exiting) { + (void) pthread_mutex_unlock(&agent_lock); + zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: " + "exiting"); + return (NULL); + } + + if ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); + + (void) pthread_mutex_unlock(&agent_lock); + + /* dispatch to all event subscribers */ + zfs_agent_dispatch(event->ae_class, event->ae_subclass, + event->ae_nvl); + + nvlist_free(event->ae_nvl); + free(event); + continue; + } + + (void) pthread_mutex_unlock(&agent_lock); + } + + return (NULL); +} + +void +zfs_agent_init(libzfs_handle_t *zfs_hdl) +{ + fmd_hdl_t *hdl; + + g_zfs_hdl = zfs_hdl; + + if (zfs_slm_init() != 0) + zed_log_die("Failed to initialize zfs slm"); + zed_log_msg(LOG_INFO, "Add Agent: init"); + + hdl = fmd_module_hdl("zfs-diagnosis"); + _zfs_diagnosis_init(hdl); + if (!fmd_module_initialized(hdl)) + zed_log_die("Failed to initialize zfs diagnosis"); + + hdl = fmd_module_hdl("zfs-retire"); + _zfs_retire_init(hdl); + if (!fmd_module_initialized(hdl)) + zed_log_die("Failed to initialize zfs retire"); + + list_create(&agent_events, sizeof (agent_event_t), + offsetof(struct agent_event, ae_node)); + + if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread, + NULL) != 0) { + list_destroy(&agent_events); + zed_log_die("Failed to initialize agents"); + } +} + +void +zfs_agent_fini(void) +{ + fmd_hdl_t *hdl; + agent_event_t *event; + + agent_exiting = 1; + (void) pthread_cond_signal(&agent_cond); + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_agents_tid, NULL); + + /* drain any pending events */ + while ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); + nvlist_free(event->ae_nvl); + free(event); + } + + list_destroy(&agent_events); + + if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) { + _zfs_retire_fini(hdl); + fmd_hdl_unregister(hdl); + } + if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) { + _zfs_diagnosis_fini(hdl); + fmd_hdl_unregister(hdl); + } + + zed_log_msg(LOG_INFO, "Add Agent: fini"); + zfs_slm_fini(); + + g_zfs_hdl = NULL; +} + +/* + * In ZED context, all the FMA agents run in the same thread + * and do not require a unique libzfs instance. Modules should + * use these stubs. + */ +libzfs_handle_t * +__libzfs_init(void) +{ + return (g_zfs_hdl); +} + +void +__libzfs_fini(libzfs_handle_t *hdl) +{ +} diff --git a/cmd/zed/agents/zfs_agents.h b/cmd/zed/agents/zfs_agents.h index 4630f2212..3c9af54c9 100644 --- a/cmd/zed/agents/zfs_agents.h +++ b/cmd/zed/agents/zfs_agents.h @@ -26,29 +26,25 @@ extern "C" { #endif /* - * Agents from ZFS FMA and syseventd - linked directly into ZED daemon binary + * Agent abstraction presented to ZED */ +extern void zfs_agent_init(libzfs_handle_t *); +extern void zfs_agent_fini(void); +extern void zfs_agent_post_event(const char *, const char *, nvlist_t *); /* * ZFS Sysevent Linkable Module (SLM) */ -extern int zfs_slm_init(libzfs_handle_t *zfs_hdl); +extern int zfs_slm_init(void); extern void zfs_slm_fini(void); extern void zfs_slm_event(const char *, const char *, nvlist_t *); /* - * ZFS FMA Retire Agent + * In ZED context, all the FMA agents run in the same thread + * and do not require a unique libzfs instance. */ -extern int zfs_retire_init(libzfs_handle_t *zfs_hdl); -extern void zfs_retire_fini(void); -extern void zfs_retire_recv(nvlist_t *nvl, const char *class); - -/* - * ZFS FMA Diagnosis Engine - */ -extern int zfs_diagnosis_init(libzfs_handle_t *zfs_hdl); -extern void zfs_diagnosis_fini(void); -extern void zfs_diagnosis_recv(nvlist_t *nvl, const char *class); +extern libzfs_handle_t *__libzfs_init(void); +extern void __libzfs_fini(libzfs_handle_t *); #ifdef __cplusplus } diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index 4d534a4d3..f10a42ffd 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -21,27 +21,1022 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, Intel Corporation. */ +#include <stddef.h> +#include <strings.h> +#include <libuutil.h> +#include <libzfs.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> + #include "zfs_agents.h" -#include "../zed_log.h" +#include "fmd_api.h" +/* + * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This + * #define reserves enough space for two 64-bit hex values plus the length of + * the longest string. + */ +#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) -/*ARGSUSED*/ -void -zfs_diagnosis_recv(nvlist_t *nvl, const char *class) +/* + * On-disk case structure. This must maintain backwards compatibility with + * previous versions of the DE. By default, any members appended to the end + * will be filled with zeros if they don't exist in a previous version. + */ +typedef struct zfs_case_data { + uint64_t zc_version; + uint64_t zc_ena; + uint64_t zc_pool_guid; + uint64_t zc_vdev_guid; + int zc_pool_state; + char zc_serd_checksum[MAX_SERDLEN]; + char zc_serd_io[MAX_SERDLEN]; + int zc_has_remove_timer; +} zfs_case_data_t; + +/* + * Time-of-day + */ +typedef struct er_timeval { + uint64_t ertv_sec; + uint64_t ertv_nsec; +} er_timeval_t; + +/* + * In-core case structure. + */ +typedef struct zfs_case { + boolean_t zc_present; + uint32_t zc_version; + zfs_case_data_t zc_data; + fmd_case_t *zc_case; + uu_list_node_t zc_node; + id_t zc_remove_timer; + char *zc_fru; + er_timeval_t zc_when; +} zfs_case_t; + +#define CASE_DATA "data" +#define CASE_FRU "fru" +#define CASE_DATA_VERSION_INITIAL 1 +#define CASE_DATA_VERSION_SERD 2 + +typedef struct zfs_de_stats { + fmd_stat_t old_drops; + fmd_stat_t dev_drops; + fmd_stat_t vdev_drops; + fmd_stat_t import_drops; + fmd_stat_t resource_drops; +} zfs_de_stats_t; + +zfs_de_stats_t zfs_stats = { + { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" }, + { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"}, + { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"}, + { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" }, + { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } +}; + +static hrtime_t zfs_remove_timeout; + +uu_list_pool_t *zfs_case_pool; +uu_list_t *zfs_cases; + +#define ZFS_MAKE_RSRC(type) \ + FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type +#define ZFS_MAKE_EREPORT(type) \ + FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type + +/* + * Write out the persistent representation of an active case. + */ +static void +zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp) { + zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD; +} + +/* + * Read back the persistent representation of an active case. + */ +static zfs_case_t * +zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + zfs_case_t *zcp; + + zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP); + zcp->zc_case = cp; + + fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, + sizeof (zcp->zc_data)); + + if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + return (NULL); + } + + /* + * fmd_buf_read() will have already zeroed out the remainder of the + * buffer, so we don't have to do anything special if the version + * doesn't include the SERD engine name. + */ + + if (zcp->zc_data.zc_has_remove_timer) + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, + NULL, zfs_remove_timeout); + + uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool); + (void) uu_list_insert_before(zfs_cases, NULL, zcp); + + fmd_case_setspecific(hdl, cp, zcp); + + return (zcp); +} + +/* + * Iterate over any active cases. If any cases are associated with a pool or + * vdev which is no longer present on the system, close the associated case. + */ +static void +zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded) +{ + uint64_t vdev_guid; + uint_t c, children; + nvlist_t **child; + zfs_case_t *zcp; + int ret; + + ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); + assert(ret == 0); + + /* + * Mark any cases associated with this (pool, vdev) pair. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == vdev_guid) { + zcp->zc_present = B_TRUE; + zcp->zc_when = *loaded; + } + } + + /* + * Iterate over all children. + */ + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } } /*ARGSUSED*/ -int -zfs_diagnosis_init(libzfs_handle_t *zfs_hdl) +static int +zfs_mark_pool(zpool_handle_t *zhp, void *unused) +{ + zfs_case_t *zcp; + uint64_t pool_guid; + uint64_t *tod; + er_timeval_t loaded = { 0 }; + nvlist_t *config, *vd; + uint_t nelem = 0; + int ret; + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + /* + * Mark any cases associated with just this pool. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) + zcp->zc_present = B_TRUE; + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem); + if (nelem == 2) { + loaded.ertv_sec = tod[0]; + loaded.ertv_nsec = tod[1]; + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) { + zcp->zc_when = loaded; + } + } + } + + ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); + assert(ret == 0); + + zfs_mark_vdev(pool_guid, vd, &loaded); + + zpool_close(zhp); + + return (0); +} + +struct load_time_arg { + uint64_t lt_guid; + er_timeval_t *lt_time; + boolean_t lt_found; +}; + +static int +zpool_find_load_time(zpool_handle_t *zhp, void *arg) { + struct load_time_arg *lta = arg; + uint64_t pool_guid; + uint64_t *tod; + nvlist_t *config; + uint_t nelem; + + if (lta->lt_found) { + zpool_close(zhp); + return (0); + } + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + if (pool_guid != lta->lt_guid) { + zpool_close(zhp); + return (0); + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem) == 0 && nelem == 2) { + lta->lt_found = B_TRUE; + lta->lt_time->ertv_sec = tod[0]; + lta->lt_time->ertv_nsec = tod[1]; + } + + zpool_close(zhp); + return (0); } +static void +zfs_purge_cases(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * There is no way to open a pool by GUID, or lookup a vdev by GUID. No + * matter what we do, we're going to have to stomach an O(vdevs * cases) + * algorithm. In reality, both quantities are likely so small that + * neither will matter. Given that iterating over pools is more + * expensive than iterating over the in-memory case list, we opt for a + * 'present' flag in each case that starts off cleared. We then iterate + * over all pools, marking those that are still present, and removing + * those that aren't found. + * + * Note that we could also construct an FMRI and rely on + * fmd_nvl_fmri_present(), but this would end up doing the same search. + */ + + /* + * Mark the cases as not present. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) + zcp->zc_present = B_FALSE; + + /* + * Iterate over all pools and mark the pools and vdevs found. If this + * fails (most probably because we're out of memory), then don't close + * any of the cases and we cannot be sure they are accurate. + */ + if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) + return; + + /* + * Remove those cases which were not found. + */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + if (!zcp->zc_present) + fmd_case_close(hdl, zcp->zc_case); + } + uu_list_walk_end(walk); +} + +/* + * Construct the name of a serd engine given the pool/vdev GUID and type (io or + * checksum). + */ +static void +zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, + const char *type) +{ + (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", + (long long unsigned int)pool_guid, + (long long unsigned int)vdev_guid, type); +} + +/* + * Solve a given ZFS case. This first checks to make sure the diagnosis is + * still valid, as well as cleaning up any pending timer associated with the + * case. + */ +static void +zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname, + boolean_t checkunusable) +{ + nvlist_t *detector, *fault; + boolean_t serialize; + nvlist_t *fru = NULL; +#ifdef _HAS_FMD_TOPO + nvlist_t *fmri + topo_hdl_t *thp; + int err; +#endif + fmd_hdl_debug(hdl, "solving fault '%s'", faultname); + + /* + * Construct the detector from the case data. The detector is in the + * ZFS scheme, and is either the pool or the vdev, depending on whether + * this is a vdev or pool fault. + */ + detector = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, + zcp->zc_data.zc_pool_guid); + if (zcp->zc_data.zc_vdev_guid != 0) { + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, + zcp->zc_data.zc_vdev_guid); + } + +#ifdef _HAS_FMD_TOPO + /* + * We also want to make sure that the detector (pool or vdev) properly + * reflects the diagnosed state, when the fault corresponds to internal + * ZFS state (i.e. not checksum or I/O error-induced). Otherwise, a + * device which was unavailable early in boot (because the driver/file + * wasn't available) and is now healthy will be mis-diagnosed. + */ + if (!fmd_nvl_fmri_present(hdl, detector) || + (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) { + fmd_case_close(hdl, zcp->zc_case); + nvlist_free(detector); + return; + } + + + fru = NULL; + if (zcp->zc_fru != NULL && + (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) { + /* + * If the vdev had an associated FRU, then get the FRU nvlist + * from the topo handle and use that in the suspect list. We + * explicitly lookup the FRU because the fmri reported from the + * kernel may not have up to date details about the disk itself + * (serial, part, etc). + */ + if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) { + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * If the disk is part of the system chassis, but the + * FRU indicates a different chassis ID than our + * current system, then ignore the error. This + * indicates that the device was part of another + * cluster head, and for obvious reasons cannot be + * imported on this system. + */ + if (libzfs_fru_notself(zhdl, zcp->zc_fru)) { + fmd_case_close(hdl, zcp->zc_case); + nvlist_free(fmri); + fmd_hdl_topo_rele(hdl, thp); + nvlist_free(detector); + return; + } + + /* + * If the device is no longer present on the system, or + * topo_fmri_fru() fails for other reasons, then fall + * back to the fmri specified in the vdev. + */ + if (topo_fmri_fru(thp, fmri, &fru, &err) != 0) + fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP); + nvlist_free(fmri); + } + + fmd_hdl_topo_rele(hdl, thp); + } +#endif + fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, + fru, detector); + fmd_case_add_suspect(hdl, zcp->zc_case, fault); + + nvlist_free(fru); + + fmd_case_solve(hdl, zcp->zc_case); + + serialize = B_FALSE; + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + serialize = B_TRUE; + } + if (serialize) + zfs_case_serialize(hdl, zcp); + + nvlist_free(detector); +} + +static boolean_t +timeval_earlier(er_timeval_t *a, er_timeval_t *b) +{ + return (a->ertv_sec < b->ertv_sec || + (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec)); +} + +/*ARGSUSED*/ +static void +zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) +{ + int64_t *tod; + uint_t nelem; + + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod, + &nelem) == 0 && nelem == 2) { + when->ertv_sec = tod[0]; + when->ertv_nsec = tod[1]; + } else { + when->ertv_sec = when->ertv_nsec = UINT64_MAX; + } +} + +/* + * Main fmd entry point. + */ /*ARGSUSED*/ +static void +zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) +{ + zfs_case_t *zcp, *dcp; + int32_t pool_state; + uint64_t ena, pool_guid, vdev_guid; + er_timeval_t pool_load; + er_timeval_t er_when; + nvlist_t *detector; + boolean_t pool_found = B_FALSE; + boolean_t isresource; + char *type; + + /* + * We subscribe to notifications for vdev or pool removal. In these + * cases, there may be cases that no longer apply. Purge any cases + * that no longer apply. + */ + if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) { + fmd_hdl_debug(hdl, "purging orphaned cases from %s", + strrchr(class, '.') + 1); + zfs_purge_cases(hdl); + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); + + if (isresource) { + /* + * For resources, we don't have a normal payload. + */ + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + pool_state = SPA_LOAD_OPEN; + else + pool_state = SPA_LOAD_NONE; + detector = NULL; + } else { + (void) nvlist_lookup_nvlist(nvl, + FM_EREPORT_DETECTOR, &detector); + (void) nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state); + } + + /* + * We also ignore all ereports generated during an import of a pool, + * since the only possible fault (.pool) would result in import failure, + * and hence no persistent fault. Some day we may want to do something + * with these ereports, so we continue generating them internally. + */ + if (pool_state == SPA_LOAD_IMPORT) { + zfs_stats.import_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "ignoring '%s' during import", class); + return; + } + + /* + * Device I/O errors are ignored during pool open. + */ + if (pool_state == SPA_LOAD_OPEN && + (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) { + fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class); + zfs_stats.dev_drops.fmds_value.ui64++; + return; + } + + /* + * We ignore ereports for anything except disks and files. + */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &type) == 0) { + if (strcmp(type, VDEV_TYPE_DISK) != 0 && + strcmp(type, VDEV_TYPE_FILE) != 0) { + zfs_stats.vdev_drops.fmds_value.ui64++; + return; + } + } + + /* + * Determine if this ereport corresponds to an open case. + * Each vdev or pool can have a single case. + */ + (void) nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) + ena = 0; + + zfs_ereport_when(hdl, nvl, &er_when); + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid) { + pool_found = B_TRUE; + pool_load = zcp->zc_when; + } + if (zcp->zc_data.zc_vdev_guid == vdev_guid) + break; + } + + /* + * Avoid falsely accusing a pool of being faulty. Do so by + * not replaying ereports that were generated prior to the + * current import. If the failure that generated them was + * transient because the device was actually removed but we + * didn't receive the normal asynchronous notification, we + * don't want to mark it as faulted and potentially panic. If + * there is still a problem we'd expect not to be able to + * import the pool, or that new ereports will be generated + * once the pool is used. + */ + if (pool_found && timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, er_when.ertv_nsec, + pool_load.ertv_sec, pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + + if (!pool_found) { + /* + * Haven't yet seen this pool, but same situation + * may apply. + */ + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + struct load_time_arg la; + + la.lt_guid = pool_guid; + la.lt_time = &pool_load; + la.lt_found = B_FALSE; + + if (zhdl != NULL && + zpool_iter(zhdl, zpool_find_load_time, &la) == 0 && + la.lt_found == B_TRUE) { + pool_found = B_TRUE; + + if (timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, " + "pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, + er_when.ertv_nsec, pool_load.ertv_sec, + pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + } + } + + if (zcp == NULL) { + fmd_case_t *cs; + zfs_case_data_t data = { 0 }; + + /* + * If this is one of our 'fake' resource ereports, and there is + * no case open, simply discard it. + */ + if (isresource) { + zfs_stats.resource_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", + class, vdev_guid); + return; + } + + /* + * Skip tracking some ereports + */ + if (strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Open a new case. + */ + cs = fmd_case_open(hdl, NULL); + + fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'", + vdev_guid, class); + + /* + * Initialize the case buffer. To commonize code, we actually + * create the buffer with existing data, and then call + * zfs_case_unserialize() to instantiate the in-core structure. + */ + fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); + + data.zc_version = CASE_DATA_VERSION_SERD; + data.zc_ena = ena; + data.zc_pool_guid = pool_guid; + data.zc_vdev_guid = vdev_guid; + data.zc_pool_state = (int)pool_state; + + fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); + + zcp = zfs_case_unserialize(hdl, cs); + assert(zcp != NULL); + if (pool_found) + zcp->zc_when = pool_load; + } + + if (isresource) { + fmd_hdl_debug(hdl, "resource event '%s'", class); + + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) { + /* + * The 'resource.fs.zfs.autoreplace' event indicates + * that the pool was loaded with the 'autoreplace' + * property set. In this case, any pending device + * failures should be ignored, as the asynchronous + * autoreplace handling will take care of them. + */ + fmd_case_close(hdl, zcp->zc_case); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) { + /* + * The 'resource.fs.zfs.removed' event indicates that + * device removal was detected, and the device was + * closed asynchronously. If this is the case, we + * assume that any recent I/O errors were due to the + * device removal, not any fault of the device itself. + * We reset the SERD engine, and cancel any pending + * timers. + */ + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + zfs_case_serialize(hdl, zcp); + } + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_checksum); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { + uint64_t state = 0; + + if (zcp != NULL && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 && + state == VDEV_STATE_HEALTHY) { + fmd_hdl_debug(hdl, "closing case after a " + "device statechange to healthy"); + fmd_case_close(hdl, zcp->zc_case); + } + } + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Associate the ereport with this case. + */ + fmd_case_add_ereport(hdl, zcp->zc_case, ep); + + /* + * Don't do anything else if this case is already solved. + */ + if (fmd_case_solved(hdl, zcp->zc_case)) + return; + + fmd_hdl_debug(hdl, "error event '%s'", class); + + /* + * Determine if we should solve the case and generate a fault. We solve + * a case if: + * + * a. A pool failed to open (ereport.fs.zfs.pool) + * b. A device failed to open (ereport.fs.zfs.pool) while a pool + * was up and running. + * + * We may see a series of ereports associated with a pool open, all + * chained together by the same ENA. If the pool open succeeds, then + * we'll see no further ereports. To detect when a pool open has + * succeeded, we associate a timer with the event. When it expires, we + * close the case. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) { + /* + * Pool level fault. Before solving the case, go through and + * close any open device cases that may be pending. + */ + for (dcp = uu_list_first(zfs_cases); dcp != NULL; + dcp = uu_list_next(zfs_cases, dcp)) { + if (dcp->zc_data.zc_pool_guid == + zcp->zc_data.zc_pool_guid && + dcp->zc_data.zc_vdev_guid != 0) + fmd_case_close(hdl, dcp->zc_case); + } + + zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) { + /* + * Pool level fault for reading the intent logs. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) { + /* + * Device fault. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { + char *failmode = NULL; + boolean_t checkremove = B_FALSE; + + /* + * If this is a checksum or I/O error, then toss it into the + * appropriate SERD engine and check to see if it has fired. + * Ideally, we want to do something more sophisticated, + * (persistent errors for a single data block, etc). For now, + * a single SERD engine is sufficient. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) { + if (zcp->zc_data.zc_serd_io[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_io, + pool_guid, vdev_guid, "io"); + fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, + fmd_prop_get_int32(hdl, "io_N"), + fmd_prop_get_int64(hdl, "io_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) + checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_checksum, + pool_guid, vdev_guid, "checksum"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_checksum, + fmd_prop_get_int32(hdl, "checksum_N"), + fmd_prop_get_int64(hdl, "checksum_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, + zcp->zc_data.zc_serd_checksum, ep)) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.checksum", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) && + (nvlist_lookup_string(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) && + failmode != NULL) { + if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE, + strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_continue", + B_FALSE); + } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT, + strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_wait", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { +#ifndef __linux__ + /* This causes an unexpected fault diagnosis on linux */ + checkremove = B_TRUE; +#endif + } + + /* + * Because I/O errors may be due to device removal, we postpone + * any diagnosis until we're sure that we aren't about to + * receive a 'resource.fs.zfs.removed' event. + */ + if (checkremove) { + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_remove_timeout); + if (!zcp->zc_data.zc_has_remove_timer) { + zcp->zc_data.zc_has_remove_timer = 1; + zfs_case_serialize(hdl, zcp); + } + } + } +} + +/* + * The timeout is fired when we diagnosed an I/O error, and it was not due to + * device removal (which would cause the timeout to be cancelled). + */ +/* ARGSUSED */ +static void +zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) +{ + zfs_case_t *zcp = data; + + if (id == zcp->zc_remove_timer) + zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE); +} + +/* + * The specified case has been closed and any case-specific + * data structures should be deallocated. + */ +static void +zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) +{ + zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); + + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); +} + +/* + * We use the fmd gc entry point to look for old cases that no longer apply. + * This allows us to keep our set of case data small in a long running system. + */ +static void +zfs_fm_gc(fmd_hdl_t *hdl) +{ + zfs_purge_cases(hdl); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_fm_recv, /* fmdo_recv */ + zfs_fm_timeout, /* fmdo_timeout */ + zfs_fm_close, /* fmdo_close */ + NULL, /* fmdo_stats */ + zfs_fm_gc, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "checksum_N", FMD_TYPE_UINT32, "10" }, + { "checksum_T", FMD_TYPE_TIME, "10min" }, + { "io_N", FMD_TYPE_UINT32, "10" }, + { "io_T", FMD_TYPE_TIME, "10min" }, + { "remove_timeout", FMD_TYPE_TIME, "15sec" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props +}; + void -zfs_diagnosis_fini(void) +_zfs_diagnosis_init(fmd_hdl_t *hdl) { + libzfs_handle_t *zhdl; + + if ((zhdl = __libzfs_init()) == NULL) + return; + + if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", + sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), + NULL, UU_LIST_POOL_DEBUG)) == NULL) { + __libzfs_fini(zhdl); + return; + } + + if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, + UU_LIST_DEBUG)) == NULL) { + uu_list_pool_destroy(zfs_case_pool); + __libzfs_fini(zhdl); + return; + } + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + __libzfs_fini(zhdl); + return; + } + + fmd_hdl_setspecific(hdl, zhdl); + + (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / + sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); + + zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); +} + +void +_zfs_diagnosis_fini(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl; + + /* + * Remove all active cases. + */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + fmd_hdl_debug(hdl, "removing case ena %llu", + (long long unsigned)zcp->zc_data.zc_ena); + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + } + uu_list_walk_end(walk); + + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + __libzfs_fini(zhdl); } diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 70548571a..053312e9e 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -168,7 +168,7 @@ zfs_unavail_pool(zpool_handle_t *zhp, void *data) * operation when finished). If this succeeds, then we're done. If it fails, * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, * but that the label was not what we expected. If the 'autoreplace' property - * is not set, then we relabel the disk (if specified), and attempt a 'zpool + * is enabled, then we relabel the disk (if specified), and attempt a 'zpool * replace'. If the online is successful, but the new state is something else * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of * race, and we should avoid attempting to relabel the disk. @@ -261,16 +261,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } /* - * If the pool doesn't have the autoreplace property set, then attempt - * a true online (without the unspare flag), which will trigger a FMA - * fault. + * If the pool doesn't have the autoreplace property set, then use + * vdev online to trigger a FMA fault by posting an ereport. */ - if (!is_dm && (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || - !wholedisk || physpath == NULL)) { + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || + !(wholedisk || is_dm) || (physpath == NULL)) { (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); - zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", - fullpath, libzfs_error_description(g_zfshdl)); + zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " + "not a whole disk for '%s'", fullpath); return; } @@ -291,12 +290,6 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL)) { - zed_log_msg(LOG_INFO, "%s: Autoreplace is not enabled on this" - " pool, ignore disk.", __func__); - return; - } - /* Only autoreplace bad disks */ if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && @@ -369,9 +362,13 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) found = B_TRUE; break; } + zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", + physpath, device->pd_physpath); } if (!found) { /* unexpected partition slice encountered */ + zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", + fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; @@ -656,14 +653,10 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). * * For disks, we only want to pay attention to vdevs marked as whole - * disks. For multipath devices does whole disk apply? (TBD). + * disks or are a multipath device. */ - if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) { - if (!is_slice) { - (void) devphys_iter(devpath, devid, zfs_process_add, - is_slice); - } - } + if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) + (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); return (0); } @@ -849,9 +842,9 @@ zfs_enum_pools(void *arg) * For now, each agent has it's own libzfs instance */ int -zfs_slm_init(libzfs_handle_t *zfs_hdl) +zfs_slm_init() { - if ((g_zfshdl = libzfs_init()) == NULL) + if ((g_zfshdl = __libzfs_init()) == NULL) return (-1); /* @@ -863,6 +856,7 @@ zfs_slm_init(libzfs_handle_t *zfs_hdl) if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { list_destroy(&g_pool_list); + __libzfs_fini(g_zfshdl); return (-1); } @@ -903,19 +897,12 @@ zfs_slm_fini() } list_destroy(&g_device_list); - libzfs_fini(g_zfshdl); + __libzfs_fini(g_zfshdl); } void zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) { - static pthread_mutex_t serialize = PTHREAD_MUTEX_INITIALIZER; - - /* - * Serialize incoming events from zfs or libudev sources - */ - (void) pthread_mutex_lock(&serialize); zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); (void) zfs_slm_deliver_event(class, subclass, nvl); - (void) pthread_mutex_unlock(&serialize); } diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index 64930a10b..80617df08 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -20,26 +20,623 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * The ZFS retire agent is responsible for managing hot spares across all pools. + * When we see a device fault or a device removal, we try to open the associated + * pool and look for any hot spares. We iterate over any available hot spares + * and attempt a 'zpool replace' for each one. + * + * For vdevs diagnosed as faulty, the agent is also responsible for proactively + * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). */ +#include <sys/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> +#include <libzfs.h> +#include <string.h> + #include "zfs_agents.h" -#include "../zed_log.h" +#include "fmd_api.h" -/*ARGSUSED*/ -void -zfs_retire_recv(nvlist_t *nvl, const char *class) + +typedef struct zfs_retire_repaired { + struct zfs_retire_repaired *zrr_next; + uint64_t zrr_pool; + uint64_t zrr_vdev; +} zfs_retire_repaired_t; + +typedef struct zfs_retire_data { + libzfs_handle_t *zrd_hdl; + zfs_retire_repaired_t *zrd_repaired; +} zfs_retire_data_t; + +static void +zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) { + zfs_retire_repaired_t *zrp; + + while ((zrp = zdp->zrd_repaired) != NULL) { + zdp->zrd_repaired = zrp->zrr_next; + fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); + } } -/*ARGSUSED*/ -int -zfs_retire_init(libzfs_handle_t *zfs_hdl) +/* + * Find a pool with a matching GUID. + */ +typedef struct find_cbdata { + uint64_t cb_guid; + const char *cb_fru; + zpool_handle_t *cb_zhp; + nvlist_t *cb_vdev; +} find_cbdata_t; + +static int +find_pool(zpool_handle_t *zhp, void *data) +{ + find_cbdata_t *cbp = data; + + if (cbp->cb_guid == + zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Find a vdev within a tree with a matching GUID. + */ +static nvlist_t * +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru, + uint64_t search_guid) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + nvlist_t *ret; + char *fru; + + if (search_fru != NULL) { + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 && + libzfs_fru_compare(zhdl, fru, search_fru)) + return (nv); + } else { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + guid == search_guid) { + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "matched vdev %llu", guid); + return (nv); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_fru, + search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_fru, + search_guid)) != NULL) + return (ret); + } + + return (NULL); +} + +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + find_cbdata_t cb; + zpool_handle_t *zhp; + nvlist_t *config, *nvroot; + + /* + * Find the corresponding pool and make sure the vdev still exists. + */ + cb.cb_guid = pool_guid; + if (zpool_iter(zhdl, find_pool, &cb) != 1) + return (NULL); + + zhp = cb.cb_zhp; + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (vdev_guid != 0) { + if ((*vdevp = find_vdev(zhdl, nvroot, NULL, + vdev_guid)) == NULL) { + zpool_close(zhp); + return (NULL); + } + } + + return (zhp); +} + +#ifdef _HAS_FMD_TOPO +static int +search_pool(zpool_handle_t *zhp, void *data) { + find_cbdata_t *cbp = data; + nvlist_t *config; + nvlist_t *nvroot; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (0); + } + + if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot, + cbp->cb_fru, 0)) != NULL) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); return (0); } +/* + * Given a FRU FMRI, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp) +{ + find_cbdata_t cb; + + cb.cb_fru = fru; + cb.cb_zhp = NULL; + if (zpool_iter(zhdl, search_pool, &cb) != 1) + return (NULL); + + *vdevp = cb.cb_vdev; + return (cb.cb_zhp); +} +#endif /* _HAS_FMD_TOPO */ + +/* + * Given a vdev, attempt to replace it with every known spare until one + * succeeds. + */ +static void +replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) +{ + nvlist_t *config, *nvroot, *replacement; + nvlist_t **spares; + uint_t s, nspares; + char *dev_name; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) + return; + + /* + * Find out if there are any hot spares available in the pool. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return; + + replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT); + + dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* + * Try to replace each spare, ending when we successfully + * replace it. + */ + for (s = 0; s < nspares; s++) { + char *spare_name; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &spare_name) != 0) + continue; + + (void) nvlist_add_nvlist_array(replacement, + ZPOOL_CONFIG_CHILDREN, &spares[s], 1); + + fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", + dev_name, basename(spare_name)); + + if (zpool_vdev_attach(zhp, dev_name, spare_name, + replacement, B_TRUE) == 0) + break; + } + + free(dev_name); + nvlist_free(replacement); +} + +/* + * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and + * ASRU is now usable. ZFS has found the device to be present and + * functioning. + */ +/*ARGSUSED*/ +static void +zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + zfs_retire_repaired_t *zrp; + uint64_t pool_guid, vdev_guid; +#ifdef _HAS_FMD_TOPO + nvlist_t *asru; +#endif + + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + return; + + /* + * Before checking the state of the ASRU, go through and see if we've + * already made an attempt to repair this ASRU. This list is cleared + * whenever we receive any kind of list event, and is designed to + * prevent us from generating a feedback loop when we attempt repairs + * against a faulted pool. The problem is that checking the unusable + * state of the ASRU can involve opening the pool, which can post + * statechange events but otherwise leave the pool in the faulted + * state. This list allows us to detect when a statechange event is + * due to our own request. + */ + for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { + if (zrp->zrr_pool == pool_guid && + zrp->zrr_vdev == vdev_guid) + return; + } + +#ifdef _HAS_FMD_TOPO + asru = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid); + (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid); + + /* + * We explicitly check for the unusable state here to make sure we + * aren't responding to a transient state change. As part of opening a + * vdev, it's possible to see the 'statechange' event, only to be + * followed by a vdev failure later. If we don't check the current + * state of the vdev (or pool) before marking it repaired, then we risk + * generating spurious repair events followed immediately by the same + * diagnosis. + * + * This assumes that the ZFS scheme code associated unusable (i.e. + * isolated) with its own definition of faulty state. In the case of a + * DEGRADED leaf vdev (due to checksum errors), this is not the case. + * This works, however, because the transient state change is not + * posted in this case. This could be made more explicit by not + * relying on the scheme's unusable callback and instead directly + * checking the vdev state, where we could correctly account for + * DEGRADED state. + */ + if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl, + asru, FMD_HAS_FAULT_ASRU, NULL)) { + topo_hdl_t *thp; + char *fmri = NULL; + int err; + + thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); + if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0) + (void) fmd_repair_asru(hdl, fmri); + fmd_hdl_topo_rele(hdl, thp); + + topo_hdl_strfree(thp, fmri); + } + nvlist_free(asru); +#endif + zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); + zrp->zrr_next = zdp->zrd_repaired; + zrp->zrr_pool = pool_guid; + zrp->zrr_vdev = vdev_guid; + zdp->zrd_repaired = zrp; + + fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", + vdev_guid, pool_guid); +} + /*ARGSUSED*/ +static void +zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, + const char *class) +{ + uint64_t pool_guid, vdev_guid; + zpool_handle_t *zhp; + nvlist_t *resource, *fault; + nvlist_t **faults; + uint_t f, nfaults; + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + libzfs_handle_t *zhdl = zdp->zrd_hdl; + boolean_t fault_device, degrade_device; + boolean_t is_repair; + char *scheme; + nvlist_t *vdev = NULL; + char *uuid; + int repair_done = 0; + boolean_t retire; + boolean_t is_disk; + vdev_aux_t aux; + uint64_t state = 0; + + fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); + + /* + * If this is a resource notifying us of device removal, then simply + * check for an available spare and continue. + */ + if (strcmp(class, "resource.fs.zfs.removed") == 0) { + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + return; + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + return; + + if (fmd_prop_get_int32(hdl, "spare_on_remove")) + replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + return; + } + + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + + /* + * Note: on zfsonlinux statechange events are more than just + * healthy ones so we need to confim the actual state value. + */ + if (strcmp(class, "resource.fs.zfs.statechange") == 0 && + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, + &state) == 0 && state == VDEV_STATE_HEALTHY) {; + zfs_vdev_repair(hdl, nvl); + return; + } + if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + zfs_vdev_repair(hdl, nvl); + return; + } + + zfs_retire_clear_data(hdl, zdp); + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) + is_repair = B_TRUE; + else + is_repair = B_FALSE; + + /* + * We subscribe to zfs faults as well as all repair events. + */ + if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, + &faults, &nfaults) != 0) + return; + + for (f = 0; f < nfaults; f++) { + fault = faults[f]; + + fault_device = B_FALSE; + degrade_device = B_FALSE; + is_disk = B_FALSE; + + if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, + &retire) == 0 && retire == 0) + continue; + + /* + * While we subscribe to fault.fs.zfs.*, we only take action + * for faults targeting a specific vdev (open failure or SERD + * failure). We also subscribe to fault.io.* events, so that + * faulty disks will be faulted in the ZFS configuration. + */ + if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { + fault_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.checksum")) { + degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.device")) { + fault_device = B_FALSE; + } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { + is_disk = B_TRUE; + fault_device = B_TRUE; + } else { + continue; + } + + if (is_disk) { +#ifdef _HAS_FMD_TOPO + /* + * This is a disk fault. Lookup the FRU, convert it to + * an FMRI string, and attempt to find a matching vdev. + */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU, + &fru) != 0 || + nvlist_lookup_string(fru, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) + continue; + + thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); + if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { + fmd_hdl_topo_rele(hdl, thp); + continue; + } + + zhp = find_by_fru(zhdl, fmri, &vdev); + topo_hdl_strfree(thp, fmri); + fmd_hdl_topo_rele(hdl, thp); + + if (zhp == NULL) + continue; + + (void) nvlist_lookup_uint64(vdev, + ZPOOL_CONFIG_GUID, &vdev_guid); + aux = VDEV_AUX_EXTERNAL; +#else + continue; +#endif + } else { + /* + * This is a ZFS fault. Lookup the resource, and + * attempt to find the matching vdev. + */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, + &resource) != 0 || + nvlist_lookup_string(resource, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, + &pool_guid) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, + &vdev_guid) != 0) { + if (is_repair) + vdev_guid = 0; + else + continue; + } + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + continue; + + aux = VDEV_AUX_ERR_EXCEEDED; + } + + if (vdev_guid == 0) { + /* + * For pool-level repair events, clear the entire pool. + */ + fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", + zpool_get_name(zhp)); + (void) zpool_clear(zhp, NULL, NULL); + zpool_close(zhp); + continue; + } + + /* + * If this is a repair event, then mark the vdev as repaired and + * continue. + */ + if (is_repair) { + repair_done = 1; + fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", + zpool_get_name(zhp), vdev_guid); + (void) zpool_vdev_clear(zhp, vdev_guid); + zpool_close(zhp); + continue; + } + + /* + * Actively fault the device if needed. + */ + if (fault_device) + (void) zpool_vdev_fault(zhp, vdev_guid, aux); + if (degrade_device) + (void) zpool_vdev_degrade(zhp, vdev_guid, aux); + + if (fault_device || degrade_device) + fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", + fault_device ? "fault" : "degrade", vdev_guid, + zpool_get_name(zhp)); + + /* + * Attempt to substitute a hot spare. + */ + replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + } + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && + nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) + fmd_case_uuresolved(hdl, uuid); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_retire_recv, /* fmdo_recv */ + NULL, /* fmdo_timeout */ + NULL, /* fmdo_close */ + NULL, /* fmdo_stats */ + NULL, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "spare_on_remove", FMD_TYPE_BOOL, "true" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props +}; + void -zfs_retire_fini(void) +_zfs_retire_init(fmd_hdl_t *hdl) { + zfs_retire_data_t *zdp; + libzfs_handle_t *zhdl; + + if ((zhdl = __libzfs_init()) == NULL) + return; + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + libzfs_fini(zhdl); + return; + } + + zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); + zdp->zrd_hdl = zhdl; + + fmd_hdl_setspecific(hdl, zdp); +} + +void +_zfs_retire_fini(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + + if (zdp != NULL) { + zfs_retire_clear_data(hdl, zdp); + __libzfs_fini(zdp->zrd_hdl); + fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); + } } diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh deleted file mode 120000 index f564f9322..000000000 --- a/cmd/zed/zed.d/checksum-spare.sh +++ /dev/null @@ -1 +0,0 @@ -io-spare.sh
\ No newline at end of file diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh deleted file mode 100755 index 1835cb4a3..000000000 --- a/cmd/zed/zed.d/io-spare.sh +++ /dev/null @@ -1,239 +0,0 @@ -#!/bin/sh -# -# Replace a device with a hot spare in response to IO or CHECKSUM errors. -# The following actions will be performed automatically when the number -# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or -# ZED_SPARE_ON_CHECKSUM_ERRORS. -# -# 1) FAULT the device on IO errors, no futher IO will be attempted. -# DEGRADE the device on checksum errors, the device is still -# functional and can be used to service IO requests. -# 2) Set the SES fault beacon for the device. -# 3) Replace the device with a hot spare if any are available. -# -# Once the hot sparing operation is complete either the failed device or -# the hot spare must be manually retired using the 'zpool detach' command. -# The 'autoreplace' functionality which would normally take care of this -# under Illumos has not yet been implemented. -# -# Full support for autoreplace is planned, but it requires that the full -# ZFS Diagnosis Engine be ported. In the meanwhile this script provides -# the majority of the expected hot spare functionality. -# -# Exit codes: -# 0: hot spare replacement successful -# 1: hot spare device not available -# 2: hot sparing disabled or threshold not reached -# 3: device already faulted or degraded -# 9: internal error - -[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" -. "${ZED_ZEDLET_DIR}/zed-functions.sh" - -# Disabled by default. Enable in the zed.rc file. -: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}" -: "${ZED_SPARE_ON_IO_ERRORS:=0}" - - -# query_vdev_status (pool, vdev) -# -# Given a [pool] and [vdev], return the matching vdev path & status on stdout. -# -# Warning: This function does not handle the case of [pool] or [vdev] -# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor. -# -# Arguments -# pool: pool name -# vdev: virtual device name -# -# StdOut -# arg1: vdev pathname -# arg2: vdev status -# -query_vdev_status() -{ - local pool="$1" - local vdev="$2" - local t - - vdev="$(basename -- "${vdev}")" - ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return - t="$(printf '\t')" - - "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \ - "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \ - | tail -1 -} - - -# notify (old_vdev, new_vdev, num_errors) -# -# Send a notification regarding the hot spare replacement. -# -# Arguments -# old_vdev: path of old vdev that has failed -# new_vdev: path of new vdev used as the hot spare replacement -# num_errors: number of errors that triggered this replacement -# -notify() -{ - local old_vdev="$1" - local new_vdev="$2" - local num_errors="$3" - local note_subject - local note_pathname - local s - local rv - - umask 077 - note_subject="ZFS hot spare replacement for ${ZEVENT_POOL} on $(hostname)" - note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" - { - [ "${num_errors}" -ne 1 ] 2>/dev/null && s="s" - - echo "ZFS has replaced a failing device with a hot spare after" \ - "${num_errors} ${ZEVENT_SUBCLASS} error${s}:" - echo - echo " eid: ${ZEVENT_EID}" - echo " class: ${ZEVENT_SUBCLASS}" - echo " host: $(hostname)" - echo " time: ${ZEVENT_TIME_STRING}" - echo " old: ${old_vdev}" - echo " new: ${new_vdev}" - - "${ZPOOL}" status "${ZEVENT_POOL}" - - } > "${note_pathname}" - - zed_notify "${note_subject}" "${note_pathname}"; rv=$? - rm -f "${note_pathname}" - return "${rv}" -} - - -# main -# -# Arguments -# none -# -# Return -# see above -# -main() -{ - local num_errors - local action - local lockfile - local vdev_path - local vdev_status - local spare - local spare_path - local spare_status - local zpool_err - local zpool_rv - local rv - - # Avoid hot-sparing a hot-spare. - # - # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare. - # - [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2 - - [ -n "${ZEVENT_POOL}" ] || exit 9 - [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9 - [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 - - zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9 - - # Fault the device after a given number of I/O errors. - # - if [ "${ZEVENT_SUBCLASS}" = "io" ]; then - if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then - num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS)) - [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \ - && action="fault" - fi 2>/dev/null - - # Degrade the device after a given number of checksum errors. - # - elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then - if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then - num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}" - [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \ - && action="degrade" - fi 2>/dev/null - - else - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 - fi - - # Error threshold not reached. - # - if [ -z "${action}" ]; then - exit 2 - fi - - lockfile="zed.spare.lock" - zed_lock "${lockfile}" - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}") - vdev_path="$1" - vdev_status="$2" - - # Device is already FAULTED or DEGRADED. - # - if [ "${vdev_status}" = "FAULTED" ] \ - || [ "${vdev_status}" = "DEGRADED" ]; then - rv=3 - - else - rv=1 - - # 1) FAULT or DEGRADE the device. - # - "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}" - - # 2) Set the SES fault beacon. - # - # TODO: Set the 'fault' or 'ident' beacon for the device. This can - # be done through the sg_ses utility. The only hard part is to map - # the sd device to its corresponding enclosure and slot. We may - # be able to leverage the existing vdev_id scripts for this. - # - # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 - # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 - - # 3) Replace the device with a hot spare. - # - # Round-robin through the spares trying those that are available. - # - for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}") - spare_path="$1" - spare_status="$2" - - [ "${spare_status}" = "AVAIL" ] || continue - - zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \ - "${ZEVENT_VDEV_GUID}" "${spare_path}" 2>&1)"; zpool_rv=$? - - if [ "${zpool_rv}" -ne 0 ]; then - [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}" - else - notify "${vdev_path}" "${spare_path}" "${num_errors}" - rv=0 - break - fi - done - fi - - zed_unlock "${lockfile}" - exit "${rv}" -} - - -main "$@" diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index b5f57508e..10f24c996 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -80,7 +80,7 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); - (void) zfs_slm_event(class, subclass, nvl); + (void) zfs_agent_post_event(class, subclass, nvl); } /* @@ -213,8 +213,6 @@ zed_udev_monitor(void *arg) strcmp(type, "disk") == 0 && part != NULL && part[0] != '\0') { /* skip and wait for partition event */ - zed_log_msg(LOG_INFO, "zed_udev_monitor: %s waiting " - "for slice", udev_device_get_devnode(dev)); udev_device_unref(dev); continue; } @@ -297,12 +295,19 @@ zed_udev_monitor(void *arg) * dev are the same name (i.e. /dev/dm-5), then * there is no real underlying disk for this * multipath device, and so this "change" event - * really a multipath removal. + * really is a multipath removal. */ class = EC_DEV_ADD; subclass = ESC_DISK; } else { - /* multipath remove, ignore it. */ + tmp = (char *) + udev_device_get_property_value(dev, + "DM_NR_VALID_PATHS"); + /* treat as a multipath remove */ + if (tmp != NULL && strcmp(tmp, "0") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } } free(tmp2); } diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index 2c97b7115..c614f4fe8 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -55,12 +55,8 @@ zed_event_init(struct zed_conf *zcp) zed_log_die("Failed to open \"%s\": %s", ZFS_DEV, strerror(errno)); - if (zfs_slm_init(zcp->zfs_hdl) != 0) - zed_log_die("Failed to initialize zfs slm"); - if (zfs_diagnosis_init(zcp->zfs_hdl) != 0) - zed_log_die("Failed to initialize zfs diagnosis"); - if (zfs_retire_init(zcp->zfs_hdl) != 0) - zed_log_die("Failed to initialize zfs retire"); + zfs_agent_init(zcp->zfs_hdl); + if (zed_disk_event_init() != 0) zed_log_die("Failed to initialize disk events"); } @@ -75,9 +71,7 @@ zed_event_fini(struct zed_conf *zcp) zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); zed_disk_event_fini(); - zfs_retire_fini(); - zfs_diagnosis_fini(); - zfs_slm_fini(); + zfs_agent_fini(); if (zcp->zevent_fd >= 0) { if (close(zcp->zevent_fd) < 0) @@ -832,17 +826,6 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) } } -static void -_zed_internal_event(const char *class, nvlist_t *nvl) -{ - /* - * NOTE: only vdev check is handled for now - */ - if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { - (void) zfs_slm_event("EC_zfs", "ESC_ZFS_vdev_check", nvl); - } -} - /* * Service the next zevent, blocking until one is available. */ @@ -894,7 +877,7 @@ zed_event_service(struct zed_conf *zcp) "Failed to lookup zevent class (eid=%llu)", eid); } else { /* let internal modules see this event first */ - _zed_internal_event(class, nvl); + zfs_agent_post_event(class, NULL, nvl); zsp = zed_strings_create(); diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c index 1a3b76d07..36ca9f345 100644 --- a/cmd/zed/zed_exec.c +++ b/cmd/zed/zed_exec.c @@ -20,6 +20,7 @@ #include <string.h> #include <sys/stat.h> #include <sys/wait.h> +#include <time.h> #include <unistd.h> #include "zed_file.h" #include "zed_log.h" @@ -115,19 +116,39 @@ _zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, zed_file_close_from(ZEVENT_FILENO + 1); execle(path, prog, NULL, env); _exit(127); - } else { - zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", - prog, eid, pid); - /* FIXME: Timeout rogue child processes with sigalarm? */ -restart: - wpid = waitpid(pid, &status, 0); + } + + /* parent process */ + + zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", + prog, eid, pid); + + /* FIXME: Timeout rogue child processes with sigalarm? */ + + /* + * Wait for child process using WNOHANG to limit + * the time spent waiting to 10 seconds (10,000ms). + */ + for (n = 0; n < 1000; n++) { + wpid = waitpid(pid, &status, WNOHANG); if (wpid == (pid_t) -1) { if (errno == EINTR) - goto restart; + continue; zed_log_msg(LOG_WARNING, "Failed to wait for \"%s\" eid=%llu pid=%d", prog, eid, pid); - } else if (WIFEXITED(status)) { + break; + } else if (wpid == 0) { + struct timespec t; + + /* child still running */ + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + (void) nanosleep(&t, NULL); + continue; + } + + if (WIFEXITED(status)) { zed_log_msg(LOG_INFO, "Finished \"%s\" eid=%llu pid=%d exit=%d", prog, eid, pid, WEXITSTATUS(status)); @@ -141,6 +162,16 @@ restart: "Finished \"%s\" eid=%llu pid=%d status=0x%X", prog, eid, (unsigned int) status); } + break; + } + + /* + * kill child process after 10 seconds + */ + if (wpid == 0) { + zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d", + prog, pid); + (void) kill(pid, SIGKILL); } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c51ed43e6..db44d2ae1 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3373,6 +3373,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { + /* + * Since vdev_offline() code path is already in an offline + * state we can miss a statechange event to OFFLINE. Check + * the previous state to catch this condition. + */ + if (vd->vdev_ops->vdev_op_leaf && + (state == VDEV_STATE_OFFLINE) && + (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { + /* post an offline state change */ + zfs_post_state_change(spa, vd, vd->vdev_prevstate); + } vd->vdev_stat.vs_aux = aux; return; } |