diff options
-rw-r--r-- | cmd/zed/Makefile.am | 12 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_agents.h | 57 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_diagnosis.c | 47 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_mod.c | 879 | ||||
-rw-r--r-- | cmd/zed/agents/zfs_retire.c | 45 | ||||
-rwxr-xr-x | cmd/zed/zed.d/all-syslog.sh | 3 | ||||
-rw-r--r-- | cmd/zed/zed_disk_event.c | 367 | ||||
-rw-r--r-- | cmd/zed/zed_disk_event.h | 31 | ||||
-rw-r--r-- | cmd/zed/zed_event.c | 43 | ||||
-rw-r--r-- | cmd/zpool/zpool_main.c | 16 | ||||
-rw-r--r-- | include/libzfs.h | 7 | ||||
-rw-r--r-- | include/sys/fm/fs/zfs.h | 2 | ||||
-rw-r--r-- | include/sys/spa.h | 2 | ||||
-rw-r--r-- | include/sys/sysevent/Makefile.am | 3 | ||||
-rw-r--r-- | include/sys/sysevent/dev.h | 261 | ||||
-rw-r--r-- | lib/libzfs/libzfs_import.c | 16 | ||||
-rw-r--r-- | lib/libzfs/libzfs_pool.c | 10 | ||||
-rw-r--r-- | module/zfs/vdev.c | 27 | ||||
-rw-r--r-- | module/zfs/zfs_fm.c | 42 | ||||
-rwxr-xr-x | scripts/zconfig.sh | 6 |
20 files changed, 1837 insertions, 39 deletions
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index 20e8bf1dc..086d75d36 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -8,11 +8,13 @@ EXTRA_DIST = zed.d/README sbin_PROGRAMS = zed -zed_SOURCES = \ +ZED_SRC = \ zed.c \ zed.h \ zed_conf.c \ zed_conf.h \ + zed_disk_event.c \ + zed_disk_event.h \ zed_event.c \ zed_event.h \ zed_exec.c \ @@ -24,6 +26,14 @@ zed_SOURCES = \ zed_strings.c \ zed_strings.h +FMA_SRC = \ + agents/zfs_agents.h \ + agents/zfs_diagnosis.c \ + agents/zfs_mod.c \ + agents/zfs_retire.c + +zed_SOURCES = $(ZED_SRC) $(FMA_SRC) + zed_LDADD = \ $(top_builddir)/lib/libavl/libavl.la \ $(top_builddir)/lib/libnvpair/libnvpair.la \ diff --git a/cmd/zed/agents/zfs_agents.h b/cmd/zed/agents/zfs_agents.h new file mode 100644 index 000000000..4630f2212 --- /dev/null +++ b/cmd/zed/agents/zfs_agents.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef ZFS_AGENTS_H +#define ZFS_AGENTS_H + +#include <libzfs.h> +#include <libnvpair.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Agents from ZFS FMA and syseventd - linked directly into ZED daemon binary + */ + +/* + * ZFS Sysevent Linkable Module (SLM) + */ +extern int zfs_slm_init(libzfs_handle_t *zfs_hdl); +extern void zfs_slm_fini(void); +extern void zfs_slm_event(const char *, const char *, nvlist_t *); + +/* + * ZFS FMA Retire Agent + */ +extern int zfs_retire_init(libzfs_handle_t *zfs_hdl); +extern void zfs_retire_fini(void); +extern void zfs_retire_recv(nvlist_t *nvl, const char *class); + +/* + * ZFS FMA Diagnosis Engine + */ +extern int zfs_diagnosis_init(libzfs_handle_t *zfs_hdl); +extern void zfs_diagnosis_fini(void); +extern void zfs_diagnosis_recv(nvlist_t *nvl, const char *class); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZFS_AGENTS_H */ diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c new file mode 100644 index 000000000..4d534a4d3 --- /dev/null +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. 
All rights reserved. + */ + +#include "zfs_agents.h" +#include "../zed_log.h" + + +/*ARGSUSED*/ +void +zfs_diagnosis_recv(nvlist_t *nvl, const char *class) +{ +} + +/*ARGSUSED*/ +int +zfs_diagnosis_init(libzfs_handle_t *zfs_hdl) +{ + return (0); +} + +/*ARGSUSED*/ +void +zfs_diagnosis_fini(void) +{ +} diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c new file mode 100644 index 000000000..c8326f21f --- /dev/null +++ b/cmd/zed/agents/zfs_mod.c @@ -0,0 +1,879 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * ZFS syseventd module. + * + * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c + * + * The purpose of this module is to identify when devices are added to the + * system, and appropriately online or replace the affected vdevs. + * + * When a device is added to the system: + * + * 1. Search for any vdevs whose devid matches that of the newly added + * device. + * + * 2. 
If no vdevs are found, then search for any vdevs whose udev path + * matches that of the new device. + * + * 3. If no vdevs match by either method, then ignore the event. + * + * 4. Attempt to online the device with a flag to indicate that it should + * be unspared when resilvering completes. If this succeeds, then the + * same device was inserted and we should continue normally. + * + * 5. If the pool does not have the 'autoreplace' property set, attempt to + * online the device again without the unspare flag, which will + * generate a FMA fault. + * + * 6. If the pool has the 'autoreplace' property set, and the matching vdev + * is a whole disk, then label the new disk and attempt a 'zpool + * replace'. + * + * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK + * event indicates that a device failed to open during pool load, but the + * autoreplace property was set. In this case, we deferred the associated + * FMA fault until our module had a chance to process the autoreplace logic. + * If the device could not be replaced, then the second online attempt will + * trigger the FMA fault that we skipped earlier. + * + * ZFS on Linux porting notes: + * In lieu of a thread pool, just spawn a thread on demand.
+ * Linux udev provides a disk insert for both the disk and the partition + * + */ + +#include <ctype.h> +#include <devid.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libzfs.h> +#include <limits.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <pthread.h> +#include <unistd.h> +#include "zfs_agents.h" +#include "../zed_log.h" + +#define DEV_BYID_PATH "/dev/disk/by-id/" +#define DEV_BYPATH_PATH "/dev/disk/by-path/" + +typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); + +libzfs_handle_t *g_zfshdl; +list_t g_pool_list; /* list of unavailable pools at initialization */ +list_t g_device_list; /* list of disks with asynchronous label request */ +boolean_t g_enumeration_done; +pthread_t g_zfs_tid; + +typedef struct unavailpool { + zpool_handle_t *uap_zhp; + pthread_t uap_enable_tid; /* dataset enable thread if activated */ + list_node_t uap_node; +} unavailpool_t; + +typedef struct pendingdev { + char pd_physpath[128]; + list_node_t pd_node; +} pendingdev_t; + +static int +zfs_toplevel_state(zpool_handle_t *zhp) +{ + nvlist_t *nvroot; + vdev_stat_t *vs; + unsigned int c; + + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + return (vs->vs_state); +} + +static int +zfs_unavail_pool(zpool_handle_t *zhp, void *data) +{ + zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", + zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); + + if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { + unavailpool_t *uap; + uap = malloc(sizeof (unavailpool_t)); + uap->uap_zhp = zhp; + uap->uap_enable_tid = 0; + list_insert_tail((list_t *)data, uap); + } else { + zpool_close(zhp); + } + return (0); +} + +/* + * Two stage replace on 
Linux + * since we get disk notifications + * we can wait for partitioned disk slice to show up! + * + * First stage tags the disk, initiates async partitioning, and returns + * Second stage finds the tag and proceeds to ZFS labeling/replace + * + * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach + * + * 1. physical match with no fs, no partition + * tag it top, partition disk + * + * 2. physical match again, see partition and tag + * + */ + +/* + * The device associated with the given vdev (either by devid or physical path) + * has been added to the system. If 'isdisk' is set, then we only attempt a + * replacement if it's a whole disk. This also implies that we should label the + * disk first. + * + * First, we attempt to online the device (making sure to undo any spare + * operation when finished). If this succeeds, then we're done. If it fails, + * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, + * but that the label was not what we expected. If the 'autoreplace' property + * is set, then we relabel the disk (if specified), and attempt a 'zpool + * replace'. If the online is successful, but the new state is something else + * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of + * race, and we should avoid attempting to relabel the disk.
+ * + * Also can arrive here from a ESC_ZFS_VDEV_CHECK event + */ +static void +zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) +{ + char *path; + vdev_state_t newstate; + nvlist_t *nvroot, *newvd; + pendingdev_t *device; + uint64_t wholedisk = 0ULL; + uint64_t offline = 0ULL; + uint64_t guid = 0ULL; + char *physpath = NULL, *new_devid = NULL; + char rawpath[PATH_MAX], fullpath[PATH_MAX]; + char devpath[PATH_MAX]; + int ret; + + if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) + return; + + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); + + if (offline) + return; /* don't intervene if it was taken offline */ + + zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s' (%llu)", + zpool_get_name(zhp), path, (long long unsigned int)guid); + + /* + * The VDEV guid is preferred for identification (gets passed in path) + */ + if (guid != 0) { + (void) snprintf(fullpath, sizeof (fullpath), "%llu", + (long long unsigned int)guid); + } else { + /* + * otherwise use path sans partition suffix for whole disks + */ + (void) strlcpy(fullpath, path, sizeof (fullpath)); + if (wholedisk) { + char *spath = zfs_strip_partition(g_zfshdl, fullpath); + + (void) strlcpy(fullpath, spath, sizeof (fullpath)); + free(spath); + } + } + + /* + * Attempt to online the device. + */ + if (zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && + (newstate == VDEV_STATE_HEALTHY || + newstate == VDEV_STATE_DEGRADED)) { + zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s", + fullpath, (newstate == VDEV_STATE_HEALTHY) ? 
+ "HEALTHY" : "DEGRADED"); + return; + } + + /* + * If the pool doesn't have the autoreplace property set, then attempt + * a true online (without the unspare flag), which will trigger a FMA + * fault. + */ + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || + !wholedisk || physpath == NULL) { + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", + fullpath, libzfs_error_description(g_zfshdl)); + return; + } + + /* + * convert physical path into its current device node + */ + (void) snprintf(rawpath, sizeof (rawpath), "%s%s", DEV_BYPATH_PATH, + physpath); + if (realpath(rawpath, devpath) == NULL) { + zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", + rawpath, strerror(errno)); + + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", + fullpath, libzfs_error_description(g_zfshdl)); + return; + } + + /* + * we're auto-replacing a raw disk, so label it first + */ + if (!labeled) { + char *leafname; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. Before we can label the disk, we need + * to map the physical string that was matched on to the under + * lying device node. + * + * If any part of this process fails, then do a force online + * to trigger a ZFS fault for the device (and any hot spare + * replacement). + */ + leafname = strrchr(devpath, '/') + 1; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. + */ + if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { + zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + "label '%s' (%s)", leafname, + libzfs_error_description(g_zfshdl)); + + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + /* + * The disk labeling is asynchronous on Linux. 
Just record + * this label request and return as there will be another + * disk add event for the partition after the labeling is + * completed. + */ + device = malloc(sizeof (pendingdev_t)); + (void) strlcpy(device->pd_physpath, physpath, + sizeof (device->pd_physpath)); + list_insert_tail(&g_device_list, device); + + zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + leafname, (long long unsigned int)guid); + + return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ + + } else /* labeled */ { + boolean_t found = B_FALSE; + /* + * match up with request above to label the disk + */ + for (device = list_head(&g_device_list); device != NULL; + device = list_next(&g_device_list, device)) { + if (strcmp(physpath, device->pd_physpath) == 0) { + list_remove(&g_device_list, device); + free(device); + found = B_TRUE; + break; + } + } + if (!found) { + /* unexpected partition slice encountered */ + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", + physpath, (long long unsigned int)guid); + + if (nvlist_lookup_string(vdev, "new_devid", &new_devid) != 0) { + zed_log_msg(LOG_INFO, " auto replace: missing devid!"); + return; + } + + (void) snprintf(devpath, sizeof (devpath), "%s%s", + DEV_BYID_PATH, new_devid); + path = devpath; + } + + /* + * Construct the root vdev to pass to zpool_vdev_attach(). While adding + * the entire vdev structure is harmless, we construct a reduced set of + * path/physpath/wholedisk to keep it simple. 
+ */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + return; + } + if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + nvlist_free(nvroot); + return; + } + + if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || + (physpath != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || + nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || + nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, + 1) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); + nvlist_free(newvd); + nvlist_free(nvroot); + return; + } + + nvlist_free(newvd); + + /* + * auto replace a leaf disk at same physical location + */ + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); + + zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + fullpath, path, (ret == 0) ? "no errors" : + libzfs_error_description(g_zfshdl)); + + nvlist_free(nvroot); +} + +/* + * Utility functions to find a vdev matching given criteria. + */ +typedef struct dev_data { + const char *dd_compare; + const char *dd_prop; + zfs_process_func_t dd_func; + boolean_t dd_found; + boolean_t dd_islabeled; + uint64_t dd_pool_guid; + uint64_t dd_vdev_guid; + const char *dd_new_devid; +} dev_data_t; + +static void +zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) +{ + dev_data_t *dp = data; + char *path; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. 
+ */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + return; + } + + /* once a vdev was matched and processed there is nothing left to do */ + if (dp->dd_found) + return; + + /* + * Match by GUID if available otherwise fallback to devid or physical + */ + if (dp->dd_vdev_guid != 0) { + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid != dp->dd_vdev_guid) { + return; + } + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); + dp->dd_found = B_TRUE; + + } else if (dp->dd_compare != NULL) { + /* + * NOTE: On Linux there is an event for partition, so unlike + * illumos, substring matching is not required to accommodate + * the partition suffix. An exact match will be present in + * the dp->dd_compare value. + */ + if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || + strcmp(dp->dd_compare, path) != 0) { + return; + } + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", + dp->dd_prop, path); + dp->dd_found = B_TRUE; + + /* pass the new devid for use by replacing code */ + if (dp->dd_islabeled && dp->dd_new_devid != NULL) { + (void) nvlist_add_string(nvl, "new_devid", + dp->dd_new_devid); + } + } + + (dp->dd_func)(zhp, nvl, dp->dd_islabeled); +} + +static void * +zfs_enable_ds(void *arg) +{ + unavailpool_t *pool = (unavailpool_t *)arg; + + assert(pool->uap_enable_tid = pthread_self()); + + (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); + zpool_close(pool->uap_zhp); + pool->uap_zhp = NULL; + + /* Note: zfs_slm_fini() will cleanup this pool entry on exit */ + return (NULL); +} + +static int +zfs_iter_pool(zpool_handle_t *zhp, void *data) +{ + nvlist_t *config, *nvl; + dev_data_t *dp = data; + uint64_t pool_guid; + unavailpool_t *pool; + + zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", + zpool_get_name(zhp), dp->dd_vdev_guid ?
"GUID" : dp->dd_prop); + + /* + * For each vdev in this pool, look for a match to apply dd_func + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (dp->dd_pool_guid == 0 || + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { + (void) nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvl); + zfs_iter_vdev(zhp, nvl, data); + } + } + + /* + * if this pool was originally unavailable, + * then enable its datasets asynchronously + */ + if (g_enumeration_done) { + for (pool = list_head(&g_pool_list); pool != NULL; + pool = list_next(&g_pool_list, pool)) { + + if (pool->uap_enable_tid != 0) + continue; /* entry already processed */ + if (strcmp(zpool_get_name(zhp), + zpool_get_name(pool->uap_zhp))) + continue; + if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { + /* send to a background thread; keep on list */ + (void) pthread_create(&pool->uap_enable_tid, + NULL, zfs_enable_ds, pool); + break; + } + } + } + + zpool_close(zhp); + return (dp->dd_found); /* cease iteration after a match */ +} + +/* + * Given a physical device location, iterate over all + * (pool, vdev) pairs which correspond to that location. + */ +static boolean_t +devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, + boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = physical; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; /* used by auto replace code */ + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Given a device identifier, find any vdevs with a matching devid. + * On Linux we can match devid directly which is always a whole disk. 
+ */ +static boolean_t +devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = devid; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_DEVID; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Handle a EC_DEV_ADD.ESC_DISK event. + * + * illumos + * Expects: DEV_PHYS_PATH string in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/dsk/c0t1d0s0' (persistent) + * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' + * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' + * + * linux + * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/sdc1' (not persistent) + * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' + */ +static int +zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) +{ + char *devpath = NULL, *devid; + boolean_t is_slice; + + /* + * Expecting a devid string and an optional physical location + */ + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) + return (-1); + + (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); + + zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s)", devid, + devpath ? devpath : "NULL"); + + is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); + + /* + * Iterate over all vdevs looking for a match in the following order: + * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) + * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). + * + * For disks, we only want to pay attention to vdevs marked as whole + * disks. For multipath devices does whole disk apply? (TBD).
+ */ + if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) { + if (!is_slice) { + (void) devphys_iter(devpath, devid, zfs_process_add, + is_slice); + } + } + + return (0); +} + +/* + * Called when we receive a VDEV_CHECK event, which indicates a device could not + * be opened during initial pool open, but the autoreplace property was set on + * the pool. In this case, we treat it as if it were an add event. + */ +static int +zfs_deliver_check(nvlist_t *nvl) +{ + dev_data_t data = { 0 }; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, + &data.dd_pool_guid) != 0 || + nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, + &data.dd_vdev_guid) != 0 || + data.dd_vdev_guid == 0) + return (0); + + zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", + data.dd_pool_guid, data.dd_vdev_guid); + + data.dd_func = zfs_process_add; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (0); +} + +static int +zfsdle_vdev_online(zpool_handle_t *zhp, void *data) +{ + char *devname = data; + boolean_t avail_spare, l2cache; + vdev_state_t newstate; + nvlist_t *tgt; + + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", + devname, zpool_get_name(zhp)); + + if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, + &avail_spare, &l2cache, NULL)) != NULL) { + char *path, fullpath[MAXPATHLEN]; + uint64_t wholedisk = 0ULL; + + verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, + &path) == 0); + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) == 0); + + (void) strlcpy(fullpath, path, sizeof (fullpath)); + if (wholedisk) { + char *spath = zfs_strip_partition(g_zfshdl, fullpath); + + (void) strlcpy(fullpath, spath, sizeof (fullpath)); + free(spath); + + /* + * We need to reopen the pool associated with this + * device so that the kernel can update the size + * of the expanded device. 
+ */ + (void) zpool_reopen(zhp); + } + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: setting " + "device '%s' to ONLINE state in pool '%s'", + fullpath, zpool_get_name(zhp)); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) + (void) zpool_vdev_online(zhp, fullpath, 0, + &newstate); + } + zpool_close(zhp); + return (1); + } + zpool_close(zhp); + return (0); +} + +/* + * This function handles the ESC_DEV_DLE event. + */ +static int +zfs_deliver_dle(nvlist_t *nvl) +{ + char *devname; + + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) { + zed_log_msg(LOG_INFO, "zfs_deliver_event: no physpath"); + return (-1); + } + + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) { + zed_log_msg(LOG_INFO, "zfs_deliver_event: device '%s' not " + "found", devname); + return (1); + } + return (0); +} + +/* + * syseventd daemon module event handler + * + * Handles syseventd daemon zfs device related events: + * + * EC_DEV_ADD.ESC_DISK + * EC_DEV_STATUS.ESC_DEV_DLE + * EC_ZFS.ESC_ZFS_VDEV_CHECK + * + * Note: assumes only one thread active at a time (not thread safe) + */ +static int +zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + int ret; + boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE; + + if (strcmp(class, EC_DEV_ADD) == 0) { + /* + * We're mainly interested in disk additions, but we also listen + * for new loop devices, to allow for simplified testing. + */ + if (strcmp(subclass, ESC_DISK) == 0) + is_lofi = B_FALSE; + else if (strcmp(subclass, ESC_LOFI) == 0) + is_lofi = B_TRUE; + else + return (0); + + is_check = B_FALSE; + } else if (strcmp(class, EC_ZFS) == 0 && + strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { + /* + * This event signifies that a device failed to open + * during pool load, but the 'autoreplace' property was + * set, so we should pretend it's just been added. 
+ */ + is_check = B_TRUE; + } else if (strcmp(class, EC_DEV_STATUS) == 0 && + strcmp(subclass, ESC_DEV_DLE) == 0) { + is_dle = B_TRUE; + } else { + return (0); + } + + if (is_dle) + ret = zfs_deliver_dle(nvl); + else if (is_check) + ret = zfs_deliver_check(nvl); + else + ret = zfs_deliver_add(nvl, is_lofi); + + return (ret); +} + +/*ARGSUSED*/ +static void * +zfs_enum_pools(void *arg) +{ + (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); + /* + * Linux - instead of using a thread pool, each list entry + * will spawn a thread when an unavailable pool transitions + * to available. zfs_slm_fini will wait for these threads. + */ + g_enumeration_done = B_TRUE; + return (NULL); +} + +/* + * called from zed daemon at startup + * + * sent messages from zevents or udev monitor + * + * For now, each agent has its own libzfs instance + */ +int +zfs_slm_init(libzfs_handle_t *zfs_hdl) +{ + if ((g_zfshdl = libzfs_init()) == NULL) + return (-1); + + /* + * collect a list of unavailable pools (asynchronously, + * since this can take a while) + */ + list_create(&g_pool_list, sizeof (struct unavailpool), + offsetof(struct unavailpool, uap_node)); + + if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { + list_destroy(&g_pool_list); + return (-1); + } + + list_create(&g_device_list, sizeof (struct pendingdev), + offsetof(struct pendingdev, pd_node)); + + return (0); +} + +void +zfs_slm_fini() +{ + unavailpool_t *pool; + pendingdev_t *device; + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_zfs_tid, NULL); + + while ((pool = (list_head(&g_pool_list))) != NULL) { + /* + * each pool entry has two possibilities + * 1. was made available (so wait for zfs_enable_ds thread) + * 2.
still unavailable (just close the pool) + */ + if (pool->uap_enable_tid) + (void) pthread_join(pool->uap_enable_tid, NULL); + else if (pool->uap_zhp != NULL) + zpool_close(pool->uap_zhp); + + list_remove(&g_pool_list, pool); + free(pool); + } + list_destroy(&g_pool_list); + + while ((device = (list_head(&g_device_list))) != NULL) { + list_remove(&g_device_list, device); + free(device); + } + list_destroy(&g_device_list); + + libzfs_fini(g_zfshdl); +} + +void +zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + static pthread_mutex_t serialize = PTHREAD_MUTEX_INITIALIZER; + + /* + * Serialize incoming events from zfs or libudev sources + */ + (void) pthread_mutex_lock(&serialize); + zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); + (void) zfs_slm_deliver_event(class, subclass, nvl); + (void) pthread_mutex_unlock(&serialize); +} diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c new file mode 100644 index 000000000..64930a10b --- /dev/null +++ b/cmd/zed/agents/zfs_retire.c @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include "zfs_agents.h" +#include "../zed_log.h" + +/*ARGSUSED*/ +void +zfs_retire_recv(nvlist_t *nvl, const char *class) +{ +} + +/*ARGSUSED*/ +int +zfs_retire_init(libzfs_handle_t *zfs_hdl) +{ + return (0); +} + +/*ARGSUSED*/ +void +zfs_retire_fini(void) +{ +} diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh index b34d17cef..5a3c8ad4d 100755 --- a/cmd/zed/zed.d/all-syslog.sh +++ b/cmd/zed/zed.d/all-syslog.sh @@ -6,5 +6,6 @@ . "${ZED_ZEDLET_DIR}/zed-functions.sh" zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ - "${ZEVENT_POOL:+"pool=${ZEVENT_POOL}"}" + "${ZEVENT_POOL:+"pool=${ZEVENT_POOL}"}" \ + "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}" exit 0 diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c new file mode 100644 index 000000000..0360bb584 --- /dev/null +++ b/cmd/zed/zed_disk_event.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifdef HAVE_LIBUDEV + +#include <errno.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libudev.h> +#include <libzfs.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> + +#include "zed_log.h" +#include "zed_disk_event.h" +#include "agents/zfs_agents.h" + +/* + * Portions of ZED need to see disk events for disks belonging to ZFS pools. + * A libudev monitor is established to monitor block device actions and pass + * them on to internal ZED logic modules. 
Initially, zfs_mod.c is the only + * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM + * module responsible for handling disk events for ZFS. + */ + +pthread_t g_mon_tid; +struct udev *g_udev; +struct udev_monitor *g_mon; + + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* 64MB is minimum usable disk for ZFS */ +#define MINIMUM_SECTORS 131072 + + +/* + * Post disk event to SLM module + * + * occurs in the context of monitor thread + */ +static void +zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + char *strval; + uint64_t numval; + + zed_log_msg(LOG_INFO, "zed_disk_event:"); + zed_log_msg(LOG_INFO, "\tclass: %s", class); + zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); + if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); + if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); + if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); + + (void) zfs_slm_event(class, subclass, nvl); +} + +/* + * dev_event_nvlist: place event schema into an nv pair list + * + * NAME VALUE (example) + * -------------- -------------------------------------------------------- + * DEV_NAME /dev/sdl + * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... 
+ * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC + * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 + * DEV_IS_PART --- + * DEV_SIZE 500107862016 + * ZFS_EV_POOL_GUID 17523635698032189180 + * ZFS_EV_VDEV_GUID 14663607734290803088 + */ +static nvlist_t * +dev_event_nvlist(struct udev_device *dev) +{ + nvlist_t *nvl; + char strval[128]; + const char *value, *path; + uint64_t guid; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); + if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); + if ((path = udev_device_get_devnode(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_NAME, path); + if ((value = udev_device_get_devpath(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_PATH, value); + value = udev_device_get_devtype(dev); + if ((value != NULL && strcmp("partition", value) == 0) || + (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") + != NULL)) { + (void) nvlist_add_boolean(nvl, DEV_IS_PART); + } + if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + } + + /* + * Grab the pool and vdev guids from blkid cache + */ + value = udev_device_get_property_value(dev, "ID_FS_UUID"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); + + value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); + + /* + * Either a vdev guid or a devid must be present for matching + */ + if (!nvlist_exists(nvl, DEV_IDENTIFIER) && + !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { + nvlist_free(nvl); + return (NULL); + } + + 
return (nvl); +} + +/* + * Listen for block device uevents + */ +static void * +zed_udev_monitor(void *arg) +{ + struct udev_monitor *mon = arg; + + zed_log_msg(LOG_INFO, "Waiting for new uduev disk events..."); + + while (1) { + struct udev_device *dev; + const char *action, *type, *part, *sectors; + const char *bus, *uuid; + const char *class, *subclass; + nvlist_t *nvl; + boolean_t is_zfs = B_FALSE; + + /* allow a cancellation while blocked (recvmsg) */ + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + /* blocks at recvmsg until an event occurs */ + if ((dev = udev_monitor_receive_device(mon)) == NULL) { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " + "device error %d", errno); + continue; + } + + /* allow all steps to complete before a cancellation */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* + * Strongly typed device is the preferred filter + */ + type = udev_device_get_property_value(dev, "ID_FS_TYPE"); + if (type != NULL && type[0] != '\0') { + if (strcmp(type, "zfs_member") == 0) { + is_zfs = B_TRUE; + } else { + /* not ours, so skip */ + zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " + "%s (in use by %s)", + udev_device_get_devnode(dev), type); + udev_device_unref(dev); + continue; + } + } + + /* + * if this is a disk and it is partitioned, then the + * zfs label will reside in a DEVTYPE=partition and + * we can skip passing this event + */ + type = udev_device_get_property_value(dev, "DEVTYPE"); + part = udev_device_get_property_value(dev, + "ID_PART_TABLE_TYPE"); + if (type != NULL && type[0] != '\0' && + strcmp(type, "disk") == 0 && + part != NULL && part[0] != '\0') { + /* skip and wait for partition event */ + zed_log_msg(LOG_INFO, "zed_udev_monitor: %s waiting " + "for slice", udev_device_get_devnode(dev)); + udev_device_unref(dev); + continue; + } + + /* + * ignore small partitions + */ + sectors = udev_device_get_property_value(dev, + "ID_PART_ENTRY_SIZE"); + if (sectors == NULL) + sectors = 
udev_device_get_sysattr_value(dev, "size"); + if (sectors != NULL && + strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) { + udev_device_unref(dev); + continue; + } + + /* + * If the blkid probe didn't find ZFS, then a persistent + * device id string is required in the message schema + * for matching with vdevs. Preflight here for expected + * udev information. + */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (!is_zfs && (bus == NULL && uuid == NULL)) { + zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " + "source", udev_device_get_devnode(dev)); + udev_device_unref(dev); + continue; + } + + action = udev_device_get_action(dev); + if (strcmp(action, "add") == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else if (strcmp(action, "remove") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } else if (strcmp(action, "change") == 0) { + class = EC_DEV_STATUS; + subclass = ESC_DEV_DLE; + } else { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", + action); + udev_device_unref(dev); + continue; + } + + /* + * Special case an EC_DEV_ADD for multipath devices + * + * When a multipath device is created, udev reports the + * following: + * + * 1. "add" event of the dm device for the multipath device + * (like /dev/dm-3). + * 2. "change" event to create the actual multipath device + * symlink (like /dev/mapper/mpatha). The event also + * passes back the relevant DM vars we care about, like + * DM_UUID. + * 3. Another "change" event identical to #2 (that we ignore). + * + * To get the behavior we want, we treat the "change" event + * in #2 as a "add" event; as if "/dev/mapper/mpatha" was + * a new disk being added. 
+ */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "DM_UUID") && + udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { + /* Fake a MP "change" event to look like a "create" */ + class = EC_DEV_ADD; + subclass = ESC_DISK; + } + + if ((nvl = dev_event_nvlist(dev)) != NULL) { + zed_udev_event(class, subclass, nvl); + nvlist_free(nvl); + } + + udev_device_unref(dev); + } + + return (NULL); +} + +int +zed_disk_event_init() +{ + int fd, fflags; + + if ((g_udev = udev_new()) == NULL) { + zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); + return (-1); + } + + /* Set up a udev monitor for block devices */ + g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", + "partition"); + udev_monitor_enable_receiving(g_mon); + + /* Make sure monitoring socket is blocking */ + fd = udev_monitor_get_fd(g_mon); + if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) + (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); + + /* spawn a thread to monitor events */ + if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { + udev_monitor_unref(g_mon); + udev_unref(g_udev); + zed_log_msg(LOG_WARNING, "pthread_create failed"); + return (-1); + } + + zed_log_msg(LOG_INFO, "zed_disk_event_init"); + + return (0); +} + +void +zed_disk_event_fini() +{ + /* cancel monitor thread at recvmsg() */ + (void) pthread_cancel(g_mon_tid); + (void) pthread_join(g_mon_tid, NULL); + + /* cleanup udev resources */ + udev_monitor_unref(g_mon); + udev_unref(g_udev); + + zed_log_msg(LOG_INFO, "zed_disk_event_fini"); +} + +#else + +#include "zed_disk_event.h" + +int +zed_disk_event_init() +{ + return (0); +} + +void +zed_disk_event_fini() +{ +} + +#endif /* HAVE_LIBUDEV */ diff --git a/cmd/zed/zed_disk_event.h b/cmd/zed/zed_disk_event.h new file mode 100644 index 000000000..ea9813d0a --- /dev/null +++ 
b/cmd/zed/zed_disk_event.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef ZED_DISK_EVENT_H +#define ZED_DISK_EVENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zed_disk_event_init(void); +extern void zed_disk_event_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZED_DISK_EVENT_H */ diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index 7d819b803..0e5c6793d 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -24,13 +24,17 @@ #include <sys/zfs_ioctl.h> #include <time.h> #include <unistd.h> +#include <sys/fm/fs/zfs.h> #include "zed.h" #include "zed_conf.h" +#include "zed_disk_event.h" #include "zed_exec.h" #include "zed_file.h" #include "zed_log.h" #include "zed_strings.h" +#include "agents/zfs_agents.h" + #define MAXBUF 4096 /* @@ -50,6 +54,15 @@ zed_event_init(struct zed_conf *zcp) if (zcp->zevent_fd < 0) zed_log_die("Failed to open \"%s\": %s", ZFS_DEV, strerror(errno)); + + if (zfs_slm_init(zcp->zfs_hdl) != 0) + zed_log_die("Failed to initialize zfs slm"); + if (zfs_diagnosis_init(zcp->zfs_hdl) != 0) + zed_log_die("Failed to initialize zfs diagnosis"); + if (zfs_retire_init(zcp->zfs_hdl) != 0) + zed_log_die("Failed to initialize zfs retire"); + if (zed_disk_event_init() != 0) + zed_log_die("Failed to initialize disk events"); } /* @@ -61,6 +74,11 @@ zed_event_fini(struct zed_conf *zcp) if (!zcp) zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); + zed_disk_event_fini(); + zfs_retire_fini(); + zfs_diagnosis_fini(); + zfs_slm_fini(); + if (zcp->zevent_fd >= 0) { if (close(zcp->zevent_fd) 
< 0) zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s", @@ -624,6 +642,17 @@ _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) _zed_event_add_var(eid, zsp, prefix, name, (_zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"), (u_longlong_t) i64); + /* + * shadow readable strings for vdev state pairs + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_state_to_name(i64, VDEV_AUX_NONE)); + } break; case DATA_TYPE_DOUBLE: (void) nvpair_value_double(nvp, &d); @@ -803,6 +832,17 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) } } +static void +_zed_internal_event(const char *class, nvlist_t *nvl) +{ + /* + * NOTE: only vdev check is handled for now + */ + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + (void) zfs_slm_event("EC_zfs", "ESC_ZFS_vdev_check", nvl); + } +} + /* * Service the next zevent, blocking until one is available. 
*/ @@ -853,6 +893,9 @@ zed_event_service(struct zed_conf *zcp) zed_log_msg(LOG_WARNING, "Failed to lookup zevent class (eid=%llu)", eid); } else { + /* let internal modules see this event first */ + _zed_internal_event(class, nvl); + zsp = zed_strings_create(); nvp = NULL; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9041f9c33..09531b21d 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -47,6 +47,7 @@ #include <zfs_prop.h> #include <sys/fs/zfs.h> #include <sys/stat.h> +#include <sys/fm/fs/zfs.h> #include <sys/fm/util.h> #include <sys/fm/protocol.h> #include <sys/zfs_ioctl.h> @@ -6849,7 +6850,20 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); - printf(gettext("0x%llx"), (u_longlong_t)i64); + /* + * translate vdev state values to readable + * strings to aid zpool events consumers + */ + if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + printf(gettext("\"%s\" (0x%llx)"), + zpool_state_to_name(i64, VDEV_AUX_NONE), + (u_longlong_t)i64); + } else { + printf(gettext("0x%llx"), (u_longlong_t)i64); + } break; case DATA_TYPE_HRTIME: diff --git a/include/libzfs.h b/include/libzfs.h index f83e21423..287555acf 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -824,6 +824,13 @@ extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); */ extern boolean_t is_mpath_whole_disk(const char *); extern void update_vdev_config_dev_strs(nvlist_t *); +extern char *zfs_strip_partition(libzfs_handle_t *, char *); + +#ifdef HAVE_LIBUDEV +struct udev_device; +extern int zfs_device_get_devid(struct udev_device *, char *, size_t); +extern int zfs_device_get_physical(struct udev_device *, char *, size_t); +#endif #ifdef __cplusplus } diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index ad3f4a79e..25510f8ca 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -57,9 
+57,11 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH "vdev_physpath" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE "vdev_laststate" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" diff --git a/include/sys/spa.h b/include/sys/spa.h index 51d4619f4..fead2d9de 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -849,7 +849,7 @@ extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); -extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern void zfs_post_sysevent(spa_t *spa, vdev_t *vd, const char *name); extern uint64_t spa_get_errlog_size(spa_t *spa); diff --git a/include/sys/sysevent/Makefile.am b/include/sys/sysevent/Makefile.am index 0d29eeb80..e9af2684f 100644 --- a/include/sys/sysevent/Makefile.am +++ b/include/sys/sysevent/Makefile.am @@ -1,5 +1,6 @@ COMMON_H = \ - $(top_srcdir)/include/sys/sysevent/eventdefs.h + $(top_srcdir)/include/sys/sysevent/eventdefs.h \ + $(top_srcdir)/include/sys/sysevent/dev.h KERNEL_H = diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h new file mode 100644 index 000000000..1117538d8 --- /dev/null +++ b/include/sys/sysevent/dev.h @@ -0,0 +1,261 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of 
the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSEVENT_DEV_H +#define _SYS_SYSEVENT_DEV_H + +#include <sys/sysevent/eventdefs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Event schema for EC_DEV_ADD/ESC_DISK + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_DISK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name to the raw device. + * The name does not include the slice number component. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. 
+ * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property> + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_ADD/ESC_NETWORK + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_NETWORK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name associated with the device if exists. + * /dev name associated with the driver for DLPI + * Style-2 only drivers. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property> + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_ADD/ESC_PRINTER + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_PRINTER + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev/printers name associated with the device + * if exists. 
+ * /dev name associated with the device if it exists + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property> + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_REMOVE/ESC_DISK + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_DISK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name to the raw device. + * The name does not include the slice number component. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_REMOVE/ESC_NETWORK + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_NETWORK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name associated with the device if exists. + * /dev name associated with the driver for DLPI + * Style-2 only drivers. 
+ * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_REMOVE/ESC_PRINTER + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_PRINTER + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev/printers name associated with the device + * if exists. + * /dev name associated with the device if it exists + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_BRANCH/ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE + * + * Event Class - EC_DEV_BRANCH + * Event Sub-Class - ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path to the root node of the device subtree + * without the "/devices" prefix. 
+ */ + +#define EV_VERSION "version" +#define DEV_PHYS_PATH "phys_path" +#define DEV_NAME "dev_name" +#define DEV_DRIVER_NAME "driver_name" +#define DEV_INSTANCE "instance" +#define DEV_PROP_PREFIX "prop-" + +#ifdef __linux__ +#define DEV_IDENTIFIER "devid" +#define DEV_PATH "path" +#define DEV_IS_PART "is_slice" +#define DEV_SIZE "dev_size" +#endif /* __linux__ */ + +#define EV_V1 1 + +/* maximum number of devinfo node properties added to the event */ +#define MAX_PROP_COUNT 100 + +/* only properties with size less than PROP_LEN_LIMIT are added to the event */ +#define PROP_LEN_LIMIT 1024 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_DEV_H */ diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index edc0adcee..edd4e5d58 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -116,10 +116,10 @@ typedef struct vdev_dev_strs { /* * Obtain the persistent device id string (describes what) * - * used by ZED auto-{online,expand,replace} + * used by ZED vdev matching for auto-{online,expand,replace} */ -static int -udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) { struct udev_list_entry *entry; const char *bus; @@ -167,10 +167,10 @@ udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) /* * Obtain the persistent physical location string (describes where) * - * used by ZED auto-{online,expand,replace} + * used by ZED vdev matching for auto-{online,expand,replace} */ -static int -udev_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) { const char *physpath, *value; @@ -394,12 +394,12 @@ encode_device_strings(const char *path, vdev_dev_strs_t *ds, if (!wholedisk && !udev_mpath_whole_disk(dev)) goto no_dev; - ret = udev_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); + ret = 
zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); if (ret != 0) goto no_dev_ref; /* physical location string (optional) */ - if (udev_device_get_physical(dev, ds->vds_devphys, + if (zfs_device_get_physical(dev, ds->vds_devphys, sizeof (ds->vds_devphys)) != 0) { ds->vds_devphys[0] = '\0'; /* empty string --> not available */ } diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 2484ddc12..a37ea7913 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3397,9 +3397,11 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The * third case only occurs when preceded by a string matching the regular * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. + * + * caller must free the returned string */ -static char * -strip_partition(libzfs_handle_t *hdl, char *path) +char * +zfs_strip_partition(libzfs_handle_t *hdl, char *path) { char *tmp = zfs_strdup(hdl, path); char *part = NULL, *d = NULL; @@ -3542,7 +3544,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { - return (strip_partition(hdl, path)); + return (zfs_strip_partition(hdl, path)); } } else { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); @@ -4216,7 +4218,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); - if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { + if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { /* * This shouldn't happen. We've long since verified that this * is a valid device. 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 75f6e5ce1..dcf56d8df 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3373,19 +3373,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); - /* - * If we have brought this vdev back into service, we need - * to notify fmd so that it can gracefully repair any outstanding - * cases due to a missing device. We do this in all cases, even those - * that probably don't correlate to a repaired fault. This is sure to - * catch all cases, and we let the zfs-retire agent sort it out. If - * this is a transient state it's OK, as the retire agent will - * double-check the state of the vdev before repairing it. - */ - if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && - vd->vdev_prevstate != state) - zfs_post_state_change(spa, vd); - if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -3466,6 +3453,20 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_FALSE; } + /* + * Notify ZED of any significant state-change on a leaf vdev. + * + * We ignore transitions from a closed state to healthy unless + * the parent was degraded. 
+ */ + if (vd->vdev_ops->vdev_op_leaf && + ((save_state > VDEV_STATE_CLOSED) || + (vd->vdev_state < VDEV_STATE_HEALTHY) || + (vd->vdev_parent != NULL && + vd->vdev_parent->vdev_prevstate == VDEV_STATE_DEGRADED))) { + zfs_post_state_change(spa, vd, save_state); + } + if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 0871fa95b..0d508c0b8 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -848,7 +848,8 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, } static void -zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name) +zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, + nvlist_t *aux) { #ifdef _KERNEL nvlist_t *resource; @@ -883,6 +884,13 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name) if (vd->vdev_fru != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); + /* also copy any optional payload data */ + if (aux) { + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) + (void) nvlist_add_nvpair(resource, elem); + } } zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); @@ -898,7 +906,7 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name) void zfs_post_remove(spa_t *spa, vdev_t *vd) { - zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED); + zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); } /* @@ -909,7 +917,7 @@ zfs_post_remove(spa_t *spa, vdev_t *vd) void zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { - zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE); + zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); } /* @@ -919,9 +927,31 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd) * open because the device was not found (fault.fs.zfs.device). 
*/ void -zfs_post_state_change(spa_t *spa, vdev_t *vd) +zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) { - zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE); +#ifdef _KERNEL + nvlist_t *aux; + + /* + * Add optional supplemental keys to payload + */ + aux = fm_nvlist_create(NULL); + if (vd && aux) { + if (vd->vdev_physpath) { + (void) nvlist_add_string(aux, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, + vd->vdev_physpath); + } + (void) nvlist_add_uint64(aux, + FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); + } + + zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, + aux); + + if (aux) + fm_nvlist_destroy(aux, FM_NVA_FREE); +#endif } /* @@ -933,7 +963,7 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd) void zfs_post_sysevent(spa_t *spa, vdev_t *vd, const char *name) { - zfs_post_common(spa, vd, FM_SYSEVENT_CLASS, name); + zfs_post_common(spa, vd, FM_SYSEVENT_CLASS, name, NULL); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh index c2b97c2c5..45b66447f 100755 --- a/scripts/zconfig.sh +++ b/scripts/zconfig.sh @@ -579,10 +579,10 @@ test_9() { ${ZFS} create -V 300M ${FULL_NAME} || fail 3 udev_trigger - # Dump the events, there should be at least 5 lines. + # Dump the events, there should be a pool create event ${ZPOOL} events >${TMP_EVENTS} || fail 4 - EVENTS=`wc -l ${TMP_EVENTS} | cut -f1 -d' '` - [ $EVENTS -lt 5 ] && fail 5 + MATCHES=`grep -c sysevent\.fs\.zfs\.pool_create ${TMP_EVENTS}` + [ $MATCHES -eq 1 ] || fail 5 # Clear the events and ensure there are none. ${ZPOOL} events -c >/dev/null || fail 6 |