From 266852767f42781821c1d62544c9b9e985828304 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 26 Aug 2010 11:42:43 -0700 Subject: Add linux events This topic branch leverages the Solaris style FMA call points in ZFS to create a user space visible event notification system under Linux. This new system is called zevent and it unifies all previous Solaris style ereports and sysevent notifications. Under this Linux specific scheme when a sysevent or ereport event occurs an nvlist describing the event is created which looks almost exactly like a Solaris ereport. These events are queued up in the kernel when they occur and conditionally logged to the console. It is then up to a user space application to consume the events and do whatever it likes with them. To make this possible the existing /dev/zfs ABI has been extended with two new ioctls which behave as follows. * ZFS_IOC_EVENTS_NEXT Get the next pending event. The kernel will keep track of the last event consumed by the file descriptor and provide the next one if available. If no new events are available the ioctl() will block waiting for the next event. This ioctl may also be called in a non-blocking mode by setting zc.zc_guid = ZEVENT_NONBLOCK. In the non-blocking case if no events are available ENOENT will be returned. It is possible that ESHUTDOWN will be returned if the ioctl() is called while module unloading is in progress. And finally ENOMEM may occur if the provided nvlist buffer is not large enough to contain the entire event. * ZFS_IOC_EVENTS_CLEAR Clear all events queued by the kernel. The kernel will keep a fairly large number of recent events queued; use this ioctl to clear the in kernel list. This will affect all user space processes consuming events. The zpool command has been extended to use this events ABI with the 'events' subcommand. You may run 'zpool events -v' to output a verbose log of all recent events. 
This is very similar to the Solaris 'fmdump -ev' command with the key difference being it also includes what would be considered sysevents under Solaris. You may also run in follow mode with the '-f' option. To clear the in kernel event queue use the '-c' option. $ sudo cmd/zpool/zpool events -fv TIME CLASS May 13 2010 16:31:15.777711000 ereport.fs.zfs.config.sync class = "ereport.fs.zfs.config.sync" ena = 0x40982b7897700001 detector = (embedded nvlist) version = 0x0 scheme = "zfs" pool = 0xed976600de75dfa6 (end detector) time = 0x4bec8bc3 0x2e5aed98 pool = "zpios" pool_guid = 0xed976600de75dfa6 pool_context = 0x0 While the 'zpool events' command is handy for interactive debugging it is not expected to be the primary consumer of zevents. This ABI was primarily added to facilitate the addition of a user space monitoring daemon. This daemon would consume all events posted by the kernel and based on the type of event perform an action. For most events simply forwarding them on to syslog is likely enough. But this interface also cleanly allows for more sophisticated actions to be taken such as generating an email for a failed drive. Signed-off-by: Brian Behlendorf --- module/zfs/zfs_fm.c | 80 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 34 deletions(-) (limited to 'module/zfs/zfs_fm.c') diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 0b4812666..c93057e8e 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -98,6 +98,16 @@ * ereport with information about the differences. 
*/ #ifdef _KERNEL +static void +zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) +{ + if (nvl) + fm_nvlist_destroy(nvl, FM_NVA_FREE); + + if (detector) + fm_nvlist_destroy(detector, FM_NVA_FREE); +} + static void zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, @@ -410,7 +420,7 @@ update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count) * to the new smallest gap, to prepare for our next invocation. */ static void -shrink_ranges(zfs_ecksum_info_t *eip) +zei_shrink_ranges(zfs_ecksum_info_t *eip) { uint32_t mingap = UINT32_MAX; uint32_t new_allowed_gap = eip->zei_mingap + 1; @@ -429,12 +439,13 @@ shrink_ranges(zfs_ecksum_info_t *eip) uint32_t end = r[idx].zr_end; while (idx < max - 1) { - idx++; + uint32_t nstart, nend, gap; - uint32_t nstart = r[idx].zr_start; - uint32_t nend = r[idx].zr_end; + idx++; + nstart = r[idx].zr_start; + nend = r[idx].zr_end; - uint32_t gap = nstart - end; + gap = nstart - end; if (gap < new_allowed_gap) { end = nend; continue; @@ -454,13 +465,13 @@ shrink_ranges(zfs_ecksum_info_t *eip) } static void -add_range(zfs_ecksum_info_t *eip, int start, int end) +zei_add_range(zfs_ecksum_info_t *eip, int start, int end) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; if (count >= MAX_RANGES) { - shrink_ranges(eip); + zei_shrink_ranges(eip); count = eip->zei_range_count; } if (count == 0) { @@ -482,7 +493,7 @@ add_range(zfs_ecksum_info_t *eip, int start, int end) } static size_t -range_total_size(zfs_ecksum_info_t *eip) +zei_range_total_size(zfs_ecksum_info_t *eip) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; @@ -559,7 +570,7 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, if (start == -1) continue; - add_range(eip, start, idx); + zei_add_range(eip, start, idx); start = -1; } else { if (start != -1) @@ -569,10 +580,10 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } 
} if (start != -1) - add_range(eip, start, idx); + zei_add_range(eip, start, idx); /* See if it will fit in our inline buffers */ - inline_size = range_total_size(eip); + inline_size = zei_range_total_size(eip); if (inline_size > ZFM_MAX_INLINE) no_inline = 1; @@ -675,10 +686,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, if (ereport == NULL) return; - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); + /* Cleanup is handled by the callback function */ + zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #endif } @@ -730,12 +739,10 @@ zfs_ereport_finish_checksum(zio_cksum_report_t *report, good_data, bad_data, report->zcr_length, drop_if_identical); if (info != NULL) - fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); + zfs_zevent_post(report->zcr_ereport, + report->zcr_detector, zfs_zevent_post_cb); - fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); - fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); report->zcr_ereport = report->zcr_detector = NULL; - if (info != NULL) kmem_free(info, sizeof (*info)); #endif @@ -764,7 +771,7 @@ void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) { #ifdef _KERNEL - fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); + zfs_zevent_post(report->zcr_ereport, report->zcr_detector, NULL); #endif } @@ -787,14 +794,10 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); - if (info != NULL) - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); - - if (info != NULL) + if (info != NULL) { + zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); + } #endif } @@ -817,13 +820,14 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); 
VERIFY(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); - if (vd) + if (vd) { VERIFY(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0); + } - fm_ereport_post(resource, EVCH_SLEEP); - - fm_nvlist_destroy(resource, FM_NVA_FREE); + zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); #endif } @@ -836,7 +840,7 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) void zfs_post_remove(spa_t *spa, vdev_t *vd) { - zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); + zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_REMOVED); } /* @@ -847,7 +851,7 @@ zfs_post_remove(spa_t *spa, vdev_t *vd) void zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { - zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); + zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_AUTOREPLACE); } /* @@ -859,5 +863,13 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd) void zfs_post_state_change(spa_t *spa, vdev_t *vd) { - zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); + zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_STATECHANGE); } + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(zfs_ereport_post); +EXPORT_SYMBOL(zfs_ereport_post_checksum); +EXPORT_SYMBOL(zfs_post_remove); +EXPORT_SYMBOL(zfs_post_autoreplace); +EXPORT_SYMBOL(zfs_post_state_change); +#endif /* _KERNEL */ -- cgit v1.2.3