diff options
author | Don Brady <[email protected]> | 2016-03-14 10:04:21 -0600 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-03-31 13:45:53 -0700 |
commit | 39fc0cb5577e409f415e25d35a452b46996c08b1 (patch) | |
tree | 400e9790da6d1d470ffb65b63758320c37875eb0 /lib | |
parent | bebd73f2c3822a86863e7508e7a3e2929d871113 (diff) |
Add support for devid and phys_path keys in vdev disk labels
This is foundational work for ZED.
Updates a leaf vdev's persistent device strings on the Linux platform:
* only applies for a dedicated leaf vdev (aka whole disk)
* updated during pool create|add|attach|import
* used for device matching during auto-{online,expand,replace}
* stored in a leaf disk config label (i.e. alongside 'path' NVP)
* can opt out using the env var ZFS_VDEV_DEVID_OPT_OUT=YES
Some examples:
path: '/dev/sdb1'
devid: 'scsi-350000394a8ca4fbc-part1'
phys_path: 'pci-0000:04:00.0-sas-0x50000394a8ca4fbf-lun-0'
path: '/dev/mapper/mpatha'
devid: 'dm-uuid-mpath-35000c5006304de3f'
Signed-off-by: Don Brady <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2856
Closes #3978
Closes #4416
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libspl/include/devid.h | 77 | ||||
-rw-r--r-- | lib/libzfs/Makefile.am | 2 | ||||
-rw-r--r-- | lib/libzfs/libzfs_import.c | 348 | ||||
-rw-r--r-- | lib/libzfs/libzfs_pool.c | 16 |
4 files changed, 336 insertions, 107 deletions
diff --git a/lib/libspl/include/devid.h b/lib/libspl/include/devid.h index 4b346da0f..8e483281a 100644 --- a/lib/libspl/include/devid.h +++ b/lib/libspl/include/devid.h @@ -28,82 +28,5 @@ #define _LIBSPL_DEVID_H #include <sys/types.h> -#include <stdlib.h> - -typedef int ddi_devid_t; - -typedef struct devid_nmlist { - char *devname; - dev_t dev; -} devid_nmlist_t; - -static inline -int -devid_str_decode( - char *devidstr, - ddi_devid_t *retdevid, - char **retminor_name) -{ - abort(); -} - -static inline -int -devid_deviceid_to_nmlist( - char *search_path, - ddi_devid_t devid, - char *minor_name, - devid_nmlist_t **retlist) -{ - abort(); -} - -static inline -void -devid_str_free(char *str) -{ - abort(); -} - -static inline -void -devid_free(ddi_devid_t devid) -{ - abort(); -} - -static inline -void -devid_free_nmlist(devid_nmlist_t *list) -{ - abort(); -} - -static inline -int -devid_get( - int fd, - ddi_devid_t *retdevid) -{ - return (-1); -} - -static inline -int -devid_get_minor_name( - int fd, - char **retminor_name) -{ - abort(); -} - -static inline -char * -devid_str_encode( - ddi_devid_t devid, - char *minor_name) -{ - abort(); -} #endif diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index a9e2d99fc..9a32a3013 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -36,7 +36,7 @@ libzfs_la_LIBADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libzpool/libzpool.la -libzfs_la_LIBADD += -lm $(LIBBLKID) +libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) libzfs_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = $(libzfs_pc_DATA) $(USER_C) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index ca3736a1c..50c0019ce 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. 
+ * Copyright (c) 2016, Intel Corporation. */ /* @@ -46,6 +47,10 @@ #include <dirent.h> #include <errno.h> #include <libintl.h> +#ifdef HAVE_LIBUDEV +#include <libudev.h> +#include <sched.h> +#endif #include <stddef.h> #include <stdlib.h> #include <string.h> @@ -94,30 +99,328 @@ typedef struct pool_list { name_entry_t *names; } pool_list_t; -static char * -get_devid(const char *path) +/* + * Linux persistent device strings for vdev labels + * + * based on libudev for consistency with libudev disk add/remove events + */ +#ifdef HAVE_LIBUDEV + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +typedef struct vdev_dev_strs { + char vds_devid[128]; + char vds_devphys[128]; +} vdev_dev_strs_t; + +/* + * Obtain the persistent device id string (describes what) + * + * used by ZED auto-{online,expand,replace} + */ +static int +udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) { - int fd; - ddi_devid_t devid; - char *minor, *ret; + struct udev_list_entry *entry; + const char *bus; + char devbyid[MAXPATHLEN]; - if ((fd = open(path, O_RDONLY)) < 0) - return (NULL); + /* The bus based by-id path is preferred */ + bus = udev_device_get_property_value(dev, "ID_BUS"); - minor = NULL; - ret = NULL; - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0) - ret = devid_str_encode(devid, minor); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); + if (bus == NULL) { + const char *dm_uuid; + + /* + * For multipath nodes use the persistent uuid based identifier + * + * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f + */ + dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (dm_uuid != NULL) { + (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); + return (0); + } + return (ENODATA); } - (void) close(fd); + + /* + * locate the bus specific by-id link + */ + (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != 
NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, devbyid, strlen(devbyid)) == 0) { + name += strlen(DEV_BYID_PATH); + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * Obtain the persistent physical location string (describes where) + * + * used by ZED auto-{online,expand,replace} + */ +static int +udev_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + const char *physpath, *value; + + /* + * Skip indirect multipath device nodes + */ + value = udev_device_get_property_value(dev, "DM_MULTIPATH_DEVICE_PATH"); + if (value != NULL && strcmp(value, "1") == 0) + return (ENODATA); /* skip physical for multipath nodes */ + + physpath = udev_device_get_property_value(dev, "ID_PATH"); + if (physpath != NULL && physpath[0] != '\0') { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + return (ENODATA); +} + +/* + * A disk is considered a multipath whole disk when: + * DEVNAME key value has "dm-" + * DM_NAME key value has "mpath" prefix + * DM_UUID key exists + * ID_PART_TABLE_TYPE key does not exist or is not gpt + */ +static boolean_t +udev_mpath_whole_disk(struct udev_device *dev) +{ + const char *devname, *mapname, *type, *uuid; + + devname = udev_device_get_property_value(dev, "DEVNAME"); + mapname = udev_device_get_property_value(dev, "DM_NAME"); + type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + + if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && + (mapname != NULL && strncmp(mapname, "mpath", 5) == 0) && + ((type == NULL) || (strcmp(type, "gpt") != 0)) && + (uuid != NULL)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a disk is effectively a multipath whole disk + */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char 
nodepath[MAXPATHLEN]; + char *sysname; + boolean_t wholedisk = B_FALSE; + + if (realpath(path, nodepath) == NULL) + return (B_FALSE); + sysname = strrchr(nodepath, '/') + 1; + if (strncmp(sysname, "dm-", 3) != 0) + return (B_FALSE); + if ((udev = udev_new()) == NULL) + return (B_FALSE); + if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname)) == NULL) { + udev_device_unref(dev); + return (B_FALSE); + } + + wholedisk = udev_mpath_whole_disk(dev); + + udev_device_unref(dev); + return (wholedisk); +} + +static int +udev_device_is_ready(struct udev_device *dev) +{ +#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED + return (udev_device_get_is_initialized(dev)); +#else + /* wait for DEVLINKS property to be initialized */ + return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); +#endif +} + +/* + * Encode the persistent devices strings + * used for the vdev disk label + */ +static int +encode_device_strings(const char *path, vdev_dev_strs_t *ds, + boolean_t wholedisk) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname; + int ret = ENODEV; + hrtime_t start; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + /* resolve path to a runtime device node instance */ + if (realpath(path, nodepath) == NULL) + goto no_dev; + + sysname = strrchr(nodepath, '/') + 1; + + /* + * Wait up to 3 seconds for udev to set up the device node context + */ + start = gethrtime(); + do { + dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname); + if (dev == NULL) + goto no_dev; + if (udev_device_is_ready(dev)) + break; /* udev ready */ + + udev_device_unref(dev); + dev = NULL; + + if (NSEC2MSEC(gethrtime() - start) < 10) + (void) sched_yield(); /* yield/busy wait up to 10ms */ + else + (void) usleep(10 * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); + + if (dev == NULL) + goto no_dev; + + /* + * Only whole disks require extra device strings + */ + if (!wholedisk 
&& !udev_mpath_whole_disk(dev)) + goto no_dev; + + ret = udev_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); + if (ret != 0) + goto no_dev_ref; + + /* physical location string (optional) */ + if (udev_device_get_physical(dev, ds->vds_devphys, + sizeof (ds->vds_devphys)) != 0) { + ds->vds_devphys[0] = '\0'; /* empty string --> not available */ + } + +no_dev_ref: + udev_device_unref(dev); +no_dev: + udev_unref(udev); return (ret); } +/* + * Update a leaf vdev's persistent device strings (Linux only) + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * single device node example: + * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' + * + * multipath device node example: + * devid: 'dm-uuid-mpath-35000c5006304de3f' + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + vdev_dev_strs_t vds; + char *env, *type, *path; + uint64_t wholedisk = 0; + + /* + * For the benefit of legacy ZFS implementations, allow + * for opting out of devid strings in the vdev label. + * + * example use: + * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer + * + * explanation: + * Older ZFS on Linux implementations had issues when attempting to + * display pool config VDEV names if a "devid" NVP value is present + * in the pool's config. + * + * For example, a pool that originated on illumos platform would + * have a devid value in the config and "zpool status" would fail + * when listing the config. + * + * A pool can be stripped of any "devid" values on import or + * prevented from adding them on zpool create|add by setting + * ZFS_VDEV_DEVID_OPT_OUT. 
+ */ + env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + return; + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || + strcmp(type, VDEV_TYPE_DISK) != 0) { + return; + } + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + + /* + * Update device string values in config nvlist + */ + if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); + if (vds.vds_devphys[0] != '\0') { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vds.vds_devphys); + } + } else { + /* clear out any stale entries */ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + } +} +#else + +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} + +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ +} + +#endif /* HAVE_LIBUDEV */ + /* * Go through and fix up any path and/or devid information for the given vdev @@ -130,7 +433,7 @@ fix_paths(nvlist_t *nv, name_entry_t *names) uint_t c, children; uint64_t guid; name_entry_t *ne, *best; - char *path, *devid; + char *path; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { @@ -197,15 +500,8 @@ fix_paths(nvlist_t *nv, name_entry_t *names) if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) return (-1); - if ((devid = get_devid(best->ne_name)) == NULL) { - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); - } else { - if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) { - devid_str_free(devid); - return (-1); - } - devid_str_free(devid); - } + /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */ + 
update_vdev_config_dev_strs(nv); return (0); } diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 9fc4bfc5b..e7a9a0011 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3312,6 +3312,7 @@ zpool_reopen(zpool_handle_t *zhp) return (zpool_standard_error(hdl, errno, msg)); } +#if defined(__sun__) || defined(__sun) /* * Convert from a devid string to a path. */ @@ -3389,6 +3390,7 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) (void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc); } +#endif /* sun */ /* * Remove partition suffix from a vdev path. Partition suffixes may take three @@ -3443,12 +3445,10 @@ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, int name_flags) { - char *path, *devid, *type, *env; + char *path, *type, *env; uint64_t value; char buf[PATH_BUF_LEN]; char tmpbuf[PATH_BUF_LEN]; - vdev_stat_t *vs; - uint_t vsc; env = getenv("ZPOOL_VDEV_NAME_PATH"); if (env && (strtoul(env, NULL, 0) > 0 || @@ -3471,6 +3471,15 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { +#if defined(__sun__) || defined(__sun) + /* + * Live VDEV path updates to a kernel VDEV during a + * zpool_vdev_name lookup are not supported on Linux. + */ + char *devid; + vdev_stat_t *vs; + uint_t vsc; + /* * If the device is dead (faulted, offline, etc) then don't * bother opening it. Otherwise we may be forcing the user to @@ -3508,6 +3517,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, if (newdevid) devid_str_free(newdevid); } +#endif /* sun */ if (name_flags & VDEV_NAME_FOLLOW_LINKS) { char *rp = realpath(path, NULL); |