aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
author: Don Brady <[email protected]> 2018-11-05 12:22:33 -0700
committer: Brian Behlendorf <[email protected]> 2018-11-05 11:22:33 -0800
commit: e89f1295d4faa88bb05a62c8dd5f781657db5955 (patch)
tree: 8e39dfe33c6849e00813e54ec95c09a24448a43a /lib
parent: 6644e5bb6e1a6c25c5006c819abf93c7bb662e80 (diff)
Add libzutil for libzfs or libzpool consumers
Adds a libzutil for utility functions that are common to libzfs and
libzpool consumers (most of what was in libzfs_import.c). This
removes the need for utilities to link against both libzpool and
libzfs.

Reviewed-by: Matthew Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #8050
Diffstat (limited to 'lib')
-rw-r--r-- lib/Makefile.am 2
-rw-r--r-- lib/libzfs/Makefile.am 7
-rw-r--r-- lib/libzfs/libzfs_dataset.c 2
-rw-r--r-- lib/libzfs/libzfs_import.c 2255
-rw-r--r-- lib/libzfs/libzfs_iter.c 1
-rw-r--r-- lib/libzfs/libzfs_pool.c 403
-rw-r--r-- lib/libzfs/libzfs_sendrecv.c 1
-rw-r--r-- lib/libzfs/libzfs_status.c 66
-rw-r--r-- lib/libzfs/libzfs_util.c 335
-rw-r--r-- lib/libzpool/Makefile.am 5
-rw-r--r-- lib/libzpool/util.c 155
-rw-r--r-- lib/libzutil/Makefile.am 27
-rw-r--r-- lib/libzutil/zutil_device_path.c 625
-rw-r--r-- lib/libzutil/zutil_import.c 2389
-rw-r--r-- lib/libzutil/zutil_nicenum.c 157
-rw-r--r-- lib/libzutil/zutil_pool.c 145
16 files changed, 3425 insertions, 3150 deletions
diff --git a/lib/Makefile.am b/lib/Makefile.am
index e1833b842..8dff773df 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -1,6 +1,6 @@
# NB: GNU Automake Manual, Chapter 8.3.5: Libtool Convenience Libraries
# These six libraries are intermediary build components.
-SUBDIRS = libavl libefi libicp libshare libspl libtpool libunicode
+SUBDIRS = libavl libefi libicp libshare libspl libtpool libzutil libunicode
# These four libraries, which are installed as the final build product,
# incorporate the six convenience libraries given above.
diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am
index da40c96ce..34933e627 100644
--- a/lib/libzfs/Makefile.am
+++ b/lib/libzfs/Makefile.am
@@ -55,14 +55,13 @@ nodist_libzfs_la_SOURCES = \
$(KERNEL_C)
libzfs_la_LIBADD = \
- $(top_builddir)/lib/libefi/libefi.la \
$(top_builddir)/lib/libnvpair/libnvpair.la \
$(top_builddir)/lib/libshare/libshare.la \
- $(top_builddir)/lib/libtpool/libtpool.la \
$(top_builddir)/lib/libuutil/libuutil.la \
- $(top_builddir)/lib/libzfs_core/libzfs_core.la
+ $(top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(top_builddir)/lib/libzutil/libzutil.la
-libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBSSL)
+libzfs_la_LIBADD += -lm $(LIBSSL)
libzfs_la_LDFLAGS = -version-info 2:0:0
EXTRA_DIST = $(libzfs_pc_DATA) $(USER_C)
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index c84ed5bda..e79a936f9 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -36,7 +36,6 @@
#include <ctype.h>
#include <errno.h>
#include <libintl.h>
-#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
@@ -61,6 +60,7 @@
#include <sys/zap.h>
#include <sys/dsl_crypt.h>
#include <libzfs.h>
+#include <libzutil.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 634c076b8..44a46d1ce 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -26,822 +26,19 @@
* Copyright (c) 2016, Intel Corporation.
*/
-/*
- * Pool import support functions.
- *
- * To import a pool, we rely on reading the configuration information from the
- * ZFS label of each device. If we successfully read the label, then we
- * organize the configuration information in the following hierarchy:
- *
- * pool guid -> toplevel vdev guid -> label txg
- *
- * Duplicate entries matching this same tuple will be discarded. Once we have
- * examined every device, we pick the best label txg config for each toplevel
- * vdev. We then arrange these toplevel vdevs into a complete pool config, and
- * update any paths that have changed. Finally, we attempt to import the pool
- * using our derived config, and record the results.
- */
-
-#include <ctype.h>
#include <devid.h>
-#include <dirent.h>
#include <errno.h>
#include <libintl.h>
#include <libgen.h>
-#ifdef HAVE_LIBUDEV
-#include <libudev.h>
-#include <sched.h>
-#endif
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
-#include <fcntl.h>
-#include <sys/vtoc.h>
-#include <sys/dktp/fdisk.h>
-#include <sys/efi_partition.h>
-#include <thread_pool.h>
#include <sys/vdev_impl.h>
-#include <blkid/blkid.h>
-#include "libzfs.h"
-#include "libzfs_impl.h"
#include <libzfs.h>
-
-/*
- * Intermediate structures used to gather configuration information.
- */
-typedef struct config_entry {
- uint64_t ce_txg;
- nvlist_t *ce_config;
- struct config_entry *ce_next;
-} config_entry_t;
-
-typedef struct vdev_entry {
- uint64_t ve_guid;
- config_entry_t *ve_configs;
- struct vdev_entry *ve_next;
-} vdev_entry_t;
-
-typedef struct pool_entry {
- uint64_t pe_guid;
- vdev_entry_t *pe_vdevs;
- struct pool_entry *pe_next;
-} pool_entry_t;
-
-typedef struct name_entry {
- char *ne_name;
- uint64_t ne_guid;
- uint64_t ne_order;
- uint64_t ne_num_labels;
- struct name_entry *ne_next;
-} name_entry_t;
-
-typedef struct pool_list {
- pool_entry_t *pools;
- name_entry_t *names;
-} pool_list_t;
-
-#define DEV_BYID_PATH "/dev/disk/by-id/"
-
-/*
- * Linux persistent device strings for vdev labels
- *
- * based on libudev for consistency with libudev disk add/remove events
- */
-#ifdef HAVE_LIBUDEV
-
-typedef struct vdev_dev_strs {
- char vds_devid[128];
- char vds_devphys[128];
-} vdev_dev_strs_t;
-
-/*
- * Obtain the persistent device id string (describes what)
- *
- * used by ZED vdev matching for auto-{online,expand,replace}
- */
-int
-zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
-{
- struct udev_list_entry *entry;
- const char *bus;
- char devbyid[MAXPATHLEN];
-
- /* The bus based by-id path is preferred */
- bus = udev_device_get_property_value(dev, "ID_BUS");
-
- if (bus == NULL) {
- const char *dm_uuid;
-
- /*
- * For multipath nodes use the persistent uuid based identifier
- *
- * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
- */
- dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
- if (dm_uuid != NULL) {
- (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
- return (0);
- }
-
- /*
- * For volumes use the persistent /dev/zvol/dataset identifier
- */
- entry = udev_device_get_devlinks_list_entry(dev);
- while (entry != NULL) {
- const char *name;
-
- name = udev_list_entry_get_name(entry);
- if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
- (void) strlcpy(bufptr, name, buflen);
- return (0);
- }
- entry = udev_list_entry_get_next(entry);
- }
-
- /*
- * NVME 'by-id' symlinks are similar to bus case
- */
- struct udev_device *parent;
-
- parent = udev_device_get_parent_with_subsystem_devtype(dev,
- "nvme", NULL);
- if (parent != NULL)
- bus = "nvme"; /* continue with bus symlink search */
- else
- return (ENODATA);
- }
-
- /*
- * locate the bus specific by-id link
- */
- (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
- entry = udev_device_get_devlinks_list_entry(dev);
- while (entry != NULL) {
- const char *name;
-
- name = udev_list_entry_get_name(entry);
- if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
- name += strlen(DEV_BYID_PATH);
- (void) strlcpy(bufptr, name, buflen);
- return (0);
- }
- entry = udev_list_entry_get_next(entry);
- }
-
- return (ENODATA);
-}
-
-/*
- * Obtain the persistent physical location string (describes where)
- *
- * used by ZED vdev matching for auto-{online,expand,replace}
- */
-int
-zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
-{
- const char *physpath = NULL;
- struct udev_list_entry *entry;
-
- /*
- * Normal disks use ID_PATH for their physical path.
- */
- physpath = udev_device_get_property_value(dev, "ID_PATH");
- if (physpath != NULL && strlen(physpath) > 0) {
- (void) strlcpy(bufptr, physpath, buflen);
- return (0);
- }
-
- /*
- * Device mapper devices are virtual and don't have a physical
- * path. For them we use ID_VDEV instead, which is setup via the
- * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
- * to a virtual device. If you don't have vdev_id.conf setup,
- * you cannot use multipath autoreplace with device mapper.
- */
- physpath = udev_device_get_property_value(dev, "ID_VDEV");
- if (physpath != NULL && strlen(physpath) > 0) {
- (void) strlcpy(bufptr, physpath, buflen);
- return (0);
- }
-
- /*
- * For ZFS volumes use the persistent /dev/zvol/dataset identifier
- */
- entry = udev_device_get_devlinks_list_entry(dev);
- while (entry != NULL) {
- physpath = udev_list_entry_get_name(entry);
- if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
- (void) strlcpy(bufptr, physpath, buflen);
- return (0);
- }
- entry = udev_list_entry_get_next(entry);
- }
-
- /*
- * For all other devices fallback to using the by-uuid name.
- */
- entry = udev_device_get_devlinks_list_entry(dev);
- while (entry != NULL) {
- physpath = udev_list_entry_get_name(entry);
- if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
- (void) strlcpy(bufptr, physpath, buflen);
- return (0);
- }
- entry = udev_list_entry_get_next(entry);
- }
-
- return (ENODATA);
-}
-
-boolean_t
-udev_is_mpath(struct udev_device *dev)
-{
- return udev_device_get_property_value(dev, "DM_UUID") &&
- udev_device_get_property_value(dev, "MPATH_SBIN_PATH");
-}
-
-/*
- * A disk is considered a multipath whole disk when:
- * DEVNAME key value has "dm-"
- * DM_NAME key value has "mpath" prefix
- * DM_UUID key exists
- * ID_PART_TABLE_TYPE key does not exist or is not gpt
- */
-static boolean_t
-udev_mpath_whole_disk(struct udev_device *dev)
-{
- const char *devname, *type, *uuid;
-
- devname = udev_device_get_property_value(dev, "DEVNAME");
- type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
- uuid = udev_device_get_property_value(dev, "DM_UUID");
-
- if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
- ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
- (uuid != NULL)) {
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-/*
- * Check if a disk is effectively a multipath whole disk
- */
-boolean_t
-is_mpath_whole_disk(const char *path)
-{
- struct udev *udev;
- struct udev_device *dev = NULL;
- char nodepath[MAXPATHLEN];
- char *sysname;
- boolean_t wholedisk = B_FALSE;
-
- if (realpath(path, nodepath) == NULL)
- return (B_FALSE);
- sysname = strrchr(nodepath, '/') + 1;
- if (strncmp(sysname, "dm-", 3) != 0)
- return (B_FALSE);
- if ((udev = udev_new()) == NULL)
- return (B_FALSE);
- if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
- sysname)) == NULL) {
- udev_device_unref(dev);
- return (B_FALSE);
- }
-
- wholedisk = udev_mpath_whole_disk(dev);
-
- udev_device_unref(dev);
- return (wholedisk);
-}
-
-static int
-udev_device_is_ready(struct udev_device *dev)
-{
-#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
- return (udev_device_get_is_initialized(dev));
-#else
- /* wait for DEVLINKS property to be initialized */
- return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
-#endif
-}
-
-/*
- * Wait up to timeout_ms for udev to set up the device node. The device is
- * considered ready when libudev determines it has been initialized, all of
- * the device links have been verified to exist, and it has been allowed to
- * settle. At this point the device the device can be accessed reliably.
- * Depending on the complexity of the udev rules this process could take
- * several seconds.
- */
-int
-zpool_label_disk_wait(char *path, int timeout_ms)
-{
- struct udev *udev;
- struct udev_device *dev = NULL;
- char nodepath[MAXPATHLEN];
- char *sysname = NULL;
- int ret = ENODEV;
- int settle_ms = 50;
- long sleep_ms = 10;
- hrtime_t start, settle;
-
- if ((udev = udev_new()) == NULL)
- return (ENXIO);
-
- start = gethrtime();
- settle = 0;
-
- do {
- if (sysname == NULL) {
- if (realpath(path, nodepath) != NULL) {
- sysname = strrchr(nodepath, '/') + 1;
- } else {
- (void) usleep(sleep_ms * MILLISEC);
- continue;
- }
- }
-
- dev = udev_device_new_from_subsystem_sysname(udev,
- "block", sysname);
- if ((dev != NULL) && udev_device_is_ready(dev)) {
- struct udev_list_entry *links, *link = NULL;
-
- ret = 0;
- links = udev_device_get_devlinks_list_entry(dev);
-
- udev_list_entry_foreach(link, links) {
- struct stat64 statbuf;
- const char *name;
-
- name = udev_list_entry_get_name(link);
- errno = 0;
- if (stat64(name, &statbuf) == 0 && errno == 0)
- continue;
-
- settle = 0;
- ret = ENODEV;
- break;
- }
-
- if (ret == 0) {
- if (settle == 0) {
- settle = gethrtime();
- } else if (NSEC2MSEC(gethrtime() - settle) >=
- settle_ms) {
- udev_device_unref(dev);
- break;
- }
- }
- }
-
- udev_device_unref(dev);
- (void) usleep(sleep_ms * MILLISEC);
-
- } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
-
- udev_unref(udev);
-
- return (ret);
-}
-
-
-/*
- * Encode the persistent devices strings
- * used for the vdev disk label
- */
-static int
-encode_device_strings(const char *path, vdev_dev_strs_t *ds,
- boolean_t wholedisk)
-{
- struct udev *udev;
- struct udev_device *dev = NULL;
- char nodepath[MAXPATHLEN];
- char *sysname;
- int ret = ENODEV;
- hrtime_t start;
-
- if ((udev = udev_new()) == NULL)
- return (ENXIO);
-
- /* resolve path to a runtime device node instance */
- if (realpath(path, nodepath) == NULL)
- goto no_dev;
-
- sysname = strrchr(nodepath, '/') + 1;
-
- /*
- * Wait up to 3 seconds for udev to set up the device node context
- */
- start = gethrtime();
- do {
- dev = udev_device_new_from_subsystem_sysname(udev, "block",
- sysname);
- if (dev == NULL)
- goto no_dev;
- if (udev_device_is_ready(dev))
- break; /* udev ready */
-
- udev_device_unref(dev);
- dev = NULL;
-
- if (NSEC2MSEC(gethrtime() - start) < 10)
- (void) sched_yield(); /* yield/busy wait up to 10ms */
- else
- (void) usleep(10 * MILLISEC);
-
- } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
-
- if (dev == NULL)
- goto no_dev;
-
- /*
- * Only whole disks require extra device strings
- */
- if (!wholedisk && !udev_mpath_whole_disk(dev))
- goto no_dev;
-
- ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
- if (ret != 0)
- goto no_dev_ref;
-
- /* physical location string (optional) */
- if (zfs_device_get_physical(dev, ds->vds_devphys,
- sizeof (ds->vds_devphys)) != 0) {
- ds->vds_devphys[0] = '\0'; /* empty string --> not available */
- }
-
-no_dev_ref:
- udev_device_unref(dev);
-no_dev:
- udev_unref(udev);
-
- return (ret);
-}
-
-/*
- * Update a leaf vdev's persistent device strings (Linux only)
- *
- * - only applies for a dedicated leaf vdev (aka whole disk)
- * - updated during pool create|add|attach|import
- * - used for matching device matching during auto-{online,expand,replace}
- * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
- * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
- *
- * single device node example:
- * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
- * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
- *
- * multipath device node example:
- * devid: 'dm-uuid-mpath-35000c5006304de3f'
- *
- * We also store the enclosure sysfs path for turning on enclosure LEDs
- * (if applicable):
- * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
- */
-void
-update_vdev_config_dev_strs(nvlist_t *nv)
-{
- vdev_dev_strs_t vds;
- char *env, *type, *path;
- uint64_t wholedisk = 0;
- char *upath, *spath;
-
- /*
- * For the benefit of legacy ZFS implementations, allow
- * for opting out of devid strings in the vdev label.
- *
- * example use:
- * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
- *
- * explanation:
- * Older ZFS on Linux implementations had issues when attempting to
- * display pool config VDEV names if a "devid" NVP value is present
- * in the pool's config.
- *
- * For example, a pool that originated on illumos platform would
- * have a devid value in the config and "zpool status" would fail
- * when listing the config.
- *
- * A pool can be stripped of any "devid" values on import or
- * prevented from adding them on zpool create|add by setting
- * ZFS_VDEV_DEVID_OPT_OUT.
- */
- env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
- if (env && (strtoul(env, NULL, 0) > 0 ||
- !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
- (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
- (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
- return;
- }
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
- strcmp(type, VDEV_TYPE_DISK) != 0) {
- return;
- }
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
- return;
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
-
- /*
- * Update device string values in config nvlist
- */
- if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
- (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
- if (vds.vds_devphys[0] != '\0') {
- (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- vds.vds_devphys);
- }
-
- /* Add enclosure sysfs path (if disk is in an enclosure) */
- upath = zfs_get_underlying_path(path);
- spath = zfs_get_enclosure_sysfs_path(upath);
- if (spath)
- nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
- spath);
- else
- nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
-
- free(upath);
- free(spath);
- } else {
- /* clear out any stale entries */
- (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
- (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
- (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
- }
-}
-#else
-
-boolean_t
-is_mpath_whole_disk(const char *path)
-{
- return (B_FALSE);
-}
-
-/*
- * Wait up to timeout_ms for udev to set up the device node. The device is
- * considered ready when the provided path have been verified to exist and
- * it has been allowed to settle. At this point the device the device can
- * be accessed reliably. Depending on the complexity of the udev rules thisi
- * process could take several seconds.
- */
-int
-zpool_label_disk_wait(char *path, int timeout_ms)
-{
- int settle_ms = 50;
- long sleep_ms = 10;
- hrtime_t start, settle;
- struct stat64 statbuf;
-
- start = gethrtime();
- settle = 0;
-
- do {
- errno = 0;
- if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
- if (settle == 0)
- settle = gethrtime();
- else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
- return (0);
- } else if (errno != ENOENT) {
- return (errno);
- }
-
- usleep(sleep_ms * MILLISEC);
- } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
-
- return (ENODEV);
-}
-
-void
-update_vdev_config_dev_strs(nvlist_t *nv)
-{
-}
-
-#endif /* HAVE_LIBUDEV */
-
-/*
- * Go through and fix up any path and/or devid information for the given vdev
- * configuration.
- */
-static int
-fix_paths(nvlist_t *nv, name_entry_t *names)
-{
- nvlist_t **child;
- uint_t c, children;
- uint64_t guid;
- name_entry_t *ne, *best;
- char *path;
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- if (fix_paths(child[c], names) != 0)
- return (-1);
- return (0);
- }
-
- /*
- * This is a leaf (file or disk) vdev. In either case, go through
- * the name list and see if we find a matching guid. If so, replace
- * the path and see if we can calculate a new devid.
- *
- * There may be multiple names associated with a particular guid, in
- * which case we have overlapping partitions or multiple paths to the
- * same disk. In this case we prefer to use the path name which
- * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
- * use the lowest order device which corresponds to the first match
- * while traversing the ZPOOL_IMPORT_PATH search path.
- */
- verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
- path = NULL;
-
- best = NULL;
- for (ne = names; ne != NULL; ne = ne->ne_next) {
- if (ne->ne_guid == guid) {
- if (path == NULL) {
- best = ne;
- break;
- }
-
- if ((strlen(path) == strlen(ne->ne_name)) &&
- strncmp(path, ne->ne_name, strlen(path)) == 0) {
- best = ne;
- break;
- }
-
- if (best == NULL) {
- best = ne;
- continue;
- }
-
- /* Prefer paths with move vdev labels. */
- if (ne->ne_num_labels > best->ne_num_labels) {
- best = ne;
- continue;
- }
-
- /* Prefer paths earlier in the search order. */
- if (ne->ne_num_labels == best->ne_num_labels &&
- ne->ne_order < best->ne_order) {
- best = ne;
- continue;
- }
- }
- }
-
- if (best == NULL)
- return (0);
-
- if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
- return (-1);
-
- /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
- update_vdev_config_dev_strs(nv);
-
- return (0);
-}
-
-/*
- * Add the given configuration to the list of known devices.
- */
-static int
-add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
- int order, int num_labels, nvlist_t *config)
-{
- uint64_t pool_guid, vdev_guid, top_guid, txg, state;
- pool_entry_t *pe;
- vdev_entry_t *ve;
- config_entry_t *ce;
- name_entry_t *ne;
-
- /*
- * If this is a hot spare not currently in use or level 2 cache
- * device, add it to the list of names to translate, but don't do
- * anything else.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &state) == 0 &&
- (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
- if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
- return (-1);
-
- if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
- free(ne);
- return (-1);
- }
- ne->ne_guid = vdev_guid;
- ne->ne_order = order;
- ne->ne_num_labels = num_labels;
- ne->ne_next = pl->names;
- pl->names = ne;
-
- return (0);
- }
-
- /*
- * If we have a valid config but cannot read any of these fields, then
- * it means we have a half-initialized label. In vdev_label_init()
- * we write a label with txg == 0 so that we can identify the device
- * in case the user refers to the same disk later on. If we fail to
- * create the pool, we'll be left with a label in this state
- * which should not be considered part of a valid pool.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &pool_guid) != 0 ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
- &vdev_guid) != 0 ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
- &top_guid) != 0 ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0) {
- return (0);
- }
-
- /*
- * First, see if we know about this pool. If not, then add it to the
- * list of known pools.
- */
- for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
- if (pe->pe_guid == pool_guid)
- break;
- }
-
- if (pe == NULL) {
- if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
- return (-1);
- }
- pe->pe_guid = pool_guid;
- pe->pe_next = pl->pools;
- pl->pools = pe;
- }
-
- /*
- * Second, see if we know about this toplevel vdev. Add it if its
- * missing.
- */
- for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
- if (ve->ve_guid == top_guid)
- break;
- }
-
- if (ve == NULL) {
- if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
- return (-1);
- }
- ve->ve_guid = top_guid;
- ve->ve_next = pe->pe_vdevs;
- pe->pe_vdevs = ve;
- }
-
- /*
- * Third, see if we have a config with a matching transaction group. If
- * so, then we do nothing. Otherwise, add it to the list of known
- * configs.
- */
- for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
- if (ce->ce_txg == txg)
- break;
- }
-
- if (ce == NULL) {
- if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
- return (-1);
- }
- ce->ce_txg = txg;
- ce->ce_config = fnvlist_dup(config);
- ce->ce_next = ve->ve_configs;
- ve->ve_configs = ce;
- }
-
- /*
- * At this point we've successfully added our config to the list of
- * known configs. The last thing to do is add the vdev guid -> path
- * mappings so that we can fix up the configuration as necessary before
- * doing the import.
- */
- if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
- return (-1);
-
- if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
- free(ne);
- return (-1);
- }
-
- ne->ne_guid = vdev_guid;
- ne->ne_order = order;
- ne->ne_num_labels = num_labels;
- ne->ne_next = pl->names;
- pl->names = ne;
-
- return (0);
-}
+#include <libzfs_impl.h>
+#include <libzutil.h>
/*
* Returns true if the named pool matches the given GUID.
@@ -909,446 +106,25 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
return (nvl);
}
-/*
- * Determine if the vdev id is a hole in the namespace.
- */
-boolean_t
-vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
-{
- int c;
-
- for (c = 0; c < holes; c++) {
-
- /* Top-level is a hole */
- if (hole_array[c] == id)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Convert our list of pools into the definitive set of configurations. We
- * start by picking the best config for each toplevel vdev. Once that's done,
- * we assemble the toplevel vdevs into a full config for the pool. We make a
- * pass to fix up any incorrect paths, and then add it to the main list to
- * return to the user.
- */
static nvlist_t *
-get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
- nvlist_t *policy)
+refresh_config_libzfs(void *handle, nvlist_t *tryconfig)
{
- pool_entry_t *pe;
- vdev_entry_t *ve;
- config_entry_t *ce;
- nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
- nvlist_t **spares, **l2cache;
- uint_t i, nspares, nl2cache;
- boolean_t config_seen;
- uint64_t best_txg;
- char *name, *hostname = NULL;
- uint64_t guid;
- uint_t children = 0;
- nvlist_t **child = NULL;
- uint_t holes;
- uint64_t *hole_array, max_id;
- uint_t c;
- boolean_t isactive;
- uint64_t hostid;
- nvlist_t *nvl;
- boolean_t valid_top_config = B_FALSE;
-
- if (nvlist_alloc(&ret, 0, 0) != 0)
- goto nomem;
-
- for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
- uint64_t id, max_txg = 0;
-
- if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
- goto nomem;
- config_seen = B_FALSE;
-
- /*
- * Iterate over all toplevel vdevs. Grab the pool configuration
- * from the first one we find, and then go through the rest and
- * add them as necessary to the 'vdevs' member of the config.
- */
- for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
-
- /*
- * Determine the best configuration for this vdev by
- * selecting the config with the latest transaction
- * group.
- */
- best_txg = 0;
- for (ce = ve->ve_configs; ce != NULL;
- ce = ce->ce_next) {
-
- if (ce->ce_txg > best_txg) {
- tmp = ce->ce_config;
- best_txg = ce->ce_txg;
- }
- }
-
- /*
- * We rely on the fact that the max txg for the
- * pool will contain the most up-to-date information
- * about the valid top-levels in the vdev namespace.
- */
- if (best_txg > max_txg) {
- (void) nvlist_remove(config,
- ZPOOL_CONFIG_VDEV_CHILDREN,
- DATA_TYPE_UINT64);
- (void) nvlist_remove(config,
- ZPOOL_CONFIG_HOLE_ARRAY,
- DATA_TYPE_UINT64_ARRAY);
-
- max_txg = best_txg;
- hole_array = NULL;
- holes = 0;
- max_id = 0;
- valid_top_config = B_FALSE;
-
- if (nvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
- verify(nvlist_add_uint64(config,
- ZPOOL_CONFIG_VDEV_CHILDREN,
- max_id) == 0);
- valid_top_config = B_TRUE;
- }
-
- if (nvlist_lookup_uint64_array(tmp,
- ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
- &holes) == 0) {
- verify(nvlist_add_uint64_array(config,
- ZPOOL_CONFIG_HOLE_ARRAY,
- hole_array, holes) == 0);
- }
- }
-
- if (!config_seen) {
- /*
- * Copy the relevant pieces of data to the pool
- * configuration:
- *
- * version
- * pool guid
- * name
- * comment (if available)
- * pool state
- * hostid (if available)
- * hostname (if available)
- */
- uint64_t state, version;
- char *comment = NULL;
-
- version = fnvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_VERSION);
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_VERSION, version);
- guid = fnvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_POOL_GUID);
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_POOL_GUID, guid);
- name = fnvlist_lookup_string(tmp,
- ZPOOL_CONFIG_POOL_NAME);
- fnvlist_add_string(config,
- ZPOOL_CONFIG_POOL_NAME, name);
-
- if (nvlist_lookup_string(tmp,
- ZPOOL_CONFIG_COMMENT, &comment) == 0)
- fnvlist_add_string(config,
- ZPOOL_CONFIG_COMMENT, comment);
-
- state = fnvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_POOL_STATE);
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_POOL_STATE, state);
-
- hostid = 0;
- if (nvlist_lookup_uint64(tmp,
- ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
- fnvlist_add_uint64(config,
- ZPOOL_CONFIG_HOSTID, hostid);
- hostname = fnvlist_lookup_string(tmp,
- ZPOOL_CONFIG_HOSTNAME);
- fnvlist_add_string(config,
- ZPOOL_CONFIG_HOSTNAME, hostname);
- }
-
- config_seen = B_TRUE;
- }
-
- /*
- * Add this top-level vdev to the child array.
- */
- verify(nvlist_lookup_nvlist(tmp,
- ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
- verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
- &id) == 0);
-
- if (id >= children) {
- nvlist_t **newchild;
-
- newchild = zfs_alloc(hdl, (id + 1) *
- sizeof (nvlist_t *));
- if (newchild == NULL)
- goto nomem;
-
- for (c = 0; c < children; c++)
- newchild[c] = child[c];
-
- free(child);
- child = newchild;
- children = id + 1;
- }
- if (nvlist_dup(nvtop, &child[id], 0) != 0)
- goto nomem;
-
- }
-
- /*
- * If we have information about all the top-levels then
- * clean up the nvlist which we've constructed. This
- * means removing any extraneous devices that are
- * beyond the valid range or adding devices to the end
- * of our array which appear to be missing.
- */
- if (valid_top_config) {
- if (max_id < children) {
- for (c = max_id; c < children; c++)
- nvlist_free(child[c]);
- children = max_id;
- } else if (max_id > children) {
- nvlist_t **newchild;
-
- newchild = zfs_alloc(hdl, (max_id) *
- sizeof (nvlist_t *));
- if (newchild == NULL)
- goto nomem;
-
- for (c = 0; c < children; c++)
- newchild[c] = child[c];
-
- free(child);
- child = newchild;
- children = max_id;
- }
- }
-
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &guid) == 0);
-
- /*
- * The vdev namespace may contain holes as a result of
- * device removal. We must add them back into the vdev
- * tree before we process any missing devices.
- */
- if (holes > 0) {
- ASSERT(valid_top_config);
-
- for (c = 0; c < children; c++) {
- nvlist_t *holey;
-
- if (child[c] != NULL ||
- !vdev_is_hole(hole_array, holes, c))
- continue;
-
- if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
- 0) != 0)
- goto nomem;
-
- /*
- * Holes in the namespace are treated as
- * "hole" top-level vdevs and have a
- * special flag set on them.
- */
- if (nvlist_add_string(holey,
- ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_HOLE) != 0 ||
- nvlist_add_uint64(holey,
- ZPOOL_CONFIG_ID, c) != 0 ||
- nvlist_add_uint64(holey,
- ZPOOL_CONFIG_GUID, 0ULL) != 0) {
- nvlist_free(holey);
- goto nomem;
- }
- child[c] = holey;
- }
- }
-
- /*
- * Look for any missing top-level vdevs. If this is the case,
- * create a faked up 'missing' vdev as a placeholder. We cannot
- * simply compress the child array, because the kernel performs
- * certain checks to make sure the vdev IDs match their location
- * in the configuration.
- */
- for (c = 0; c < children; c++) {
- if (child[c] == NULL) {
- nvlist_t *missing;
- if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
- 0) != 0)
- goto nomem;
- if (nvlist_add_string(missing,
- ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_MISSING) != 0 ||
- nvlist_add_uint64(missing,
- ZPOOL_CONFIG_ID, c) != 0 ||
- nvlist_add_uint64(missing,
- ZPOOL_CONFIG_GUID, 0ULL) != 0) {
- nvlist_free(missing);
- goto nomem;
- }
- child[c] = missing;
- }
- }
-
- /*
- * Put all of this pool's top-level vdevs into a root vdev.
- */
- if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
- goto nomem;
- if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
- VDEV_TYPE_ROOT) != 0 ||
- nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
- nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
- nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- child, children) != 0) {
- nvlist_free(nvroot);
- goto nomem;
- }
-
- for (c = 0; c < children; c++)
- nvlist_free(child[c]);
- free(child);
- children = 0;
- child = NULL;
-
- /*
- * Go through and fix up any paths and/or devids based on our
- * known list of vdev GUID -> path mappings.
- */
- if (fix_paths(nvroot, pl->names) != 0) {
- nvlist_free(nvroot);
- goto nomem;
- }
-
- /*
- * Add the root vdev to this pool's configuration.
- */
- if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- nvroot) != 0) {
- nvlist_free(nvroot);
- goto nomem;
- }
- nvlist_free(nvroot);
-
- /*
- * zdb uses this path to report on active pools that were
- * imported or created using -R.
- */
- if (active_ok)
- goto add_pool;
-
- /*
- * Determine if this pool is currently active, in which case we
- * can't actually import it.
- */
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
- verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &guid) == 0);
-
- if (pool_active(hdl, name, guid, &isactive) != 0)
- goto error;
-
- if (isactive) {
- nvlist_free(config);
- config = NULL;
- continue;
- }
-
- if (policy != NULL) {
- if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
- policy) != 0)
- goto nomem;
- }
-
- if ((nvl = refresh_config(hdl, config)) == NULL) {
- nvlist_free(config);
- config = NULL;
- continue;
- }
-
- nvlist_free(config);
- config = nvl;
-
- /*
- * Go through and update the paths for spares, now that we have
- * them.
- */
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- if (fix_paths(spares[i], pl->names) != 0)
- goto nomem;
- }
- }
-
- /*
- * Update the paths for l2cache devices.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache) == 0) {
- for (i = 0; i < nl2cache; i++) {
- if (fix_paths(l2cache[i], pl->names) != 0)
- goto nomem;
- }
- }
-
- /*
- * Restore the original information read from the actual label.
- */
- (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
- DATA_TYPE_UINT64);
- (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
- DATA_TYPE_STRING);
- if (hostid != 0) {
- verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
- hostid) == 0);
- verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
- hostname) == 0);
- }
-
-add_pool:
- /*
- * Add this pool to the list of configs.
- */
- verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
- &name) == 0);
- if (nvlist_add_nvlist(ret, name, config) != 0)
- goto nomem;
-
- nvlist_free(config);
- config = NULL;
- }
-
- return (ret);
+ return (refresh_config((libzfs_handle_t *)handle, tryconfig));
+}
-nomem:
- (void) no_memory(hdl);
-error:
- nvlist_free(config);
- nvlist_free(ret);
- for (c = 0; c < children; c++)
- nvlist_free(child[c]);
- free(child);
- return (NULL);
+static int
+pool_active_libzfs(void *handle, const char *name, uint64_t guid,
+ boolean_t *isactive)
+{
+ return (pool_active((libzfs_handle_t *)handle, name, guid, isactive));
}
+const pool_config_ops_t libzfs_config_ops = {
+ .pco_refresh_config = refresh_config_libzfs,
+ .pco_pool_active = pool_active_libzfs,
+};
+
/*
* Return the offset of the given label.
*/
@@ -1361,346 +137,6 @@ label_offset(uint64_t size, int l)
}
/*
- * Given a file descriptor, read the label information and return an nvlist
- * describing the configuration, if there is one. The number of valid
- * labels found will be returned in num_labels when non-NULL.
- */
-int
-zpool_read_label(int fd, nvlist_t **config, int *num_labels)
-{
- struct stat64 statbuf;
- int l, count = 0;
- vdev_label_t *label;
- nvlist_t *expected_config = NULL;
- uint64_t expected_guid = 0, size;
- int error;
-
- *config = NULL;
-
- if (fstat64_blk(fd, &statbuf) == -1)
- return (0);
- size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
-
- error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
- if (error)
- return (-1);
-
- for (l = 0; l < VDEV_LABELS; l++) {
- uint64_t state, guid, txg;
-
- if (pread64(fd, label, sizeof (vdev_label_t),
- label_offset(size, l)) != sizeof (vdev_label_t))
- continue;
-
- if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
- sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
- continue;
-
- if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
- &guid) != 0 || guid == 0) {
- nvlist_free(*config);
- continue;
- }
-
- if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 || state > POOL_STATE_L2CACHE) {
- nvlist_free(*config);
- continue;
- }
-
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
- (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0 || txg == 0)) {
- nvlist_free(*config);
- continue;
- }
-
- if (expected_guid) {
- if (expected_guid == guid)
- count++;
-
- nvlist_free(*config);
- } else {
- expected_config = *config;
- expected_guid = guid;
- count++;
- }
- }
-
- if (num_labels != NULL)
- *num_labels = count;
-
- free(label);
- *config = expected_config;
-
- return (0);
-}
-
-typedef struct rdsk_node {
- char *rn_name; /* Full path to device */
- int rn_order; /* Preferred order (low to high) */
- int rn_num_labels; /* Number of valid labels */
- uint64_t rn_vdev_guid; /* Expected vdev guid when set */
- libzfs_handle_t *rn_hdl;
- nvlist_t *rn_config; /* Label config */
- avl_tree_t *rn_avl;
- avl_node_t rn_node;
- pthread_mutex_t *rn_lock;
- boolean_t rn_labelpaths;
-} rdsk_node_t;
-
-/*
- * Sorted by vdev guid and full path to allow for multiple entries with
- * the same full path name. This is required because it's possible to
- * have multiple block devices with labels that refer to the same
- * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
- * entries need to be added to the cache. Scenarios where this can occur
- * include overwritten pool labels, devices which are visible from multiple
- * hosts and multipath devices.
- */
-static int
-slice_cache_compare(const void *arg1, const void *arg2)
-{
- const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
- const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
- uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
- uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
- int rv;
-
- rv = AVL_CMP(guid1, guid2);
- if (rv)
- return (rv);
-
- return (AVL_ISIGN(strcmp(nm1, nm2)));
-}
-
-static boolean_t
-is_watchdog_dev(char *dev)
-{
- /* For 'watchdog' dev */
- if (strcmp(dev, "watchdog") == 0)
- return (B_TRUE);
-
- /* For 'watchdog<digit><whatever> */
- if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-static int
-label_paths_impl(libzfs_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
- uint64_t vdev_guid, char **path, char **devid)
-{
- nvlist_t **child;
- uint_t c, children;
- uint64_t guid;
- char *val;
- int error;
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++) {
- error = label_paths_impl(hdl, child[c],
- pool_guid, vdev_guid, path, devid);
- if (error)
- return (error);
- }
- return (0);
- }
-
- if (nvroot == NULL)
- return (0);
-
- error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
- if ((error != 0) || (guid != vdev_guid))
- return (0);
-
- error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
- if (error == 0)
- *path = val;
-
- error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
- if (error == 0)
- *devid = val;
-
- return (0);
-}
-
-/*
- * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
- * and store these strings as config_path and devid_path respectively.
- * The returned pointers are only valid as long as label remains valid.
- */
-static int
-label_paths(libzfs_handle_t *hdl, nvlist_t *label, char **path, char **devid)
-{
- nvlist_t *nvroot;
- uint64_t pool_guid;
- uint64_t vdev_guid;
-
- *path = NULL;
- *devid = NULL;
-
- if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid))
- return (ENOENT);
-
- return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
- devid));
-}
-
-static void
-zpool_open_func(void *arg)
-{
- rdsk_node_t *rn = arg;
- libzfs_handle_t *hdl = rn->rn_hdl;
- struct stat64 statbuf;
- nvlist_t *config;
- char *bname, *dupname;
- uint64_t vdev_guid = 0;
- int error;
- int num_labels;
- int fd;
-
- /*
- * Skip devices with well known prefixes there can be side effects
- * when opening devices which need to be avoided.
- *
- * hpet - High Precision Event Timer
- * watchdog - Watchdog must be closed in a special way.
- */
- dupname = zfs_strdup(hdl, rn->rn_name);
- bname = basename(dupname);
- error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
- free(dupname);
- if (error)
- return;
-
- /*
- * Ignore failed stats. We only want regular files and block devices.
- */
- if (stat64(rn->rn_name, &statbuf) != 0 ||
- (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
- return;
-
- /*
- * Preferentially open using O_DIRECT to bypass the block device
- * cache which may be stale for multipath devices. An EINVAL errno
- * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
- */
- fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
- if ((fd < 0) && (errno == EINVAL))
- fd = open(rn->rn_name, O_RDONLY);
-
- if (fd < 0)
- return;
-
- /*
- * This file is too small to hold a zpool
- */
- if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
- (void) close(fd);
- return;
- }
-
- error = zpool_read_label(fd, &config, &num_labels);
- if (error != 0) {
- (void) close(fd);
- return;
- }
-
- if (num_labels == 0) {
- (void) close(fd);
- nvlist_free(config);
- return;
- }
-
- /*
- * Check that the vdev is for the expected guid. Additional entries
- * are speculatively added based on the paths stored in the labels.
- * Entries with valid paths but incorrect guids must be removed.
- */
- error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
- if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
- (void) close(fd);
- nvlist_free(config);
- return;
- }
-
- (void) close(fd);
-
- rn->rn_config = config;
- rn->rn_num_labels = num_labels;
-
- /*
- * Add additional entries for paths described by this label.
- */
- if (rn->rn_labelpaths) {
- char *path = NULL;
- char *devid = NULL;
- rdsk_node_t *slice;
- avl_index_t where;
- int error;
-
- if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
- return;
-
- /*
- * Allow devlinks to stabilize so all paths are available.
- */
- zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);
-
- if (path != NULL) {
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- slice->rn_name = zfs_strdup(hdl, path);
- slice->rn_vdev_guid = vdev_guid;
- slice->rn_avl = rn->rn_avl;
- slice->rn_hdl = hdl;
- slice->rn_order = IMPORT_ORDER_PREFERRED_1;
- slice->rn_labelpaths = B_FALSE;
- pthread_mutex_lock(rn->rn_lock);
- if (avl_find(rn->rn_avl, slice, &where)) {
- pthread_mutex_unlock(rn->rn_lock);
- free(slice->rn_name);
- free(slice);
- } else {
- avl_insert(rn->rn_avl, slice, where);
- pthread_mutex_unlock(rn->rn_lock);
- zpool_open_func(slice);
- }
- }
-
- if (devid != NULL) {
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- error = asprintf(&slice->rn_name, "%s%s",
- DEV_BYID_PATH, devid);
- if (error == -1) {
- free(slice);
- return;
- }
-
- slice->rn_vdev_guid = vdev_guid;
- slice->rn_avl = rn->rn_avl;
- slice->rn_hdl = hdl;
- slice->rn_order = IMPORT_ORDER_PREFERRED_2;
- slice->rn_labelpaths = B_FALSE;
- pthread_mutex_lock(rn->rn_lock);
- if (avl_find(rn->rn_avl, slice, &where)) {
- pthread_mutex_unlock(rn->rn_lock);
- free(slice->rn_name);
- free(slice);
- } else {
- avl_insert(rn->rn_avl, slice, where);
- pthread_mutex_unlock(rn->rn_lock);
- zpool_open_func(slice);
- }
- }
- }
-}
-
-/*
* Given a file descriptor, clear (zero) the label information. This function
* is used in the appliance stack as part of the ZFS sysevent module and
* to implement the "zpool labelclear" command.
@@ -1732,668 +168,7 @@ zpool_clear_label(int fd)
return (0);
}
-static void
-zpool_find_import_scan_add_slice(libzfs_handle_t *hdl, pthread_mutex_t *lock,
- avl_tree_t *cache, char *path, const char *name, int order)
-{
- avl_index_t where;
- rdsk_node_t *slice;
-
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
- free(slice);
- return;
- }
- slice->rn_vdev_guid = 0;
- slice->rn_lock = lock;
- slice->rn_avl = cache;
- slice->rn_hdl = hdl;
- slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
- slice->rn_labelpaths = B_FALSE;
-
- pthread_mutex_lock(lock);
- if (avl_find(cache, slice, &where)) {
- free(slice->rn_name);
- free(slice);
- } else {
- avl_insert(cache, slice, where);
- }
- pthread_mutex_unlock(lock);
-}
-
-static int
-zpool_find_import_scan_dir(libzfs_handle_t *hdl, pthread_mutex_t *lock,
- avl_tree_t *cache, char *dir, int order)
-{
- int error;
- char path[MAXPATHLEN];
- struct dirent64 *dp;
- DIR *dirp;
-
- if (realpath(dir, path) == NULL) {
- error = errno;
- if (error == ENOENT)
- return (0);
-
- zfs_error_aux(hdl, strerror(error));
- (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
- TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
- return (error);
- }
-
- dirp = opendir(path);
- if (dirp == NULL) {
- error = errno;
- zfs_error_aux(hdl, strerror(error));
- (void) zfs_error_fmt(hdl, EZFS_BADPATH,
- dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
- return (error);
- }
-
- while ((dp = readdir64(dirp)) != NULL) {
- const char *name = dp->d_name;
- if (name[0] == '.' &&
- (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
- continue;
-
- zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
- order);
- }
-
- (void) closedir(dirp);
- return (0);
-}
-
-static int
-zpool_find_import_scan_path(libzfs_handle_t *hdl, pthread_mutex_t *lock,
- avl_tree_t *cache, char *dir, int order)
-{
- int error = 0;
- char path[MAXPATHLEN];
- char *d, *b;
- char *dpath, *name;
-
- /*
- * Seperate the directory part and last part of the
- * path. We do this so that we can get the realpath of
- * the directory. We don't get the realpath on the
- * whole path because if it's a symlink, we want the
- * path of the symlink not where it points to.
- */
- d = zfs_strdup(hdl, dir);
- b = zfs_strdup(hdl, dir);
- dpath = dirname(d);
- name = basename(b);
-
- if (realpath(dpath, path) == NULL) {
- error = errno;
- if (error == ENOENT) {
- error = 0;
- goto out;
- }
-
- zfs_error_aux(hdl, strerror(error));
- (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
- TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
- goto out;
- }
-
- zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
-
-out:
- free(b);
- free(d);
- return (error);
-}
-
-/*
- * Scan a list of directories for zfs devices.
- */
-static int
-zpool_find_import_scan(libzfs_handle_t *hdl, pthread_mutex_t *lock,
- avl_tree_t **slice_cache, char **dir, int dirs)
-{
- avl_tree_t *cache;
- rdsk_node_t *slice;
- void *cookie;
- int i, error;
-
- *slice_cache = NULL;
- cache = zfs_alloc(hdl, sizeof (avl_tree_t));
- avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
- offsetof(rdsk_node_t, rn_node));
-
- for (i = 0; i < dirs; i++) {
- struct stat sbuf;
-
- if (stat(dir[i], &sbuf) != 0) {
- error = errno;
- if (error == ENOENT)
- continue;
-
- zfs_error_aux(hdl, strerror(error));
- (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
- TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
- goto error;
- }
-
- /*
- * If dir[i] is a directory, we walk through it and add all
- * the entry to the cache. If it's not a directory, we just
- * add it to the cache.
- */
- if (S_ISDIR(sbuf.st_mode)) {
- if ((error = zpool_find_import_scan_dir(hdl, lock,
- cache, dir[i], i)) != 0)
- goto error;
- } else {
- if ((error = zpool_find_import_scan_path(hdl, lock,
- cache, dir[i], i)) != 0)
- goto error;
- }
- }
-
- *slice_cache = cache;
- return (0);
-
-error:
- cookie = NULL;
- while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
- free(slice->rn_name);
- free(slice);
- }
- free(cache);
-
- return (error);
-}
-
-/*
- * Use libblkid to quickly enumerate all known zfs devices.
- */
-static int
-zpool_find_import_blkid(libzfs_handle_t *hdl, pthread_mutex_t *lock,
- avl_tree_t **slice_cache)
-{
- rdsk_node_t *slice;
- blkid_cache cache;
- blkid_dev_iterate iter;
- blkid_dev dev;
- avl_index_t where;
- int error;
-
- *slice_cache = NULL;
-
- error = blkid_get_cache(&cache, NULL);
- if (error != 0)
- return (error);
-
- error = blkid_probe_all_new(cache);
- if (error != 0) {
- blkid_put_cache(cache);
- return (error);
- }
-
- iter = blkid_dev_iterate_begin(cache);
- if (iter == NULL) {
- blkid_put_cache(cache);
- return (EINVAL);
- }
-
- error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
- if (error != 0) {
- blkid_dev_iterate_end(iter);
- blkid_put_cache(cache);
- return (error);
- }
-
- *slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
- avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
- offsetof(rdsk_node_t, rn_node));
-
- while (blkid_dev_next(iter, &dev) == 0) {
- slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
- slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
- slice->rn_vdev_guid = 0;
- slice->rn_lock = lock;
- slice->rn_avl = *slice_cache;
- slice->rn_hdl = hdl;
- slice->rn_labelpaths = B_TRUE;
-
- error = zfs_path_order(slice->rn_name, &slice->rn_order);
- if (error == 0)
- slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
- else
- slice->rn_order = IMPORT_ORDER_DEFAULT;
-
- pthread_mutex_lock(lock);
- if (avl_find(*slice_cache, slice, &where)) {
- free(slice->rn_name);
- free(slice);
- } else {
- avl_insert(*slice_cache, slice, where);
- }
- pthread_mutex_unlock(lock);
- }
-
- blkid_dev_iterate_end(iter);
- blkid_put_cache(cache);
-
- return (0);
-}
-
-char *
-zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
- "/dev/disk/by-vdev", /* Custom rules, use first if they exist */
- "/dev/mapper", /* Use multipath devices before components */
- "/dev/disk/by-partlabel", /* Single unique entry set by user */
- "/dev/disk/by-partuuid", /* Generated partition uuid */
- "/dev/disk/by-label", /* Custom persistent labels */
- "/dev/disk/by-uuid", /* Single unique entry and persistent */
- "/dev/disk/by-id", /* May be multiple entries and persistent */
- "/dev/disk/by-path", /* Encodes physical location and persistent */
- "/dev" /* UNSAFE device names will change */
-};
-
-/*
- * Given a list of directories to search, find all pools stored on disk. This
- * includes partial pools which are not available to import. If no args are
- * given (argc is 0), then the default directory (/dev/dsk) is searched.
- * poolname or guid (but not both) are provided by the caller when trying
- * to import a specific pool.
- */
-static nvlist_t *
-zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
-{
- nvlist_t *ret = NULL;
- pool_list_t pools = { 0 };
- pool_entry_t *pe, *penext;
- vdev_entry_t *ve, *venext;
- config_entry_t *ce, *cenext;
- name_entry_t *ne, *nenext;
- pthread_mutex_t lock;
- avl_tree_t *cache;
- rdsk_node_t *slice;
- void *cookie;
- tpool_t *t;
-
- verify(iarg->poolname == NULL || iarg->guid == 0);
- pthread_mutex_init(&lock, NULL);
-
- /*
- * Locate pool member vdevs using libblkid or by directory scanning.
- * On success a newly allocated AVL tree which is populated with an
- * entry for each discovered vdev will be returned as the cache.
- * It's the callers responsibility to consume and destroy this tree.
- */
- if (iarg->scan || iarg->paths != 0) {
- int dirs = iarg->paths;
- char **dir = iarg->path;
-
- if (dirs == 0) {
- dir = zpool_default_import_path;
- dirs = DEFAULT_IMPORT_PATH_SIZE;
- }
-
- if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0)
- return (NULL);
- } else {
- if (zpool_find_import_blkid(hdl, &lock, &cache) != 0)
- return (NULL);
- }
-
- /*
- * Create a thread pool to parallelize the process of reading and
- * validating labels, a large number of threads can be used due to
- * minimal contention.
- */
- t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
- for (slice = avl_first(cache); slice;
- (slice = avl_walk(cache, slice, AVL_AFTER)))
- (void) tpool_dispatch(t, zpool_open_func, slice);
-
- tpool_wait(t);
- tpool_destroy(t);
-
- /*
- * Process the cache filtering out any entries which are not
- * for the specificed pool then adding matching label configs.
- */
- cookie = NULL;
- while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
- if (slice->rn_config != NULL) {
- nvlist_t *config = slice->rn_config;
- boolean_t matched = B_TRUE;
- boolean_t aux = B_FALSE;
- int fd;
-
- /*
- * Check if it's a spare or l2cache device. If it is,
- * we need to skip the name and guid check since they
- * don't exist on aux device label.
- */
- if (iarg->poolname != NULL || iarg->guid != 0) {
- uint64_t state;
- aux = nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
- (state == POOL_STATE_SPARE ||
- state == POOL_STATE_L2CACHE);
- }
-
- if (iarg->poolname != NULL && !aux) {
- char *pname;
-
- matched = nvlist_lookup_string(config,
- ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
- strcmp(iarg->poolname, pname) == 0;
- } else if (iarg->guid != 0 && !aux) {
- uint64_t this_guid;
-
- matched = nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
- iarg->guid == this_guid;
- }
- if (matched) {
- /*
- * Verify all remaining entries can be opened
- * exclusively. This will prune all underlying
- * multipath devices which otherwise could
- * result in the vdev appearing as UNAVAIL.
- *
- * Under zdb, this step isn't required and
- * would prevent a zdb -e of active pools with
- * no cachefile.
- */
- fd = open(slice->rn_name, O_RDONLY | O_EXCL);
- if (fd >= 0 || iarg->can_be_active) {
- if (fd >= 0)
- close(fd);
- add_config(hdl, &pools,
- slice->rn_name, slice->rn_order,
- slice->rn_num_labels, config);
- }
- }
- nvlist_free(config);
- }
- free(slice->rn_name);
- free(slice);
- }
- avl_destroy(cache);
- free(cache);
- pthread_mutex_destroy(&lock);
-
- ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
-
- for (pe = pools.pools; pe != NULL; pe = penext) {
- penext = pe->pe_next;
- for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
- venext = ve->ve_next;
- for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
- cenext = ce->ce_next;
- nvlist_free(ce->ce_config);
- free(ce);
- }
- free(ve);
- }
- free(pe);
- }
-
- for (ne = pools.names; ne != NULL; ne = nenext) {
- nenext = ne->ne_next;
- free(ne->ne_name);
- free(ne);
- }
-
- return (ret);
-}
-
-nvlist_t *
-zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
-{
- importargs_t iarg = { 0 };
-
- iarg.paths = argc;
- iarg.path = argv;
-
- return (zpool_find_import_impl(hdl, &iarg));
-}
-
-/*
- * Given a cache file, return the contents as a list of importable pools.
- * poolname or guid (but not both) are provided by the caller when trying
- * to import a specific pool.
- */
-nvlist_t *
-zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
- char *poolname, uint64_t guid)
-{
- char *buf;
- int fd;
- struct stat64 statbuf;
- nvlist_t *raw, *src, *dst;
- nvlist_t *pools;
- nvpair_t *elem;
- char *name;
- uint64_t this_guid;
- boolean_t active;
-
- verify(poolname == NULL || guid == 0);
-
- if ((fd = open(cachefile, O_RDONLY)) < 0) {
- zfs_error_aux(hdl, "%s", strerror(errno));
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN, "failed to open cache file"));
- return (NULL);
- }
-
- if (fstat64(fd, &statbuf) != 0) {
- zfs_error_aux(hdl, "%s", strerror(errno));
- (void) close(fd);
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
- return (NULL);
- }
-
- if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
- (void) close(fd);
- return (NULL);
- }
-
- if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
- (void) close(fd);
- free(buf);
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN,
- "failed to read cache file contents"));
- return (NULL);
- }
-
- (void) close(fd);
-
- if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
- free(buf);
- (void) zfs_error(hdl, EZFS_BADCACHE,
- dgettext(TEXT_DOMAIN,
- "invalid or corrupt cache file contents"));
- return (NULL);
- }
-
- free(buf);
-
- /*
- * Go through and get the current state of the pools and refresh their
- * state.
- */
- if (nvlist_alloc(&pools, 0, 0) != 0) {
- (void) no_memory(hdl);
- nvlist_free(raw);
- return (NULL);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
- src = fnvpair_value_nvlist(elem);
-
- name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
- if (poolname != NULL && strcmp(poolname, name) != 0)
- continue;
-
- this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
- if (guid != 0 && guid != this_guid)
- continue;
-
- if (pool_active(hdl, name, this_guid, &active) != 0) {
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
-
- if (active)
- continue;
-
- if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
- cachefile) != 0) {
- (void) no_memory(hdl);
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
-
- if ((dst = refresh_config(hdl, src)) == NULL) {
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
-
- if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
- (void) no_memory(hdl);
- nvlist_free(dst);
- nvlist_free(raw);
- nvlist_free(pools);
- return (NULL);
- }
- nvlist_free(dst);
- }
-
- nvlist_free(raw);
- return (pools);
-}
-
-static int
-name_or_guid_exists(zpool_handle_t *zhp, void *data)
-{
- importargs_t *import = data;
- int found = 0;
-
- if (import->poolname != NULL) {
- char *pool_name;
-
- verify(nvlist_lookup_string(zhp->zpool_config,
- ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
- if (strcmp(pool_name, import->poolname) == 0)
- found = 1;
- } else {
- uint64_t pool_guid;
-
- verify(nvlist_lookup_uint64(zhp->zpool_config,
- ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
- if (pool_guid == import->guid)
- found = 1;
- }
-
- zpool_close(zhp);
- return (found);
-}
-
-nvlist_t *
-zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
-{
- verify(import->poolname == NULL || import->guid == 0);
-
- if (import->unique)
- import->exists = zpool_iter(hdl, name_or_guid_exists, import);
-
- if (import->cachefile != NULL)
- return (zpool_find_import_cached(hdl, import->cachefile,
- import->poolname, import->guid));
-
- return (zpool_find_import_impl(hdl, import));
-}
-
static boolean_t
-pool_match(nvlist_t *cfg, char *tgt)
-{
- uint64_t v, guid = strtoull(tgt, NULL, 0);
- char *s;
-
- if (guid != 0) {
- if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
- return (v == guid);
- } else {
- if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
- return (strcmp(s, tgt) == 0);
- }
- return (B_FALSE);
-}
-
-int
-zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp,
- importargs_t *args)
-{
- nvlist_t *pools;
- nvlist_t *match = NULL;
- nvlist_t *config = NULL;
- char *name = NULL, *sepp = NULL;
- char sep = '\0';
- int count = 0;
- char *targetdup = strdup(target);
-
- *configp = NULL;
-
- if ((sepp = strpbrk(targetdup, "/@")) != NULL) {
- sep = *sepp;
- *sepp = '\0';
- }
-
- pools = zpool_search_import(hdl, args);
-
- if (pools != NULL) {
- nvpair_t *elem = NULL;
- while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
- VERIFY0(nvpair_value_nvlist(elem, &config));
- if (pool_match(config, targetdup)) {
- count++;
- if (match != NULL) {
- /* multiple matches found */
- continue;
- } else {
- match = config;
- name = nvpair_name(elem);
- }
- }
- }
- }
-
- if (count == 0) {
- (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "no pools found"));
- free(targetdup);
- return (ENOENT);
- }
-
- if (count > 1) {
- (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "%d pools found, use pool GUID\n"), count);
- free(targetdup);
- return (EINVAL);
- }
-
- *configp = match;
- free(targetdup);
-
- return (0);
-}
-
-boolean_t
find_guid(nvlist_t *nv, uint64_t guid)
{
uint64_t tmp;
diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c
index 73dc2c793..b1bdc4a6d 100644
--- a/lib/libzfs/libzfs_iter.c
+++ b/lib/libzfs/libzfs_iter.c
@@ -33,6 +33,7 @@
#include <stddef.h>
#include <libintl.h>
#include <libzfs.h>
+#include <libzutil.h>
#include <sys/mntent.h>
#include "libzfs_impl.h"
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index fca1a4178..128c6efe9 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -29,10 +29,8 @@
* Copyright (c) 2017, Intel Corporation.
*/
-#include <ctype.h>
#include <errno.h>
#include <devid.h>
-#include <fcntl.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
@@ -47,6 +45,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/vdev_disk.h>
#include <dlfcn.h>
+#include <libzutil.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -3697,80 +3696,6 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
}
#endif /* sun */
-/*
- * Remove partition suffix from a vdev path. Partition suffixes may take three
- * forms: "-partX", "pX", or "X", where X is a string of digits. The second
- * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The
- * third case only occurs when preceded by a string matching the regular
- * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
- *
- * caller must free the returned string
- */
-char *
-zfs_strip_partition(char *path)
-{
- char *tmp = strdup(path);
- char *part = NULL, *d = NULL;
- if (!tmp)
- return (NULL);
-
- if ((part = strstr(tmp, "-part")) && part != tmp) {
- d = part + 5;
- } else if ((part = strrchr(tmp, 'p')) &&
- part > tmp + 1 && isdigit(*(part-1))) {
- d = part + 1;
- } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
- tmp[1] == 'd') {
- for (d = &tmp[2]; isalpha(*d); part = ++d) { }
- } else if (strncmp("xvd", tmp, 3) == 0) {
- for (d = &tmp[3]; isalpha(*d); part = ++d) { }
- }
- if (part && d && *d != '\0') {
- for (; isdigit(*d); d++) { }
- if (*d == '\0')
- *part = '\0';
- }
-
- return (tmp);
-}
-
-/*
- * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname
- *
- * path: /dev/sda1
- * returns: /dev/sda
- *
- * Returned string must be freed.
- */
-char *
-zfs_strip_partition_path(char *path)
-{
- char *newpath = strdup(path);
- char *sd_offset;
- char *new_sd;
-
- if (!newpath)
- return (NULL);
-
- /* Point to "sda1" part of "/dev/sda1" */
- sd_offset = strrchr(newpath, '/') + 1;
-
- /* Get our new name "sda" */
- new_sd = zfs_strip_partition(sd_offset);
- if (!new_sd) {
- free(newpath);
- return (NULL);
- }
-
- /* Paste the "sda" where "sda1" was */
- strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);
-
- /* Free temporary "sda" */
- free(new_sd);
-
- return (newpath);
-}
-
#define PATH_BUF_LEN 64
/*
@@ -4134,54 +4059,6 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
}
/*
- * Process the buffer of nvlists, unpacking and storing each nvlist record
- * into 'records'. 'leftover' is set to the number of bytes that weren't
- * processed as there wasn't a complete record.
- */
-int
-zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
- nvlist_t ***records, uint_t *numrecords)
-{
- uint64_t reclen;
- nvlist_t *nv;
- int i;
- void *tmp;
-
- while (bytes_read > sizeof (reclen)) {
-
- /* get length of packed record (stored as little endian) */
- for (i = 0, reclen = 0; i < sizeof (reclen); i++)
- reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
-
- if (bytes_read < sizeof (reclen) + reclen)
- break;
-
- /* unpack record */
- if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
- return (ENOMEM);
- bytes_read -= sizeof (reclen) + reclen;
- buf += sizeof (reclen) + reclen;
-
- /* add record to nvlist array */
- (*numrecords)++;
- if (ISP2(*numrecords + 1)) {
- tmp = realloc(*records,
- *numrecords * 2 * sizeof (nvlist_t *));
- if (tmp == NULL) {
- nvlist_free(nv);
- (*numrecords)--;
- return (ENOMEM);
- }
- *records = tmp;
- }
- (*records)[*numrecords - 1] = nv;
- }
-
- *leftover = bytes_read;
- return (0);
-}
-
-/*
* Retrieve the command history of a pool.
*/
int
@@ -4669,281 +4546,3 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
return (0);
}
-
-/*
- * Allocate and return the underlying device name for a device mapper device.
- * If a device mapper device maps to multiple devices, return the first device.
- *
- * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
- * DM device (like /dev/disk/by-vdev/A0) are also allowed.
- *
- * Returns device name, or NULL on error or no match. If dm_name is not a DM
- * device then return NULL.
- *
- * NOTE: The returned name string must be *freed*.
- */
-char *
-dm_get_underlying_path(char *dm_name)
-{
- DIR *dp = NULL;
- struct dirent *ep;
- char *realp;
- char *tmp = NULL;
- char *path = NULL;
- char *dev_str;
- int size;
-
- if (dm_name == NULL)
- return (NULL);
-
- /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
- realp = realpath(dm_name, NULL);
- if (realp == NULL)
- return (NULL);
-
- /*
- * If they preface 'dev' with a path (like "/dev") then strip it off.
- * We just want the 'dm-N' part.
- */
- tmp = strrchr(realp, '/');
- if (tmp != NULL)
- dev_str = tmp + 1; /* +1 since we want the chr after '/' */
- else
- dev_str = tmp;
-
- size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str);
- if (size == -1 || !tmp)
- goto end;
-
- dp = opendir(tmp);
- if (dp == NULL)
- goto end;
-
- /* Return first sd* entry in /sys/block/dm-N/slaves/ */
- while ((ep = readdir(dp))) {
- if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */
- size = asprintf(&path, "/dev/%s", ep->d_name);
- break;
- }
- }
-
-end:
- if (dp != NULL)
- closedir(dp);
- free(tmp);
- free(realp);
- return (path);
-}
-
-/*
- * Return 1 if device is a device mapper or multipath device.
- * Return 0 if not.
- */
-int
-zfs_dev_is_dm(char *dev_name)
-{
-
- char *tmp;
- tmp = dm_get_underlying_path(dev_name);
- if (tmp == NULL)
- return (0);
-
- free(tmp);
- return (1);
-}
-
-/*
- * By "whole disk" we mean an entire physical disk (something we can
- * label, toggle the write cache on, etc.) as opposed to the full
- * capacity of a pseudo-device such as lofi or did. We act as if we
- * are labeling the disk, which should be a pretty good test of whether
- * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
- * it isn't.
- */
-int
-zfs_dev_is_whole_disk(char *dev_name)
-{
- struct dk_gpt *label;
- int fd;
-
- if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0)
- return (0);
-
- if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
- (void) close(fd);
- return (0);
- }
-
- efi_free(label);
- (void) close(fd);
-
- return (1);
-}
-
-/*
- * Lookup the underlying device for a device name
- *
- * Often you'll have a symlink to a device, a partition device,
- * or a multipath device, and want to look up the underlying device.
- * This function returns the underlying device name. If the device
- * name is already the underlying device, then just return the same
- * name. If the device is a DM device with multiple underlying devices
- * then return the first one.
- *
- * For example:
- *
- * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
- * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
- * returns: /dev/sda
- *
- * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
- * dev_name: /dev/mapper/mpatha
- * returns: /dev/sda (first device)
- *
- * 3. /dev/sda (already the underlying device)
- * dev_name: /dev/sda
- * returns: /dev/sda
- *
- * 4. /dev/dm-3 (mapped to /dev/sda)
- * dev_name: /dev/dm-3
- * returns: /dev/sda
- *
- * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
- * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
- * returns: /dev/sdb
- *
- * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2
- * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
- * returns: /dev/sda
- *
- * Returns underlying device name, or NULL on error or no match.
- *
- * NOTE: The returned name string must be *freed*.
- */
-char *
-zfs_get_underlying_path(char *dev_name)
-{
- char *name = NULL;
- char *tmp;
-
- if (dev_name == NULL)
- return (NULL);
-
- tmp = dm_get_underlying_path(dev_name);
-
- /* dev_name not a DM device, so just un-symlinkize it */
- if (tmp == NULL)
- tmp = realpath(dev_name, NULL);
-
- if (tmp != NULL) {
- name = zfs_strip_partition_path(tmp);
- free(tmp);
- }
-
- return (name);
-}
-
-/*
- * Given a dev name like "sda", return the full enclosure sysfs path to
- * the disk. You can also pass in the name with "/dev" prepended
- * to it (like /dev/sda).
- *
- * For example, disk "sda" in enclosure slot 1:
- * dev: "sda"
- * returns: "/sys/class/enclosure/1:0:3:0/Slot 1"
- *
- * 'dev' must be a non-devicemapper device.
- *
- * Returned string must be freed.
- */
-char *
-zfs_get_enclosure_sysfs_path(char *dev_name)
-{
- DIR *dp = NULL;
- struct dirent *ep;
- char buf[MAXPATHLEN];
- char *tmp1 = NULL;
- char *tmp2 = NULL;
- char *tmp3 = NULL;
- char *path = NULL;
- size_t size;
- int tmpsize;
-
- if (dev_name == NULL)
- return (NULL);
-
- /* If they preface 'dev' with a path (like "/dev") then strip it off */
- tmp1 = strrchr(dev_name, '/');
- if (tmp1 != NULL)
- dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */
-
- tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name);
- if (tmpsize == -1 || tmp1 == NULL) {
- tmp1 = NULL;
- goto end;
- }
-
- dp = opendir(tmp1);
- if (dp == NULL) {
- tmp1 = NULL; /* To make free() at the end a NOP */
- goto end;
- }
-
- /*
- * Look though all sysfs entries in /sys/block/<dev>/device for
- * the enclosure symlink.
- */
- while ((ep = readdir(dp))) {
- /* Ignore everything that's not our enclosure_device link */
- if (strstr(ep->d_name, "enclosure_device") == NULL)
- continue;
-
- if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 ||
- tmp2 == NULL)
- break;
-
- size = readlink(tmp2, buf, sizeof (buf));
-
- /* Did readlink fail or crop the link name? */
- if (size == -1 || size >= sizeof (buf)) {
- free(tmp2);
- tmp2 = NULL; /* To make free() at the end a NOP */
- break;
- }
-
- /*
- * We got a valid link. readlink() doesn't terminate strings
- * so we have to do it.
- */
- buf[size] = '\0';
-
- /*
- * Our link will look like:
- *
- * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
- *
- * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
- */
- tmp3 = strstr(buf, "enclosure");
- if (tmp3 == NULL)
- break;
-
- if (asprintf(&path, "/sys/class/%s", tmp3) == -1) {
- /* If asprintf() fails, 'path' is undefined */
- path = NULL;
- break;
- }
-
- if (path == NULL)
- break;
- }
-
-end:
- free(tmp2);
- free(tmp1);
-
- if (dp != NULL)
- closedir(dp);
-
- return (path);
-}
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index b5c91ec20..4a620a9da 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -54,6 +54,7 @@
#include <libzfs.h>
#include <libzfs_core.h>
+#include <libzutil.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
index 4089f0cc6..e49d79baa 100644
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@@ -42,6 +42,7 @@
*/
#include <libzfs.h>
+#include <libzutil.h>
#include <string.h>
#include <unistd.h>
#include <sys/systeminfo.h>
@@ -425,68 +426,3 @@ zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata)
return (ret);
}
-
-static void
-dump_ddt_stat(const ddt_stat_t *dds, int h)
-{
- char refcnt[6];
- char blocks[6], lsize[6], psize[6], dsize[6];
- char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
-
- if (dds == NULL || dds->dds_blocks == 0)
- return;
-
- if (h == -1)
- (void) strcpy(refcnt, "Total");
- else
- zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt));
-
- zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks));
- zfs_nicebytes(dds->dds_lsize, lsize, sizeof (lsize));
- zfs_nicebytes(dds->dds_psize, psize, sizeof (psize));
- zfs_nicebytes(dds->dds_dsize, dsize, sizeof (dsize));
- zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks));
- zfs_nicebytes(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize));
- zfs_nicebytes(dds->dds_ref_psize, ref_psize, sizeof (ref_psize));
- zfs_nicebytes(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize));
-
- (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
- refcnt,
- blocks, lsize, psize, dsize,
- ref_blocks, ref_lsize, ref_psize, ref_dsize);
-}
-
-/*
- * Print the DDT histogram and the column totals.
- */
-void
-zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
-{
- int h;
-
- (void) printf("\n");
-
- (void) printf("bucket "
- " allocated "
- " referenced \n");
- (void) printf("______ "
- "______________________________ "
- "______________________________\n");
-
- (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
- "refcnt",
- "blocks", "LSIZE", "PSIZE", "DSIZE",
- "blocks", "LSIZE", "PSIZE", "DSIZE");
-
- (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
- "------",
- "------", "-----", "-----", "-----",
- "------", "-----", "-----", "-----");
-
- for (h = 0; h < 64; h++)
- dump_ddt_stat(&ddh->ddh_stat[h], h);
-
- dump_ddt_stat(dds_total, -1);
-
- (void) printf("\n");
-}
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 440ed3bc0..9f4fe3b72 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -39,7 +39,6 @@
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
-#include <ctype.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/mnttab.h>
@@ -54,6 +53,7 @@
#include "zfs_prop.h"
#include "zfeature_common.h"
#include <zfs_fletcher.h>
+#include <libzutil.h>
int
libzfs_errno(libzfs_handle_t *hdl)
@@ -677,135 +677,6 @@ zfs_strdup(libzfs_handle_t *hdl, const char *str)
return (ret);
}
-/*
- * Convert a number to an appropriately human-readable output.
- */
-void
-zfs_nicenum_format(uint64_t num, char *buf, size_t buflen,
- enum zfs_nicenum_format format)
-{
- uint64_t n = num;
- int index = 0;
- const char *u;
- const char *units[3][7] = {
- [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"},
- [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"},
- [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"}
- };
-
- const int units_len[] = {[ZFS_NICENUM_1024] = 6,
- [ZFS_NICENUM_BYTES] = 6,
- [ZFS_NICENUM_TIME] = 4};
-
- const int k_unit[] = { [ZFS_NICENUM_1024] = 1024,
- [ZFS_NICENUM_BYTES] = 1024,
- [ZFS_NICENUM_TIME] = 1000};
-
- double val;
-
- if (format == ZFS_NICENUM_RAW) {
- snprintf(buf, buflen, "%llu", (u_longlong_t)num);
- return;
- } else if (format == ZFS_NICENUM_RAWTIME && num > 0) {
- snprintf(buf, buflen, "%llu", (u_longlong_t)num);
- return;
- } else if (format == ZFS_NICENUM_RAWTIME && num == 0) {
- snprintf(buf, buflen, "%s", "-");
- return;
- }
-
- while (n >= k_unit[format] && index < units_len[format]) {
- n /= k_unit[format];
- index++;
- }
-
- u = units[format][index];
-
- /* Don't print zero latencies since they're invalid */
- if ((format == ZFS_NICENUM_TIME) && (num == 0)) {
- (void) snprintf(buf, buflen, "-");
- } else if ((index == 0) || ((num %
- (uint64_t)powl(k_unit[format], index)) == 0)) {
- /*
- * If this is an even multiple of the base, always display
- * without any decimal precision.
- */
- (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u);
-
- } else {
- /*
- * We want to choose a precision that reflects the best choice
- * for fitting in 5 characters. This can get rather tricky when
- * we have numbers that are very close to an order of magnitude.
- * For example, when displaying 10239 (which is really 9.999K),
- * we want only a single place of precision for 10.0K. We could
- * develop some complex heuristics for this, but it's much
- * easier just to try each combination in turn.
- */
- int i;
- for (i = 2; i >= 0; i--) {
- val = (double)num /
- (uint64_t)powl(k_unit[format], index);
-
- /*
- * Don't print floating point values for time. Note,
- * we use floor() instead of round() here, since
- * round can result in undesirable results. For
- * example, if "num" is in the range of
- * 999500-999999, it will print out "1000us". This
- * doesn't happen if we use floor().
- */
- if (format == ZFS_NICENUM_TIME) {
- if (snprintf(buf, buflen, "%d%s",
- (unsigned int) floor(val), u) <= 5)
- break;
-
- } else {
- if (snprintf(buf, buflen, "%.*f%s", i,
- val, u) <= 5)
- break;
- }
- }
- }
-}
-
-/*
- * Convert a number to an appropriately human-readable output.
- */
-void
-zfs_nicenum(uint64_t num, char *buf, size_t buflen)
-{
- zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024);
-}
-
-/*
- * Convert a time to an appropriately human-readable output.
- * @num: Time in nanoseconds
- */
-void
-zfs_nicetime(uint64_t num, char *buf, size_t buflen)
-{
- zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME);
-}
-
-/*
- * Print out a raw number with correct column spacing
- */
-void
-zfs_niceraw(uint64_t num, char *buf, size_t buflen)
-{
- zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW);
-}
-
-/*
- * Convert a number of bytes to an appropriately human-readable output.
- */
-void
-zfs_nicebytes(uint64_t num, char *buf, size_t buflen)
-{
- zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_BYTES);
-}
-
void
libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr)
{
@@ -1233,210 +1104,6 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
}
/*
- * Append partition suffix to an otherwise fully qualified device path.
- * This is used to generate the name the full path as its stored in
- * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length
- * of 'path' will be returned on error a negative value is returned.
- */
-int
-zfs_append_partition(char *path, size_t max_len)
-{
- int len = strlen(path);
-
- if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
- (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
- if (len + 6 >= max_len)
- return (-1);
-
- (void) strcat(path, "-part1");
- len += 6;
- } else {
- if (len + 2 >= max_len)
- return (-1);
-
- if (isdigit(path[len-1])) {
- (void) strcat(path, "p1");
- len += 2;
- } else {
- (void) strcat(path, "1");
- len += 1;
- }
- }
-
- return (len);
-}
-
-/*
- * Given a shorthand device name check if a file by that name exists in any
- * of the 'zpool_default_import_path' or ZPOOL_IMPORT_PATH directories. If
- * one is found, store its fully qualified path in the 'path' buffer passed
- * by the caller and return 0, otherwise return an error.
- */
-int
-zfs_resolve_shortname(const char *name, char *path, size_t len)
-{
- int i, error = -1;
- char *dir, *env, *envdup;
-
- env = getenv("ZPOOL_IMPORT_PATH");
- errno = ENOENT;
-
- if (env) {
- envdup = strdup(env);
- dir = strtok(envdup, ":");
- while (dir && error) {
- (void) snprintf(path, len, "%s/%s", dir, name);
- error = access(path, F_OK);
- dir = strtok(NULL, ":");
- }
- free(envdup);
- } else {
- for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE && error < 0; i++) {
- (void) snprintf(path, len, "%s/%s",
- zpool_default_import_path[i], name);
- error = access(path, F_OK);
- }
- }
-
- return (error ? ENOENT : 0);
-}
-
-/*
- * Given a shorthand device name look for a match against 'cmp_name'. This
- * is done by checking all prefix expansions using either the default
- * 'zpool_default_import_paths' or the ZPOOL_IMPORT_PATH environment
- * variable. Proper partition suffixes will be appended if this is a
- * whole disk. When a match is found 0 is returned otherwise ENOENT.
- */
-static int
-zfs_strcmp_shortname(char *name, char *cmp_name, int wholedisk)
-{
- int path_len, cmp_len, i = 0, error = ENOENT;
- char *dir, *env, *envdup = NULL;
- char path_name[MAXPATHLEN];
-
- cmp_len = strlen(cmp_name);
- env = getenv("ZPOOL_IMPORT_PATH");
-
- if (env) {
- envdup = strdup(env);
- dir = strtok(envdup, ":");
- } else {
- dir = zpool_default_import_path[i];
- }
-
- while (dir) {
- /* Trim trailing directory slashes from ZPOOL_IMPORT_PATH */
- while (dir[strlen(dir)-1] == '/')
- dir[strlen(dir)-1] = '\0';
-
- path_len = snprintf(path_name, MAXPATHLEN, "%s/%s", dir, name);
- if (wholedisk)
- path_len = zfs_append_partition(path_name, MAXPATHLEN);
-
- if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) {
- error = 0;
- break;
- }
-
- if (env) {
- dir = strtok(NULL, ":");
- } else if (++i < DEFAULT_IMPORT_PATH_SIZE) {
- dir = zpool_default_import_path[i];
- } else {
- dir = NULL;
- }
- }
-
- if (env)
- free(envdup);
-
- return (error);
-}
-
-/*
- * Given either a shorthand or fully qualified path name look for a match
- * against 'cmp'. The passed name will be expanded as needed for comparison
- * purposes and redundant slashes stripped to ensure an accurate match.
- */
-int
-zfs_strcmp_pathname(char *name, char *cmp, int wholedisk)
-{
- int path_len, cmp_len;
- char path_name[MAXPATHLEN];
- char cmp_name[MAXPATHLEN];
- char *dir, *dup;
-
- /* Strip redundant slashes if one exists due to ZPOOL_IMPORT_PATH */
- memset(cmp_name, 0, MAXPATHLEN);
- dup = strdup(cmp);
- dir = strtok(dup, "/");
- while (dir) {
- strlcat(cmp_name, "/", sizeof (cmp_name));
- strlcat(cmp_name, dir, sizeof (cmp_name));
- dir = strtok(NULL, "/");
- }
- free(dup);
-
- if (name[0] != '/')
- return (zfs_strcmp_shortname(name, cmp_name, wholedisk));
-
- (void) strlcpy(path_name, name, MAXPATHLEN);
- path_len = strlen(path_name);
- cmp_len = strlen(cmp_name);
-
- if (wholedisk) {
- path_len = zfs_append_partition(path_name, MAXPATHLEN);
- if (path_len == -1)
- return (ENOMEM);
- }
-
- if ((path_len != cmp_len) || strcmp(path_name, cmp_name))
- return (ENOENT);
-
- return (0);
-}
-
-/*
- * Given a full path to a device determine if that device appears in the
- * import search path. If it does return the first match and store the
- * index in the passed 'order' variable, otherwise return an error.
- */
-int
-zfs_path_order(char *name, int *order)
-{
- int i = 0, error = ENOENT;
- char *dir, *env, *envdup;
-
- env = getenv("ZPOOL_IMPORT_PATH");
- if (env) {
- envdup = strdup(env);
- dir = strtok(envdup, ":");
- while (dir) {
- if (strncmp(name, dir, strlen(dir)) == 0) {
- *order = i;
- error = 0;
- break;
- }
- dir = strtok(NULL, ":");
- i++;
- }
- free(envdup);
- } else {
- for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
- if (strncmp(name, zpool_default_import_path[i],
- strlen(zpool_default_import_path[i])) == 0) {
- *order = i;
- error = 0;
- break;
- }
- }
- }
-
- return (error);
-}
-
-/*
* Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
* an ioctl().
*/
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index e10f20dd9..efc44b27e 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -188,11 +188,10 @@ nodist_libzpool_la_SOURCES = \
$(LUA_C)
libzpool_la_LIBADD = \
- $(top_builddir)/lib/libavl/libavl.la \
$(top_builddir)/lib/libicp/libicp.la \
$(top_builddir)/lib/libnvpair/libnvpair.la \
- $(top_builddir)/lib/libspl/libspl.la \
- $(top_builddir)/lib/libunicode/libunicode.la
+ $(top_builddir)/lib/libunicode/libunicode.la \
+ $(top_builddir)/lib/libzutil/libzutil.la
libzpool_la_LIBADD += $(ZLIB) -ldl
libzpool_la_LDFLAGS = -pthread -version-info 2:0:0
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index 8a6f4c325..87772bcb7 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -34,112 +34,14 @@
#include <sys/spa.h>
#include <sys/fs/zfs.h>
#include <sys/refcount.h>
+#include <sys/zfs_ioctl.h>
#include <dlfcn.h>
+#include <libzutil.h>
/*
* Routines needed by more than one client of libzpool.
*/
-/* The largest suffix that can fit, aka an exabyte (2^60 / 10^18) */
-#define INDEX_MAX (6)
-
-/* Verify INDEX_MAX fits */
-CTASSERT_GLOBAL(INDEX_MAX * 10 < sizeof (uint64_t) * 8);
-
-void
-nicenum_scale(uint64_t n, size_t units, char *buf, size_t buflen,
- uint32_t flags)
-{
- uint64_t divamt = 1024;
- uint64_t divisor = 1;
- int index = 0;
- int rc = 0;
- char u;
-
- if (units == 0)
- units = 1;
-
- if (n > 0) {
- n *= units;
- if (n < units)
- goto overflow;
- }
-
- if (flags & NN_DIVISOR_1000)
- divamt = 1000;
-
- /*
- * This tries to find the suffix S(n) such that
- * S(n) <= n < S(n+1), where S(n) = 2^(n*10) | 10^(3*n)
- * (i.e. 1024/1000, 1,048,576/1,000,000, etc). Stop once S(n)
- * is the largest prefix supported (i.e. don't bother computing
- * and checking S(n+1). Since INDEX_MAX should be the largest
- * suffix that fits (currently an exabyte), S(INDEX_MAX + 1) is
- * never checked as it would overflow.
- */
- while (index < INDEX_MAX) {
- uint64_t newdiv = divisor * divamt;
-
- /* CTASSERT() guarantee these never trip */
- VERIFY3U(newdiv, >=, divamt);
- VERIFY3U(newdiv, >=, divisor);
-
- if (n < newdiv)
- break;
-
- divisor = newdiv;
- index++;
- }
-
- u = " KMGTPE"[index];
-
- if (index == 0) {
- rc = snprintf(buf, buflen, "%llu", (u_longlong_t)n);
- } else if (n % divisor == 0) {
- /*
- * If this is an even multiple of the base, always display
- * without any decimal precision.
- */
- rc = snprintf(buf, buflen, "%llu%c",
- (u_longlong_t)(n / divisor), u);
- } else {
- /*
- * We want to choose a precision that reflects the best choice
- * for fitting in 5 characters. This can get rather tricky
- * when we have numbers that are very close to an order of
- * magnitude. For example, when displaying 10239 (which is
- * really 9.999K), we want only a single place of precision
- * for 10.0K. We could develop some complex heuristics for
- * this, but it's much easier just to try each combination
- * in turn.
- */
- int i;
- for (i = 2; i >= 0; i--) {
- if ((rc = snprintf(buf, buflen, "%.*f%c", i,
- (double)n / divisor, u)) <= 5)
- break;
- }
- }
-
- if (rc + 1 > buflen || rc < 0)
- goto overflow;
-
- return;
-
-overflow:
- /* prefer a more verbose message if possible */
- if (buflen > 10)
- (void) strlcpy(buf, "<overflow>", buflen);
- else
- (void) strlcpy(buf, "??", buflen);
-}
-
-void
-nicenum(uint64_t num, char *buf, size_t buflen)
-{
- nicenum_scale(num, 1, buf, buflen, 0);
-}
-
static void
show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
{
@@ -300,3 +202,56 @@ set_global_var(char *arg)
return (0);
}
+
+static nvlist_t *
+refresh_config(void *unused, nvlist_t *tryconfig)
+{
+ return (spa_tryimport(tryconfig));
+}
+
+static int
+pool_active(void *unused, const char *name, uint64_t guid,
+ boolean_t *isactive)
+{
+ zfs_cmd_t *zcp;
+ nvlist_t *innvl;
+ char *packed = NULL;
+ size_t size = 0;
+ int fd, ret;
+
+ /*
+ * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active: the pool is
+ * reported active only when the ioctl succeeds. 'guid' is not used.
+ */
+ fd = open("/dev/zfs", O_RDWR);
+ if (fd < 0)
+ return (-1);
+
+ zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL);
+
+ innvl = fnvlist_alloc();
+ fnvlist_add_boolean_value(innvl, "force", B_FALSE);
+
+ (void) strlcpy(zcp->zc_name, name, sizeof (zcp->zc_name));
+ packed = fnvlist_pack(innvl, &size);
+ zcp->zc_nvlist_src = (uint64_t)(uintptr_t)packed;
+ zcp->zc_nvlist_src_size = size;
+
+ ret = ioctl(fd, ZFS_IOC_POOL_SYNC, zcp);
+
+ fnvlist_pack_free(packed, size);
+ free((void *)(uintptr_t)zcp->zc_nvlist_dst);
+ nvlist_free(innvl);
+ umem_free(zcp, sizeof (zfs_cmd_t));
+
+ (void) close(fd);
+
+ *isactive = (ret == 0);
+
+ return (0);
+}
+
+const pool_config_ops_t libzpool_config_ops = {
+ .pco_refresh_config = refresh_config,
+ .pco_pool_active = pool_active,
+};
diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am
new file mode 100644
index 000000000..720b843ab
--- /dev/null
+++ b/lib/libzutil/Makefile.am
@@ -0,0 +1,27 @@
+include $(top_srcdir)/config/Rules.am
+
+# Suppress unused but set variable warnings often due to ASSERTs
+AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE)
+
+DEFAULT_INCLUDES += \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/lib/libspl/include
+
+noinst_LTLIBRARIES = libzutil.la
+
+USER_C = \
+ zutil_device_path.c \
+ zutil_import.c \
+ zutil_nicenum.c \
+ zutil_pool.c
+
+nodist_libzutil_la_SOURCES = $(USER_C)
+
+libzutil_la_LIBADD = \
+ $(top_builddir)/lib/libavl/libavl.la \
+ $(top_builddir)/lib/libefi/libefi.la \
+ $(top_builddir)/lib/libtpool/libtpool.la
+
+libzutil_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV)
+
+EXTRA_DIST = $(USER_C)
diff --git a/lib/libzutil/zutil_device_path.c b/lib/libzutil/zutil_device_path.c
new file mode 100644
index 000000000..1dc0d4d1d
--- /dev/null
+++ b/lib/libzutil/zutil_device_path.c
@@ -0,0 +1,625 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/efi_partition.h>
+
+#include <libzutil.h>
+#ifdef HAVE_LIBUDEV
+#include <libudev.h>
+#endif
+
+/*
+ * Append partition suffix to an otherwise fully qualified device path.
+ * This generates the full path name as it's stored in ZPOOL_CONFIG_PATH
+ * for whole disk devices. Returns the new length of 'path' on success,
+ * or -1 if the suffix would not fit in a buffer of 'max_len' bytes.
+ */
+int
+zfs_append_partition(char *path, size_t max_len)
+{
+ int len = strlen(path);
+
+ if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
+ (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
+ if (len + 6 >= max_len)
+ return (-1);
+
+ (void) strcat(path, "-part1");
+ len += 6;
+ } else {
+ if (len + 2 >= max_len)
+ return (-1);
+ /* An empty path has no last character to inspect */
+ if (len > 0 && isdigit((unsigned char)path[len-1])) {
+ (void) strcat(path, "p1");
+ len += 2;
+ } else {
+ (void) strcat(path, "1");
+ len += 1;
+ }
+ }
+
+ return (len);
+}
+
+/*
+ * Given a shorthand device name check if a file by that name exists in any
+ * of the ZPOOL_IMPORT_PATH directories or, when that variable is unset, the
+ * zpool_default_search_paths() directories. If one is found, store its
+ * fully qualified path in 'path' and return 0, otherwise return ENOENT.
+ */
+int
+zfs_resolve_shortname(const char *name, char *path, size_t len)
+{
+ int i, error = -1;
+ char *dir, *env, *envdup;
+
+ env = getenv("ZPOOL_IMPORT_PATH");
+ errno = ENOENT;
+
+ if (env) {
+ envdup = strdup(env);
+ dir = strtok(envdup, ":");
+ while (dir && error) {
+ (void) snprintf(path, len, "%s/%s", dir, name);
+ error = access(path, F_OK);
+ dir = strtok(NULL, ":");
+ }
+ free(envdup);
+ } else {
+ const char * const *zpool_default_import_path;
+ size_t count;
+
+ zpool_default_import_path = zpool_default_search_paths(&count);
+
+ for (i = 0; i < count && error < 0; i++) {
+ (void) snprintf(path, len, "%s/%s",
+ zpool_default_import_path[i], name);
+ error = access(path, F_OK);
+ }
+ }
+
+ return (error ? ENOENT : 0);
+}
+
+/*
+ * Given a shorthand device name look for a match against 'cmp_name'. This
+ * is done by checking all prefix expansions using either the default
+ * zpool_default_search_paths() list or the ZPOOL_IMPORT_PATH environment
+ * variable. Proper partition suffixes will be appended if this is a
+ * whole disk. When a match is found 0 is returned, otherwise ENOENT.
+ */
+static int
+zfs_strcmp_shortname(const char *name, const char *cmp_name, int wholedisk)
+{
+ int path_len, cmp_len, i = 0, error = ENOENT;
+ char *dir, *env, *envdup = NULL;
+ char path_name[MAXPATHLEN];
+ const char * const *zpool_default_import_path;
+ size_t count;
+
+ zpool_default_import_path = zpool_default_search_paths(&count);
+
+ cmp_len = strlen(cmp_name);
+ env = getenv("ZPOOL_IMPORT_PATH");
+
+ if (env) {
+ envdup = strdup(env);
+ dir = strtok(envdup, ":");
+ } else {
+ dir = (char *)zpool_default_import_path[i];
+ }
+
+ while (dir) {
+ /* Trim trailing directory slashes from ZPOOL_IMPORT_PATH */
+ if (env) {
+ while (dir[strlen(dir)-1] == '/')
+ dir[strlen(dir)-1] = '\0';
+ }
+
+ path_len = snprintf(path_name, MAXPATHLEN, "%s/%s", dir, name);
+ if (wholedisk)
+ path_len = zfs_append_partition(path_name, MAXPATHLEN);
+
+ if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) {
+ error = 0;
+ break;
+ }
+
+ if (env) {
+ dir = strtok(NULL, ":");
+ } else if (++i < count) {
+ dir = (char *)zpool_default_import_path[i];
+ } else {
+ dir = NULL;
+ }
+ }
+
+ if (env)
+ free(envdup);
+
+ return (error);
+}
+
+/*
+ * Given either a shorthand or fully qualified path name look for a match
+ * against 'cmp'. Redundant slashes are stripped from 'cmp' and 'name' is
+ * expanded as needed. Returns 0 on a match, ENOENT or ENOMEM otherwise.
+ */
+int
+zfs_strcmp_pathname(const char *name, const char *cmp, int wholedisk)
+{
+ int path_len, cmp_len;
+ char path_name[MAXPATHLEN];
+ char cmp_name[MAXPATHLEN];
+ char *dir, *dup;
+
+ /* Strip redundant slashes if any exist due to ZPOOL_IMPORT_PATH */
+ memset(cmp_name, 0, MAXPATHLEN);
+ dup = strdup(cmp);
+ dir = strtok(dup, "/");
+ while (dir) {
+ strlcat(cmp_name, "/", sizeof (cmp_name));
+ strlcat(cmp_name, dir, sizeof (cmp_name));
+ dir = strtok(NULL, "/");
+ }
+ free(dup);
+
+ if (name[0] != '/')
+ return (zfs_strcmp_shortname(name, cmp_name, wholedisk));
+
+ (void) strlcpy(path_name, name, MAXPATHLEN);
+ path_len = strlen(path_name);
+ cmp_len = strlen(cmp_name);
+
+ if (wholedisk) {
+ path_len = zfs_append_partition(path_name, MAXPATHLEN);
+ if (path_len == -1)
+ return (ENOMEM);
+ }
+
+ if ((path_len != cmp_len) || strcmp(path_name, cmp_name))
+ return (ENOENT);
+
+ return (0);
+}
+
+/*
+ * Allocate and return the underlying device name for a device mapper device.
+ * If a device mapper device maps to multiple devices, return the first device.
+ *
+ * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
+ * DM device (like /dev/disk/by-vdev/A0) are also allowed.
+ *
+ * Returns device name, or NULL on error or no match. If dm_name is not a DM
+ * device then return NULL.
+ *
+ * NOTE: The returned name string must be *freed*.
+ */
+static char *
+dm_get_underlying_path(const char *dm_name)
+{
+ DIR *dp = NULL;
+ struct dirent *ep;
+ char *realp;
+ char *tmp = NULL;
+ char *path = NULL;
+ char *dev_str;
+
+ if (dm_name == NULL)
+ return (NULL);
+
+ /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
+ realp = realpath(dm_name, NULL);
+ if (realp == NULL)
+ return (NULL);
+
+ /*
+ * If they preface 'dev' with a path (like "/dev") then strip it off.
+ * We just want the 'dm-N' part.
+ */
+ dev_str = strrchr(realp, '/');
+ dev_str = (dev_str != NULL) ? dev_str + 1 : realp;
+
+ /* asprintf() leaves 'tmp' undefined on failure; keep free() safe */
+ if (asprintf(&tmp, "/sys/block/%s/slaves/", dev_str) == -1) {
+ tmp = NULL;
+ goto end;
+ }
+
+ dp = opendir(tmp);
+ if (dp == NULL)
+ goto end;
+
+ /* Return first non-directory entry in /sys/block/dm-N/slaves/ */
+ while ((ep = readdir(dp))) {
+ if (ep->d_type == DT_DIR) /* skip "." and ".." dirs */
+ continue;
+ /* 'path' is undefined if asprintf() fails */
+ if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
+ path = NULL;
+ break;
+ }
+
+end:
+ if (dp != NULL)
+ closedir(dp);
+ free(tmp);
+ free(realp);
+ return (path);
+}
+
+/*
+ * Return 1 if device is a device mapper or multipath device, i.e. if
+ * dm_get_underlying_path() finds an underlying device for it; else 0.
+ */
+int
+zfs_dev_is_dm(const char *dev_name)
+{
+
+ char *tmp;
+ tmp = dm_get_underlying_path(dev_name);
+ if (tmp == NULL)
+ return (0);
+
+ free(tmp);
+ return (1);
+}
+
+/*
+ * By "whole disk" we mean an entire physical disk (something we can
+ * label, toggle the write cache on, etc.) as opposed to the full
+ * capacity of a pseudo-device such as lofi or did. We act as if we
+ * are labeling the disk, which should be a pretty good test of whether
+ * it's a viable device or not. Returns 1 if it is and 0 if
+ * it isn't.
+ */
+int
+zfs_dev_is_whole_disk(const char *dev_name)
+{
+ struct dk_gpt *label;
+ int fd;
+
+ if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0)
+ return (0);
+
+ if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
+ (void) close(fd);
+ return (0);
+ }
+
+ efi_free(label);
+ (void) close(fd);
+
+ return (1);
+}
+
+/*
+ * Lookup the underlying device for a device name
+ *
+ * Often you'll have a symlink to a device, a partition device,
+ * or a multipath device, and want to look up the underlying device.
+ * This function returns the underlying device name. If the device
+ * name is already the underlying device, then just return the same
+ * name. If the device is a DM device with multiple underlying devices
+ * then return the first one.
+ *
+ * For example:
+ *
+ * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
+ * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
+ * returns: /dev/sda
+ *
+ * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
+ * dev_name: /dev/mapper/mpatha
+ * returns: /dev/sda (first device)
+ *
+ * 3. /dev/sda (already the underlying device)
+ * dev_name: /dev/sda
+ * returns: /dev/sda
+ *
+ * 4. /dev/dm-3 (mapped to /dev/sda)
+ * dev_name: /dev/dm-3
+ * returns: /dev/sda
+ *
+ * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
+ * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
+ * returns: /dev/sdb
+ *
+ * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2
+ * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
+ * returns: /dev/sda
+ *
+ * Returns underlying device name, or NULL on error or no match.
+ *
+ * NOTE: The returned name string must be *freed*.
+ */
+char *
+zfs_get_underlying_path(const char *dev_name)
+{
+	char *resolved;
+	char *name;
+
+	if (dev_name == NULL)
+		return (NULL);
+
+	/*
+	 * Try the device mapper lookup first; when dev_name is not a DM
+	 * device fall back to simply resolving symlinks.
+	 */
+	resolved = dm_get_underlying_path(dev_name);
+	if (resolved == NULL)
+		resolved = realpath(dev_name, NULL);
+	if (resolved == NULL)
+		return (NULL);
+
+	/* Drop any partition suffix; the result is a fresh allocation. */
+	name = zfs_strip_partition_path(resolved);
+	free(resolved);
+
+	return (name);
+}
+
+/*
+ * Given a dev name like "sda", return the full enclosure sysfs path to
+ * the disk. You can also pass in the name with "/dev" prepended
+ * to it (like /dev/sda).
+ *
+ * For example, disk "sda" in enclosure slot 1:
+ * dev: "sda"
+ * returns: "/sys/class/enclosure/1:0:3:0/Slot 1"
+ *
+ * 'dev' must be a non-devicemapper device.
+ *
+ * Returned string must be freed.
+ */
+char *
+zfs_get_enclosure_sysfs_path(const char *dev_name)
+{
+	DIR *dp = NULL;
+	struct dirent *ep;
+	char buf[MAXPATHLEN];
+	const char *slash;
+	char *devdir = NULL;
+	char *path = NULL;
+
+	if (dev_name == NULL)
+		return (NULL);
+
+	/* If they preface 'dev' with a path (like "/dev") then strip it off */
+	slash = strrchr(dev_name, '/');
+	if (slash != NULL)
+		dev_name = slash + 1; /* +1 since we want the chr after '/' */
+
+	/* asprintf() leaves its output pointer undefined on failure */
+	if (asprintf(&devdir, "/sys/block/%s/device", dev_name) == -1) {
+		devdir = NULL;
+		goto end;
+	}
+
+	/* On failure 'devdir' is still owned here and is freed at 'end'. */
+	dp = opendir(devdir);
+	if (dp == NULL)
+		goto end;
+
+	/*
+	 * Look through all sysfs entries in /sys/block/<dev>/device for
+	 * the enclosure symlink.  The first matching entry decides the
+	 * result, so no allocation is carried over between iterations.
+	 */
+	while ((ep = readdir(dp)) != NULL) {
+		char *link = NULL;
+		char *encl;
+		ssize_t len;
+
+		/* Ignore everything that's not our enclosure_device link */
+		if (strstr(ep->d_name, "enclosure_device") == NULL)
+			continue;
+
+		if (asprintf(&link, "%s/%s", devdir, ep->d_name) == -1)
+			break;
+
+		len = readlink(link, buf, sizeof (buf));
+		free(link);
+
+		/* Did readlink fail or crop the link name? */
+		if (len == -1 || (size_t)len >= sizeof (buf))
+			break;
+
+		/*
+		 * We got a valid link.  readlink() doesn't terminate strings
+		 * so we have to do it.
+		 */
+		buf[len] = '\0';
+
+		/*
+		 * Our link will look like:
+		 *
+		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
+		 *
+		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
+		 */
+		encl = strstr(buf, "enclosure");
+		if (encl == NULL)
+			break;
+
+		/* If asprintf() fails, 'path' is undefined */
+		if (asprintf(&path, "/sys/class/%s", encl) == -1)
+			path = NULL;
+		break;
+	}
+
+end:
+	free(devdir);
+
+	if (dp != NULL)
+		closedir(dp);
+
+	return (path);
+}
+
+/*
+ * Remove partition suffix from a vdev path. Partition suffixes may take three
+ * forms: "-partX", "pX", or "X", where X is a string of digits. The second
+ * case only occurs when the suffix is preceded by a digit, e.g. "md0p0". The
+ * third case only occurs when preceded by a string matching the regular
+ * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
+ *
+ * caller must free the returned string
+ */
+char *
+zfs_strip_partition(char *path)
+{
+	char *tmp = strdup(path);
+	char *part = NULL, *d = NULL;
+
+	if (tmp == NULL)
+		return (NULL);
+
+	/*
+	 * Locate the candidate suffix: 'part' is where the string would be
+	 * cut, 'd' is where the partition digits are expected to start.
+	 * The <ctype.h> classifiers are only defined for unsigned char
+	 * values (or EOF), hence the casts.
+	 */
+	if ((part = strstr(tmp, "-part")) != NULL && part != tmp) {
+		/* "-partX": the digits follow the five character marker */
+		d = part + 5;
+	} else if ((part = strrchr(tmp, 'p')) != NULL &&
+	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
+		/* "pX": the last 'p' directly preceded by a digit */
+		d = part + 1;
+	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
+	    tmp[1] == 'd') {
+		/* "[hsv]d" + letters: cut right after the letters */
+		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
+	} else if (strncmp("xvd", tmp, 3) == 0) {
+		/* "xvd" + letters: cut right after the letters */
+		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
+	}
+
+	/* Only cut when digits, and nothing else, run to the end. */
+	if (part != NULL && d != NULL && *d != '\0') {
+		for (; isdigit((unsigned char)*d); d++) { }
+		if (*d == '\0')
+			*part = '\0';
+	}
+
+	return (tmp);
+}
+
+/*
+ * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname
+ *
+ * path: /dev/sda1
+ * returns: /dev/sda
+ *
+ * Returned string must be freed.
+ */
+char *
+zfs_strip_partition_path(char *path)
+{
+	char *newpath = strdup(path);
+	char *slash;
+	char *sd_offset;
+	char *new_sd;
+
+	if (newpath == NULL)
+		return (NULL);
+
+	/*
+	 * Point to "sda1" part of "/dev/sda1".  A name without any
+	 * directory component is treated as the device name itself instead
+	 * of doing arithmetic on a NULL strrchr() result.
+	 */
+	slash = strrchr(newpath, '/');
+	sd_offset = (slash != NULL) ? slash + 1 : newpath;
+
+	/* Get our new name "sda" */
+	new_sd = zfs_strip_partition(sd_offset);
+	if (new_sd == NULL) {
+		free(newpath);
+		return (NULL);
+	}
+
+	/*
+	 * Paste the "sda" where "sda1" was.  Stripping only ever shortens
+	 * the name, so it fits in the space the original occupied.
+	 */
+	strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);
+
+	/* Free temporary "sda" */
+	free(new_sd);
+
+	return (newpath);
+}
+
+#ifdef HAVE_LIBUDEV
+/*
+ * A disk is considered a multipath whole disk when all of these hold:
+ *	DEVNAME key value starts with "/dev/dm-"
+ *	ID_PART_TABLE_TYPE key does not exist or is not "gpt"
+ *	DM_UUID key exists
+ */
+static boolean_t
+udev_mpath_whole_disk(struct udev_device *dev)
+{
+	const char *devname, *type, *uuid;
+
+	devname = udev_device_get_property_value(dev, "DEVNAME");
+	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
+	uuid = udev_device_get_property_value(dev, "DM_UUID");
+
+	/* Reject as soon as any one of the requirements is not met. */
+	if (devname == NULL || strncmp(devname, "/dev/dm-", 8) != 0)
+		return (B_FALSE);
+	if (type != NULL && strcmp(type, "gpt") == 0)
+		return (B_FALSE);
+	if (uuid == NULL)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Check if a disk is effectively a multipath whole disk
+ */
+boolean_t
+is_mpath_whole_disk(const char *path)
+{
+	struct udev *udev;
+	struct udev_device *dev;
+	char nodepath[MAXPATHLEN];
+	char *sysname;
+	boolean_t wholedisk = B_FALSE;
+
+	/* Resolve symlinks so the final component is the kernel name. */
+	if (realpath(path, nodepath) == NULL)
+		return (B_FALSE);
+	/* realpath() yields an absolute path, so a '/' is always present */
+	sysname = strrchr(nodepath, '/') + 1;
+	if (strncmp(sysname, "dm-", 3) != 0)
+		return (B_FALSE);
+	if ((udev = udev_new()) == NULL)
+		return (B_FALSE);
+
+	dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname);
+	if (dev != NULL) {
+		wholedisk = udev_mpath_whole_disk(dev);
+		udev_device_unref(dev);
+	}
+
+	/* Drop the context reference taken by udev_new() on every path. */
+	udev_unref(udev);
+	return (wholedisk);
+}
+#endif
diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c
new file mode 100644
index 000000000..f6e56fabf
--- /dev/null
+++ b/lib/libzutil/zutil_import.c
@@ -0,0 +1,2389 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright 2015 RackTop Systems.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * Pool import support functions.
+ *
+ * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
+ * these commands are expected to run in the global zone, we can assume
+ * that the devices are all readable when called.
+ *
+ * To import a pool, we rely on reading the configuration information from the
+ * ZFS label of each device. If we successfully read the label, then we
+ * organize the configuration information in the following hierarchy:
+ *
+ * pool guid -> toplevel vdev guid -> label txg
+ *
+ * Duplicate entries matching this same tuple will be discarded. Once we have
+ * examined every device, we pick the best label txg config for each toplevel
+ * vdev. We then arrange these toplevel vdevs into a complete pool config, and
+ * update any paths that have changed. Finally, we attempt to import the pool
+ * using our derived config, and record the results.
+ */
+
+#include <ctype.h>
+#include <devid.h>
+#include <dirent.h>
+#include <errno.h>
+#include <libintl.h>
+#include <libgen.h>
+#ifdef HAVE_LIBUDEV
+#include <libudev.h>
+#include <sched.h>
+#endif
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/dktp/fdisk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+
+#include <blkid/blkid.h>
+#include <thread_pool.h>
+#include <libzutil.h>
+#include <libnvpair.h>
+
+/*
+ * Search-order constants for import paths.
+ * NOTE(review): the users of these values are outside this excerpt;
+ * presumably they feed name_entry_t.ne_order, where fix_paths() prefers
+ * the lower value -- confirm against the callers.
+ */
+#define IMPORT_ORDER_PREFERRED_1 1
+#define IMPORT_ORDER_PREFERRED_2 2
+#define IMPORT_ORDER_SCAN_OFFSET 10
+#define IMPORT_ORDER_DEFAULT 100
+#define DEFAULT_IMPORT_PATH_SIZE 9
+
+/*
+ * Error description strings.  EZFS_NOMEM is handed to zfs_error() as its
+ * 'error' argument by no_memory(); the others are presumably used the same
+ * way elsewhere in the file.
+ */
+#define EZFS_BADCACHE "invalid or missing cache file"
+#define EZFS_BADPATH "must be an absolute path"
+#define EZFS_NOMEM "out of memory"
+#define EZFS_EACESS "some devices require root privileges"
+
+typedef struct libpc_handle {
+ boolean_t lpc_printerr; /* zfs_verror() prints to stderr only if set */
+ boolean_t lpc_open_access_error; /* NOTE(review): not used in view */
+ boolean_t lpc_desc_active; /* lpc_desc holds a pending description */
+ char lpc_desc[1024]; /* detail text stored by zfs_error_aux() */
+ const pool_config_ops_t *lpc_ops; /* pool_active/refresh callbacks */
+ void *lpc_lib_handle; /* passed as first argument to the callbacks */
+} libpc_handle_t;
+
+/*PRINTFLIKE2*/
+static void
+zfs_error_aux(libpc_handle_t *hdl, const char *fmt, ...)
+{
+	va_list args;
+
+	/* Format the detail text into the handle's description buffer. */
+	va_start(args, fmt);
+	(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, args);
+	va_end(args);
+
+	/* Mark it pending so the next zfs_verror() reports it. */
+	hdl->lpc_desc_active = B_TRUE;
+}
+
+static void
+zfs_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap)
+{
+	char action[1024];
+
+	(void) vsnprintf(action, sizeof (action), fmt, ap);
+
+	/*
+	 * A description stored by zfs_error_aux() is consumed exactly once;
+	 * when none is pending any stale text is discarded.
+	 */
+	if (!hdl->lpc_desc_active)
+		hdl->lpc_desc[0] = '\0';
+	hdl->lpc_desc_active = B_FALSE;
+
+	if (!hdl->lpc_printerr)
+		return;
+
+	/* The stored description, when present, replaces 'error'. */
+	(void) fprintf(stderr, "%s: %s\n", action,
+	    (hdl->lpc_desc[0] != '\0') ? hdl->lpc_desc : error);
+}
+
+/*PRINTFLIKE3*/
+static int
+zfs_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...)
+{
+	va_list args;
+
+	/* Variadic front end for zfs_verror(); always reports failure. */
+	va_start(args, fmt);
+	zfs_verror(hdl, error, fmt, args);
+	va_end(args);
+
+	return (-1);
+}
+
+static int
+zfs_error(libpc_handle_t *hdl, const char *error, const char *msg)
+{
+	int rc;
+
+	/* Pass 'msg' as data so it is never interpreted as a format. */
+	rc = zfs_error_fmt(hdl, error, "%s", msg);
+	return (rc);
+}
+
+static int
+no_memory(libpc_handle_t *hdl)
+{
+	/* Report the allocation failure, then terminate the process. */
+	(void) zfs_error(hdl, EZFS_NOMEM, "internal error");
+	exit(1);
+}
+
+static void *
+zfs_alloc(libpc_handle_t *hdl, size_t size)
+{
+	/* Zero-filled allocation; failure is handed to no_memory(). */
+	void *data = calloc(1, size);
+
+	if (data == NULL)
+		(void) no_memory(hdl);
+
+	return (data);
+}
+
+static char *
+zfs_strdup(libpc_handle_t *hdl, const char *str)
+{
+	/* strdup() wrapper; failure is handed to no_memory(). */
+	char *copy = strdup(str);
+
+	if (copy == NULL)
+		(void) no_memory(hdl);
+
+	return (copy);
+}
+
+/*
+ * Intermediate structures used to gather configuration information.
+ */
+typedef struct config_entry {
+ uint64_t ce_txg; /* ZPOOL_CONFIG_POOL_TXG of this label config */
+ nvlist_t *ce_config; /* private copy made with fnvlist_dup() */
+ struct config_entry *ce_next; /* next config of the same toplevel vdev */
+} config_entry_t;
+
+typedef struct vdev_entry {
+ uint64_t ve_guid; /* toplevel vdev guid (ZPOOL_CONFIG_TOP_GUID) */
+ config_entry_t *ve_configs; /* configs seen, one per distinct txg */
+ struct vdev_entry *ve_next; /* next toplevel vdev of the same pool */
+} vdev_entry_t;
+
+typedef struct pool_entry {
+ uint64_t pe_guid; /* pool guid (ZPOOL_CONFIG_POOL_GUID) */
+ vdev_entry_t *pe_vdevs; /* toplevel vdevs seen for this pool */
+ struct pool_entry *pe_next; /* next known pool */
+} pool_entry_t;
+
+typedef struct name_entry {
+ char *ne_name; /* device path, duplicated with zfs_strdup() */
+ uint64_t ne_guid; /* vdev guid (ZPOOL_CONFIG_GUID) found at that path */
+ uint64_t ne_order; /* search order; fix_paths() prefers lower values */
+ uint64_t ne_num_labels; /* label count; fix_paths() prefers higher */
+ struct name_entry *ne_next; /* next guid -> path mapping */
+} name_entry_t;
+
+typedef struct pool_list {
+ pool_entry_t *pools; /* head of the list of known pools */
+ name_entry_t *names; /* head of the vdev guid -> path mappings */
+} pool_list_t;
+
+#define ZVOL_ROOT "/dev/zvol" /* devlink prefix that identifies a zvol */
+#define DEV_BYID_PATH "/dev/disk/by-id/" /* prefix stripped from by-id links */
+
+/*
+ * Linux persistent device strings for vdev labels
+ *
+ * based on libudev for consistency with libudev disk add/remove events
+ */
+
+typedef struct vdev_dev_strs {
+ char vds_devid[128]; /* filled in by zfs_device_get_devid() */
+ char vds_devphys[128]; /* filled in by zfs_device_get_physical() */
+} vdev_dev_strs_t;
+
+/*
+ * Obtain the persistent device id string (describes what)
+ *
+ * used by ZED vdev matching for auto-{online,expand,replace}
+ */
+int
+zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
+{
+	struct udev_list_entry *entry;
+	const char *bus;
+	char devbyid[MAXPATHLEN];
+	size_t prefix_len;
+
+	/* The bus based by-id path is preferred */
+	bus = udev_device_get_property_value(dev, "ID_BUS");
+
+	if (bus == NULL) {
+		const char *dm_uuid;
+
+		/*
+		 * Multipath nodes: use the persistent uuid based identifier,
+		 * e.g. /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
+		 */
+		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
+		if (dm_uuid != NULL) {
+			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
+			return (0);
+		}
+
+		/* Volumes: use the persistent /dev/zvol/dataset link */
+		for (entry = udev_device_get_devlinks_list_entry(dev);
+		    entry != NULL; entry = udev_list_entry_get_next(entry)) {
+			const char *name = udev_list_entry_get_name(entry);
+
+			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+				(void) strlcpy(bufptr, name, buflen);
+				return (0);
+			}
+		}
+
+		/*
+		 * NVME 'by-id' symlinks are similar to the bus case, so
+		 * continue with the bus symlink search below.
+		 */
+		if (udev_device_get_parent_with_subsystem_devtype(dev,
+		    "nvme", NULL) == NULL)
+			return (ENODATA);
+		bus = "nvme";
+	}
+
+	/*
+	 * Locate the bus specific by-id link and return its name with the
+	 * "/dev/disk/by-id/" directory removed.
+	 */
+	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
+	prefix_len = strlen(devbyid);
+
+	for (entry = udev_device_get_devlinks_list_entry(dev);
+	    entry != NULL; entry = udev_list_entry_get_next(entry)) {
+		const char *name = udev_list_entry_get_name(entry);
+
+		if (strncmp(name, devbyid, prefix_len) != 0)
+			continue;
+
+		(void) strlcpy(bufptr, name + strlen(DEV_BYID_PATH), buflen);
+		return (0);
+	}
+
+	return (ENODATA);
+}
+
+/*
+ * Obtain the persistent physical location string (describes where)
+ *
+ * used by ZED vdev matching for auto-{online,expand,replace}
+ */
+int
+zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
+{
+	/*
+	 * Properties tried in order.  Normal disks use ID_PATH for their
+	 * physical path.  Device mapper devices are virtual and don't have
+	 * one; for them ID_VDEV (set up via /etc/vdev_id.conf) provides a
+	 * persistent path instead.  Without vdev_id.conf, multipath
+	 * autoreplace with device mapper cannot be used.
+	 */
+	static const char *const props[] = { "ID_PATH", "ID_VDEV" };
+	struct udev_list_entry *entry;
+	const char *physpath;
+	size_t i;
+
+	for (i = 0; i < sizeof (props) / sizeof (props[0]); i++) {
+		physpath = udev_device_get_property_value(dev, props[i]);
+		if (physpath != NULL && strlen(physpath) > 0) {
+			(void) strlcpy(bufptr, physpath, buflen);
+			return (0);
+		}
+	}
+
+	/* ZFS volumes: use the persistent /dev/zvol/dataset identifier */
+	for (entry = udev_device_get_devlinks_list_entry(dev);
+	    entry != NULL; entry = udev_list_entry_get_next(entry)) {
+		physpath = udev_list_entry_get_name(entry);
+		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) != 0)
+			continue;
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
+	}
+
+	/* All other devices: fall back to the by-uuid name */
+	for (entry = udev_device_get_devlinks_list_entry(dev);
+	    entry != NULL; entry = udev_list_entry_get_next(entry)) {
+		physpath = udev_list_entry_get_name(entry);
+		if (strncmp(physpath, "/dev/disk/by-uuid", 17) != 0)
+			continue;
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
+	}
+
+	return (ENODATA);
+}
+
+/*
+ * Decide whether a udev device is a multipath whole disk.  That is the
+ * case when:
+ *	the DEVNAME property begins with "/dev/dm-",
+ *	the ID_PART_TABLE_TYPE property is absent or is not "gpt", and
+ *	the DM_UUID property exists.
+ */
+static boolean_t
+udev_mpath_whole_disk(struct udev_device *dev)
+{
+	const char *devname, *type, *uuid;
+	int is_dm, has_gpt;
+
+	devname = udev_device_get_property_value(dev, "DEVNAME");
+	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
+	uuid = udev_device_get_property_value(dev, "DM_UUID");
+
+	is_dm = (devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0);
+	has_gpt = (type != NULL && strcmp(type, "gpt") == 0);
+
+	return ((is_dm && !has_gpt && uuid != NULL) ? B_TRUE : B_FALSE);
+}
+
+static int
+udev_device_is_ready(struct udev_device *dev)
+{
+	int ready;
+
+#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
+	ready = udev_device_get_is_initialized(dev);
+#else
+	/* wait for DEVLINKS property to be initialized */
+	ready = (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
+#endif
+
+	return (ready);
+}
+
+/*
+ * Wait up to timeout_ms for udev to set up the device node. The device is
+ * considered ready when libudev determines it has been initialized, all of
+ * the device links have been verified to exist, and it has been allowed to
+ * settle. At this point the device can be accessed reliably.
+ * Depending on the complexity of the udev rules this process could take
+ * several seconds.
+ */
+int
+zpool_label_disk_wait(const char *path, int timeout_ms)
+{
+#ifdef HAVE_LIBUDEV
+ struct udev *udev;
+ struct udev_device *dev = NULL;
+ char nodepath[MAXPATHLEN];
+ char *sysname = NULL;
+ int ret = ENODEV; /* result if the device never becomes ready */
+ int settle_ms = 50; /* links must stay valid this long */
+ long sleep_ms = 10; /* pause between polls */
+ hrtime_t start, settle;
+
+ if ((udev = udev_new()) == NULL)
+ return (ENXIO);
+
+ start = gethrtime();
+ settle = 0; /* 0 means "no settle period in progress" */
+
+ do {
+ /* Resolve the node name once; retry until realpath() succeeds. */
+ if (sysname == NULL) {
+ if (realpath(path, nodepath) != NULL) {
+ sysname = strrchr(nodepath, '/') + 1;
+ } else {
+ (void) usleep(sleep_ms * MILLISEC);
+ /* 'continue' re-evaluates the timeout condition below */
+ continue;
+ }
+ }
+
+ dev = udev_device_new_from_subsystem_sysname(udev,
+ "block", sysname);
+ if ((dev != NULL) && udev_device_is_ready(dev)) {
+ struct udev_list_entry *links, *link = NULL;
+
+ ret = 0;
+ links = udev_device_get_devlinks_list_entry(dev);
+
+ /* Every device link reported by udev must be stat()able. */
+ udev_list_entry_foreach(link, links) {
+ struct stat64 statbuf;
+ const char *name;
+
+ name = udev_list_entry_get_name(link);
+ errno = 0;
+ if (stat64(name, &statbuf) == 0 && errno == 0)
+ continue;
+
+ /* A missing link restarts the settle period. */
+ settle = 0;
+ ret = ENODEV;
+ break;
+ }
+
+ if (ret == 0) {
+ /* First good pass starts the settle timer ... */
+ if (settle == 0) {
+ settle = gethrtime();
+ } else if (NSEC2MSEC(gethrtime() - settle) >=
+ settle_ms) {
+ /* ... and a later one past settle_ms ends the wait. */
+ udev_device_unref(dev);
+ break;
+ }
+ }
+ }
+
+ /* Also reached with dev == NULL when the lookup above failed. */
+ udev_device_unref(dev);
+ (void) usleep(sleep_ms * MILLISEC);
+
+ } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+ udev_unref(udev);
+
+ return (ret);
+#else
+ int settle_ms = 50; /* path must stay valid this long */
+ long sleep_ms = 10; /* pause between polls */
+ hrtime_t start, settle;
+ struct stat64 statbuf;
+
+ start = gethrtime();
+ settle = 0;
+
+ /* Without libudev only the given path itself is polled. */
+ do {
+ errno = 0;
+ if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
+ if (settle == 0)
+ settle = gethrtime();
+ else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
+ return (0);
+ } else if (errno != ENOENT) {
+ /* Any failure other than "not there yet" is returned. */
+ return (errno);
+ }
+
+ usleep(sleep_ms * MILLISEC);
+ } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+ return (ENODEV);
+#endif /* HAVE_LIBUDEV */
+}
+
+/*
+ * Encode the persistent devices strings
+ * used for the vdev disk label
+ */
+static int
+encode_device_strings(const char *path, vdev_dev_strs_t *ds,
+    boolean_t wholedisk)
+{
+#ifdef HAVE_LIBUDEV
+	struct udev *udev;
+	struct udev_device *dev = NULL;
+	char nodepath[MAXPATHLEN];
+	char *sysname;
+	int ret = ENODEV;
+	hrtime_t start;
+
+	if ((udev = udev_new()) == NULL)
+		return (ENXIO);
+
+	/* resolve path to a runtime device node instance */
+	if (realpath(path, nodepath) == NULL)
+		goto no_dev;
+
+	/* realpath() yields an absolute path, so a '/' is always present */
+	sysname = strrchr(nodepath, '/') + 1;
+
+	/*
+	 * Wait up to 3 seconds for udev to set up the device node context
+	 */
+	start = gethrtime();
+	do {
+		dev = udev_device_new_from_subsystem_sysname(udev, "block",
+		    sysname);
+		if (dev == NULL)
+			goto no_dev;
+		if (udev_device_is_ready(dev))
+			break;  /* udev ready */
+
+		udev_device_unref(dev);
+		dev = NULL;
+
+		if (NSEC2MSEC(gethrtime() - start) < 10)
+			(void) sched_yield();	/* yield/busy wait up to 10ms */
+		else
+			(void) usleep(10 * MILLISEC);
+
+	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
+
+	if (dev == NULL)
+		goto no_dev;
+
+	/*
+	 * Only whole disks require extra device strings.  'dev' holds a
+	 * reference at this point, so leave through no_dev_ref to drop it;
+	 * 'ret' is still ENODEV.
+	 */
+	if (!wholedisk && !udev_mpath_whole_disk(dev))
+		goto no_dev_ref;
+
+	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
+	if (ret != 0)
+		goto no_dev_ref;
+
+	/* physical location string (optional) */
+	if (zfs_device_get_physical(dev, ds->vds_devphys,
+	    sizeof (ds->vds_devphys)) != 0) {
+		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
+	}
+
+no_dev_ref:
+	udev_device_unref(dev);
+no_dev:
+	udev_unref(udev);
+
+	return (ret);
+#else
+	return (ENOENT);
+#endif
+}
+
+/*
+ * Update a leaf vdev's persistent device strings (Linux only)
+ *
+ * - only applies for a dedicated leaf vdev (aka whole disk)
+ * - updated during pool create|add|attach|import
+ * - used for device matching during auto-{online,expand,replace}
+ * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
+ * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
+ *
+ * single device node example:
+ * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
+ * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
+ *
+ * multipath device node example:
+ * devid: 'dm-uuid-mpath-35000c5006304de3f'
+ *
+ * We also store the enclosure sysfs path for turning on enclosure LEDs
+ * (if applicable):
+ * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
+ */
+void
+update_vdev_config_dev_strs(nvlist_t *nv)
+{
+	vdev_dev_strs_t vds;
+	char *env, *type, *path;
+	uint64_t wholedisk = 0;
+	char *upath, *spath;
+
+	/*
+	 * Legacy opt-out, e.g.:
+	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
+	 *
+	 * Older ZFS on Linux implementations had issues displaying pool
+	 * config VDEV names when a "devid" NVP value is present in the
+	 * pool's config (for example a pool that originated on illumos,
+	 * where "zpool status" would fail when listing the config).
+	 * Setting ZFS_VDEV_DEVID_OPT_OUT strips any "devid" values on
+	 * import and prevents adding them on zpool create|add.
+	 */
+	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
+	if (env != NULL && (strtoul(env, NULL, 0) > 0 ||
+	    strncasecmp(env, "YES", 3) == 0 ||
+	    strncasecmp(env, "ON", 2) == 0)) {
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+		return;
+	}
+
+	/* Only disk vdevs that carry a path are of interest. */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return;
+	if (strcmp(type, VDEV_TYPE_DISK) != 0)
+		return;
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		return;
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+
+	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) != 0) {
+		/* clear out any stale entries */
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+		return;
+	}
+
+	/* Update device string values in config nvlist */
+	(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
+	if (vds.vds_devphys[0] != '\0') {
+		(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+		    vds.vds_devphys);
+	}
+
+	/* Add enclosure sysfs path (if disk is in an enclosure) */
+	upath = zfs_get_underlying_path(path);
+	spath = zfs_get_enclosure_sysfs_path(upath);
+	if (spath != NULL) {
+		nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+		    spath);
+	} else {
+		nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+	}
+
+	free(upath);
+	free(spath);
+}
+
+/*
+ * Go through and fix up any path and/or devid information for the given vdev
+ * configuration.
+ */
+static int
+fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	uint64_t guid;
+	name_entry_t *ne, *best = NULL;
+	char *path;
+
+	/* Interior vdev: recurse into every child and stop on failure. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (fix_paths(hdl, child[c], names) != 0)
+				return (-1);
+		}
+		return (0);
+	}
+
+	/*
+	 * This is a leaf (file or disk) vdev.  Look through the name list
+	 * for entries with a matching guid.  Several names may map to one
+	 * guid (overlapping partitions or multiple paths to the same disk).
+	 * In that case prefer the name equal to ZPOOL_CONFIG_PATH; failing
+	 * that, the entry with more vdev labels, and among equals the one
+	 * earliest in the ZPOOL_IMPORT_PATH search order.
+	 */
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		path = NULL;
+
+	for (ne = names; ne != NULL; ne = ne->ne_next) {
+		if (ne->ne_guid != guid)
+			continue;
+
+		/* No recorded path, or an exact match: take it and stop. */
+		if (path == NULL || strcmp(path, ne->ne_name) == 0) {
+			best = ne;
+			break;
+		}
+
+		/* Otherwise keep the better of the candidates seen so far. */
+		if (best == NULL ||
+		    ne->ne_num_labels > best->ne_num_labels ||
+		    (ne->ne_num_labels == best->ne_num_labels &&
+		    ne->ne_order < best->ne_order))
+			best = ne;
+	}
+
+	if (best == NULL)
+		return (0);
+
+	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
+		return (-1);
+
+	/* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
+	update_vdev_config_dev_strs(nv);
+
+	return (0);
+}
+
+/*
+ * Add the given configuration to the list of known devices.
+ */
+/*
+ * Prepend a vdev guid -> path mapping to the name list of 'pl'.
+ */
+static int
+add_config_name(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
+    int order, int num_labels, uint64_t vdev_guid)
+{
+	name_entry_t *ne = zfs_alloc(hdl, sizeof (name_entry_t));
+
+	if (ne == NULL)
+		return (-1);
+
+	ne->ne_name = zfs_strdup(hdl, path);
+	if (ne->ne_name == NULL) {
+		free(ne);
+		return (-1);
+	}
+
+	ne->ne_guid = vdev_guid;
+	ne->ne_order = order;
+	ne->ne_num_labels = num_labels;
+	ne->ne_next = pl->names;
+	pl->names = ne;
+
+	return (0);
+}
+
+static int
+add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
+    int order, int num_labels, nvlist_t *config)
+{
+	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
+	pool_entry_t *pe;
+	vdev_entry_t *ve;
+	config_entry_t *ce;
+
+	/*
+	 * A hot spare not currently in use, or a level 2 cache device, is
+	 * only added to the list of names to translate.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &state) == 0 &&
+	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
+		return (add_config_name(hdl, pl, path, order, num_labels,
+		    vdev_guid));
+	}
+
+	/*
+	 * A valid config in which any of these fields cannot be read is a
+	 * half-initialized label.  vdev_label_init() writes a label with
+	 * txg == 0 so the device can be identified if the user refers to
+	 * the same disk later on; if pool creation fails, the label stays
+	 * in that state and must not be considered part of a valid pool.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
+	    &vdev_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+	    &top_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &txg) != 0 || txg == 0) {
+		return (0);
+	}
+
+	/* First, find this pool, adding it to the known pools if new. */
+	pe = pl->pools;
+	while (pe != NULL && pe->pe_guid != pool_guid)
+		pe = pe->pe_next;
+
+	if (pe == NULL) {
+		pe = zfs_alloc(hdl, sizeof (pool_entry_t));
+		if (pe == NULL)
+			return (-1);
+		pe->pe_guid = pool_guid;
+		pe->pe_next = pl->pools;
+		pl->pools = pe;
+	}
+
+	/* Second, find this toplevel vdev, adding it if it's missing. */
+	ve = pe->pe_vdevs;
+	while (ve != NULL && ve->ve_guid != top_guid)
+		ve = ve->ve_next;
+
+	if (ve == NULL) {
+		ve = zfs_alloc(hdl, sizeof (vdev_entry_t));
+		if (ve == NULL)
+			return (-1);
+		ve->ve_guid = top_guid;
+		ve->ve_next = pe->pe_vdevs;
+		pe->pe_vdevs = ve;
+	}
+
+	/*
+	 * Third, look for a config with a matching transaction group.  One
+	 * that already exists is left alone; otherwise a copy of this
+	 * config is added to the list of known configs.
+	 */
+	ce = ve->ve_configs;
+	while (ce != NULL && ce->ce_txg != txg)
+		ce = ce->ce_next;
+
+	if (ce == NULL) {
+		ce = zfs_alloc(hdl, sizeof (config_entry_t));
+		if (ce == NULL)
+			return (-1);
+		ce->ce_txg = txg;
+		ce->ce_config = fnvlist_dup(config);
+		ce->ce_next = ve->ve_configs;
+		ve->ve_configs = ce;
+	}
+
+	/*
+	 * The config is now in the list of known configs.  Finally record
+	 * the vdev guid -> path mapping so that the configuration can be
+	 * fixed up as necessary before doing the import.
+	 */
+	return (add_config_name(hdl, pl, path, order, num_labels, vdev_guid));
+}
+
+static int
+pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,
+    boolean_t *isactive)
+{
+	ASSERT(hdl->lpc_ops->pco_pool_active != NULL);
+
+	/* Delegate to the consumer supplied callback. */
+	return (hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,
+	    guid, isactive));
+}
+
+static nvlist_t *
+refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)
+{
+	nvlist_t *refreshed;
+
+	ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);
+
+	/* Delegate to the consumer supplied callback. */
+	refreshed = hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,
+	    tryconfig);
+
+	return (refreshed);
+}
+
+/*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+static boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+	uint_t c;	/* same type as 'holes': no signed/unsigned compare */
+
+	/* 'id' is a hole when it appears among the 'holes' array entries. */
+	for (c = 0; c < holes; c++) {
+
+		/* Top-level is a hole */
+		if (hole_array[c] == id)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Convert our list of pools into the definitive set of configurations.  We
+ * start by picking the best config for each toplevel vdev.  Once that's done,
+ * we assemble the toplevel vdevs into a full config for the pool.  We make a
+ * pass to fix up any incorrect paths, and then add it to the main list to
+ * return to the user.
+ *
+ * Returns a newly allocated nvlist keyed by pool name, or NULL on failure.
+ * When active_ok is set each assembled config is added as-is; otherwise
+ * pools reported active, and pools for which refresh_config() returns NULL,
+ * are dropped.  A non-NULL policy is attached to the config (as
+ * ZPOOL_LOAD_POLICY) before it is refreshed.
+ */
+static nvlist_t *
+get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
+    nvlist_t *policy)
+{
+	pool_entry_t *pe;
+	vdev_entry_t *ve;
+	config_entry_t *ce;
+	nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
+	nvlist_t **spares, **l2cache;
+	uint_t i, nspares, nl2cache;
+	boolean_t config_seen;
+	uint64_t best_txg;
+	char *name, *hostname = NULL;
+	uint64_t guid;
+	uint_t children = 0;
+	nvlist_t **child = NULL;
+	uint_t holes = 0;
+	uint64_t *hole_array = NULL, max_id = 0;
+	uint_t c;
+	boolean_t isactive;
+	uint64_t hostid = 0;
+	nvlist_t *nvl;
+	boolean_t valid_top_config = B_FALSE;
+
+	if (nvlist_alloc(&ret, 0, 0) != 0)
+		goto nomem;
+
+	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
+		uint64_t id, max_txg = 0;
+
+		/*
+		 * Start every pool from a clean slate so that nothing is
+		 * read uninitialized or inherited from the previous pool
+		 * when no config with a txg above zero is found.
+		 */
+		holes = 0;
+		hole_array = NULL;
+		max_id = 0;
+		valid_top_config = B_FALSE;
+		hostid = 0;
+		hostname = NULL;
+
+		if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
+			goto nomem;
+		config_seen = B_FALSE;
+
+		/*
+		 * Iterate over all toplevel vdevs.  Grab the pool configuration
+		 * from the first one we find, and then go through the rest and
+		 * add them as necessary to the 'vdevs' member of the config.
+		 */
+		for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
+
+			/*
+			 * Determine the best configuration for this vdev by
+			 * selecting the config with the latest transaction
+			 * group.
+			 */
+			best_txg = 0;
+			for (ce = ve->ve_configs; ce != NULL;
+			    ce = ce->ce_next) {
+
+				if (ce->ce_txg > best_txg) {
+					tmp = ce->ce_config;
+					best_txg = ce->ce_txg;
+				}
+			}
+
+			/*
+			 * We rely on the fact that the max txg for the
+			 * pool will contain the most up-to-date information
+			 * about the valid top-levels in the vdev namespace.
+			 */
+			if (best_txg > max_txg) {
+				(void) nvlist_remove(config,
+				    ZPOOL_CONFIG_VDEV_CHILDREN,
+				    DATA_TYPE_UINT64);
+				(void) nvlist_remove(config,
+				    ZPOOL_CONFIG_HOLE_ARRAY,
+				    DATA_TYPE_UINT64_ARRAY);
+
+				max_txg = best_txg;
+				hole_array = NULL;
+				holes = 0;
+				max_id = 0;
+				valid_top_config = B_FALSE;
+
+				if (nvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
+					verify(nvlist_add_uint64(config,
+					    ZPOOL_CONFIG_VDEV_CHILDREN,
+					    max_id) == 0);
+					valid_top_config = B_TRUE;
+				}
+
+				if (nvlist_lookup_uint64_array(tmp,
+				    ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
+				    &holes) == 0) {
+					verify(nvlist_add_uint64_array(config,
+					    ZPOOL_CONFIG_HOLE_ARRAY,
+					    hole_array, holes) == 0);
+				}
+			}
+
+			if (!config_seen) {
+				/*
+				 * Copy the relevant pieces of data to the pool
+				 * configuration:
+				 *
+				 *	version
+				 *	pool guid
+				 *	name
+				 *	comment (if available)
+				 *	pool state
+				 *	hostid (if available)
+				 *	hostname (if available)
+				 */
+				uint64_t state, version;
+				char *comment = NULL;
+
+				version = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_VERSION);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_VERSION, version);
+				guid = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_GUID);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, guid);
+				name = fnvlist_lookup_string(tmp,
+				    ZPOOL_CONFIG_POOL_NAME);
+				fnvlist_add_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, name);
+
+				if (nvlist_lookup_string(tmp,
+				    ZPOOL_CONFIG_COMMENT, &comment) == 0)
+					fnvlist_add_string(config,
+					    ZPOOL_CONFIG_COMMENT, comment);
+
+				state = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_STATE);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, state);
+
+				hostid = 0;
+				if (nvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+					fnvlist_add_uint64(config,
+					    ZPOOL_CONFIG_HOSTID, hostid);
+					hostname = fnvlist_lookup_string(tmp,
+					    ZPOOL_CONFIG_HOSTNAME);
+					fnvlist_add_string(config,
+					    ZPOOL_CONFIG_HOSTNAME, hostname);
+				}
+
+				config_seen = B_TRUE;
+			}
+
+			/*
+			 * Add this top-level vdev to the child array.
+			 */
+			verify(nvlist_lookup_nvlist(tmp,
+			    ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
+			verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
+			    &id) == 0);
+
+			if (id >= children) {
+				nvlist_t **newchild;
+
+				newchild = zfs_alloc(hdl, (id + 1) *
+				    sizeof (nvlist_t *));
+				if (newchild == NULL)
+					goto nomem;
+
+				for (c = 0; c < children; c++)
+					newchild[c] = child[c];
+
+				free(child);
+				child = newchild;
+				children = id + 1;
+			}
+			if (nvlist_dup(nvtop, &child[id], 0) != 0)
+				goto nomem;
+
+		}
+
+		/*
+		 * If we have information about all the top-levels then
+		 * clean up the nvlist which we've constructed.  This
+		 * means removing any extraneous devices that are
+		 * beyond the valid range or adding devices to the end
+		 * of our array which appear to be missing.
+		 */
+		if (valid_top_config) {
+			if (max_id < children) {
+				for (c = max_id; c < children; c++)
+					nvlist_free(child[c]);
+				children = max_id;
+			} else if (max_id > children) {
+				nvlist_t **newchild;
+
+				newchild = zfs_alloc(hdl, (max_id) *
+				    sizeof (nvlist_t *));
+				if (newchild == NULL)
+					goto nomem;
+
+				for (c = 0; c < children; c++)
+					newchild[c] = child[c];
+
+				free(child);
+				child = newchild;
+				children = max_id;
+			}
+		}
+
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &guid) == 0);
+
+		/*
+		 * The vdev namespace may contain holes as a result of
+		 * device removal.  We must add them back into the vdev
+		 * tree before we process any missing devices.
+		 */
+		if (holes > 0) {
+			ASSERT(valid_top_config);
+
+			for (c = 0; c < children; c++) {
+				nvlist_t *holey;
+
+				if (child[c] != NULL ||
+				    !vdev_is_hole(hole_array, holes, c))
+					continue;
+
+				if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
+				    0) != 0)
+					goto nomem;
+
+				/*
+				 * Holes in the namespace are treated as
+				 * "hole" top-level vdevs and have a
+				 * special flag set on them.
+				 */
+				if (nvlist_add_string(holey,
+				    ZPOOL_CONFIG_TYPE,
+				    VDEV_TYPE_HOLE) != 0 ||
+				    nvlist_add_uint64(holey,
+				    ZPOOL_CONFIG_ID, c) != 0 ||
+				    nvlist_add_uint64(holey,
+				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
+					nvlist_free(holey);
+					goto nomem;
+				}
+				child[c] = holey;
+			}
+		}
+
+		/*
+		 * Look for any missing top-level vdevs.  If this is the case,
+		 * create a faked up 'missing' vdev as a placeholder.  We cannot
+		 * simply compress the child array, because the kernel performs
+		 * certain checks to make sure the vdev IDs match their location
+		 * in the configuration.
+		 */
+		for (c = 0; c < children; c++) {
+			if (child[c] == NULL) {
+				nvlist_t *missing;
+				if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
+				    0) != 0)
+					goto nomem;
+				if (nvlist_add_string(missing,
+				    ZPOOL_CONFIG_TYPE,
+				    VDEV_TYPE_MISSING) != 0 ||
+				    nvlist_add_uint64(missing,
+				    ZPOOL_CONFIG_ID, c) != 0 ||
+				    nvlist_add_uint64(missing,
+				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
+					nvlist_free(missing);
+					goto nomem;
+				}
+				child[c] = missing;
+			}
+		}
+
+		/*
+		 * Put all of this pool's top-level vdevs into a root vdev.
+		 */
+		if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
+			goto nomem;
+		if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_ROOT) != 0 ||
+		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
+		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
+		    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+		    child, children) != 0) {
+			nvlist_free(nvroot);
+			goto nomem;
+		}
+
+		for (c = 0; c < children; c++)
+			nvlist_free(child[c]);
+		free(child);
+		children = 0;
+		child = NULL;
+
+		/*
+		 * Go through and fix up any paths and/or devids based on our
+		 * known list of vdev GUID -> path mappings.
+		 */
+		if (fix_paths(hdl, nvroot, pl->names) != 0) {
+			nvlist_free(nvroot);
+			goto nomem;
+		}
+
+		/*
+		 * Add the root vdev to this pool's configuration.
+		 */
+		if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    nvroot) != 0) {
+			nvlist_free(nvroot);
+			goto nomem;
+		}
+		nvlist_free(nvroot);
+
+		/*
+		 * zdb uses this path to report on active pools that were
+		 * imported or created using -R.
+		 */
+		if (active_ok)
+			goto add_pool;
+
+		/*
+		 * Determine if this pool is currently active, in which case we
+		 * can't actually import it.
+		 */
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &name) == 0);
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &guid) == 0);
+
+		if (pool_active(hdl, name, guid, &isactive) != 0)
+			goto error;
+
+		if (isactive) {
+			nvlist_free(config);
+			config = NULL;
+			continue;
+		}
+
+		if (policy != NULL) {
+			if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
+			    policy) != 0)
+				goto nomem;
+		}
+
+		if ((nvl = refresh_config(hdl, config)) == NULL) {
+			nvlist_free(config);
+			config = NULL;
+			continue;
+		}
+
+		nvlist_free(config);
+		config = nvl;
+
+		/*
+		 * Go through and update the paths for spares, now that we have
+		 * them.
+		 */
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+		    &spares, &nspares) == 0) {
+			for (i = 0; i < nspares; i++) {
+				if (fix_paths(hdl, spares[i], pl->names) != 0)
+					goto nomem;
+			}
+		}
+
+		/*
+		 * Update the paths for l2cache devices.
+		 */
+		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+		    &l2cache, &nl2cache) == 0) {
+			for (i = 0; i < nl2cache; i++) {
+				if (fix_paths(hdl, l2cache[i], pl->names) != 0)
+					goto nomem;
+			}
+		}
+
+		/*
+		 * Restore the original information read from the actual label.
+		 */
+		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
+		    DATA_TYPE_UINT64);
+		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
+		    DATA_TYPE_STRING);
+		if (hostid != 0) {
+			verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
+			    hostid) == 0);
+			verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
+			    hostname) == 0);
+		}
+
+add_pool:
+		/*
+		 * Add this pool to the list of configs.
+		 */
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &name) == 0);
+
+		if (nvlist_add_nvlist(ret, name, config) != 0)
+			goto nomem;
+
+		nvlist_free(config);
+		config = NULL;
+	}
+
+	return (ret);
+
+nomem:
+	(void) no_memory(hdl);
+error:
+	/* Release the partially built config and any pending children. */
+	nvlist_free(config);
+	nvlist_free(ret);
+	for (c = 0; c < children; c++)
+		nvlist_free(child[c]);
+	free(child);
+
+	return (NULL);
+}
+
+/*
+ * Return the byte offset of label 'l' on a device of 'size' bytes (size
+ * must already be a multiple of sizeof (vdev_label_t)).
+ */
+static uint64_t
+label_offset(uint64_t size, int l)
+{
+	uint64_t base;
+
+	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
+
+	/*
+	 * The first half of the labels is laid out from the start of the
+	 * device; the second half occupies the last VDEV_LABELS label-sized
+	 * slots, indexed by the same 'l'.
+	 */
+	if (l < VDEV_LABELS / 2)
+		base = 0;
+	else
+		base = size - VDEV_LABELS * sizeof (vdev_label_t);
+
+	return (l * sizeof (vdev_label_t) + base);
+}
+
+/*
+ * Given a file descriptor, read the label information and return an nvlist
+ * describing the configuration, if there is one.  The number of valid
+ * labels found will be returned in num_labels when non-NULL.
+ *
+ * Returns 0 (with *config possibly NULL) unless the label buffer cannot be
+ * allocated, in which case -1 is returned.  The config handed back is the
+ * first valid label read; later labels only contribute to the count when
+ * their vdev guid matches that first one.
+ */
+int
+zpool_read_label(int fd, nvlist_t **config, int *num_labels)
+{
+	struct stat64 statbuf;
+	vdev_label_t *label;
+	nvlist_t *first_config = NULL;
+	uint64_t first_guid = 0;
+	uint64_t size;
+	int nvalid = 0;
+	int idx;
+
+	*config = NULL;
+
+	if (fstat64_blk(fd, &statbuf) == -1)
+		return (0);
+	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+
+	if (posix_memalign((void **)&label, PAGESIZE, sizeof (*label)) != 0)
+		return (-1);
+
+	for (idx = 0; idx < VDEV_LABELS; idx++) {
+		uint64_t state, guid, txg;
+
+		if (pread64(fd, label, sizeof (vdev_label_t),
+		    label_offset(size, idx)) != sizeof (vdev_label_t))
+			continue;
+
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
+			continue;
+
+		/*
+		 * Discard labels without a vdev guid, with an out-of-range
+		 * pool state, or (unless spare/l2cache) without a txg.
+		 */
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
+		    &guid) != 0 || guid == 0 ||
+		    nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state > POOL_STATE_L2CACHE ||
+		    (state != POOL_STATE_SPARE &&
+		    state != POOL_STATE_L2CACHE &&
+		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0))) {
+			nvlist_free(*config);
+			continue;
+		}
+
+		if (first_guid == 0) {
+			/* First good label: remember it for the caller. */
+			first_config = *config;
+			first_guid = guid;
+			nvalid++;
+			continue;
+		}
+
+		if (first_guid == guid)
+			nvalid++;
+		nvlist_free(*config);
+	}
+
+	if (num_labels != NULL)
+		*num_labels = nvalid;
+
+	free(label);
+	*config = first_config;
+
+	return (0);
+}
+
+/*
+ * One candidate device path in the import slice cache.  Nodes live in an
+ * AVL tree ordered by slice_cache_compare() and are filled in by
+ * zpool_open_func(), which stores the label config and label count.
+ */
+typedef struct rdsk_node {
+ char *rn_name; /* Full path to device */
+ int rn_order; /* Preferred order (low to high) */
+ int rn_num_labels; /* Number of valid labels */
+ uint64_t rn_vdev_guid; /* Expected vdev guid when set */
+ libpc_handle_t *rn_hdl; /* Handle used for allocation and errors */
+ nvlist_t *rn_config; /* Label config */
+ avl_tree_t *rn_avl; /* Tree this node belongs to */
+ avl_node_t rn_node; /* AVL linkage (see avl_create() calls) */
+ pthread_mutex_t *rn_lock; /* Held around avl_find/avl_insert on rn_avl */
+ boolean_t rn_labelpaths; /* Also add the paths named in the label */
+} rdsk_node_t;
+
+/*
+ * AVL comparator for the slice cache.  Nodes are ordered first by vdev guid
+ * and then by full path name, so the same path may appear more than once
+ * provided the guids differ.  That is needed because several block devices
+ * can carry labels naming the same ZPOOL_CONFIG_PATH with different vdev
+ * guids (overwritten pool labels, devices visible from multiple hosts,
+ * multipath devices), and every such entry must be kept in the cache.
+ */
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+	const rdsk_node_t *lhs = arg1;
+	const rdsk_node_t *rhs = arg2;
+	int guid_order;
+
+	guid_order = AVL_CMP(lhs->rn_vdev_guid, rhs->rn_vdev_guid);
+	if (guid_order != 0)
+		return (guid_order);
+
+	return (AVL_ISIGN(strcmp(lhs->rn_name, rhs->rn_name)));
+}
+
+/*
+ * Return B_TRUE when the device base name is exactly "watchdog" or starts
+ * with "watchdog" immediately followed by a digit; B_FALSE otherwise.
+ */
+static boolean_t
+is_watchdog_dev(char *dev)
+{
+	static const char prefix[] = "watchdog";
+	const size_t prefix_len = sizeof (prefix) - 1;
+
+	if (strncmp(dev, prefix, prefix_len) != 0)
+		return (B_FALSE);
+
+	/* For 'watchdog' dev */
+	if (dev[prefix_len] == '\0')
+		return (B_TRUE);
+
+	/*
+	 * For 'watchdog<digit><whatever>'.  The cast keeps isdigit() well
+	 * defined when plain char is signed and the byte is >= 0x80.
+	 */
+	if (isdigit((unsigned char)dev[prefix_len]))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Walk the vdev tree rooted at nvroot looking for the leaf whose
+ * ZPOOL_CONFIG_GUID equals vdev_guid.  When found, *path and *devid are
+ * pointed at that leaf's ZPOOL_CONFIG_PATH / ZPOOL_CONFIG_DEVID strings
+ * (each only if present).  Interior nodes simply recurse into their
+ * children.  Returns 0 unless a recursive call reports an error.
+ */
+static int
+label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
+    uint64_t vdev_guid, char **path, char **devid)
+{
+	nvlist_t **kids;
+	uint_t nkids, k;
+	uint64_t guid;
+	char *str;
+
+	if (nvroot == NULL)
+		return (0);
+
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0) {
+		for (k = 0; k < nkids; k++) {
+			int rc = label_paths_impl(hdl, kids[k], pool_guid,
+			    vdev_guid, path, devid);
+
+			if (rc != 0)
+				return (rc);
+		}
+		return (0);
+	}
+
+	/* Leaf vdev: only the one carrying the wanted guid is of interest. */
+	if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+	    guid != vdev_guid)
+		return (0);
+
+	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &str) == 0)
+		*path = str;
+
+	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &str) == 0)
+		*devid = str;
+
+	return (0);
+}
+
+/*
+ * Given a disk label, fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
+ * of the vdev the label belongs to and return them through path and devid.
+ * Either is left NULL when not present.  The returned pointers reference
+ * storage inside the label and are only valid while the label is.
+ *
+ * Returns ENOENT when the label lacks a vdev tree, pool guid or vdev guid.
+ */
+static int
+label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid)
+{
+	nvlist_t *tree;
+	uint64_t pool_guid, vdev_guid;
+
+	*path = NULL;
+	*devid = NULL;
+
+	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0)
+		return (ENOENT);
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0)
+		return (ENOENT);
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (ENOENT);
+
+	return (label_paths_impl(hdl, tree, pool_guid, vdev_guid, path,
+	    devid));
+}
+
+static void zpool_open_func(void *arg);
+
+/*
+ * Add one extra candidate path discovered in rn's label to the slice cache
+ * and, if it was not already present, probe it immediately.  Takes ownership
+ * of 'name' (a malloc'd string): it is either stored in the new node or
+ * freed.
+ */
+static void
+zpool_open_func_add_path(rdsk_node_t *rn, char *name, uint64_t vdev_guid,
+    int order)
+{
+	rdsk_node_t *slice;
+	avl_index_t where;
+
+	slice = zfs_alloc(rn->rn_hdl, sizeof (rdsk_node_t));
+	if (slice == NULL) {
+		free(name);
+		return;
+	}
+
+	slice->rn_name = name;
+	slice->rn_vdev_guid = vdev_guid;
+	slice->rn_lock = rn->rn_lock;
+	slice->rn_avl = rn->rn_avl;
+	slice->rn_hdl = rn->rn_hdl;
+	slice->rn_order = order;
+	slice->rn_labelpaths = B_FALSE;
+
+	pthread_mutex_lock(rn->rn_lock);
+	if (avl_find(rn->rn_avl, slice, &where)) {
+		pthread_mutex_unlock(rn->rn_lock);
+		free(slice->rn_name);
+		free(slice);
+		return;
+	}
+	avl_insert(rn->rn_avl, slice, where);
+	pthread_mutex_unlock(rn->rn_lock);
+
+	zpool_open_func(slice);
+}
+
+/*
+ * Thread pool worker: probe the device named by the rdsk_node_t in 'arg'.
+ * On success the label config and the number of valid labels are stored in
+ * the node; on any failure the node is left without a config.  When
+ * rn_labelpaths is set, the path and devid recorded in the label are added
+ * to the cache as additional candidates and probed as well.
+ */
+static void
+zpool_open_func(void *arg)
+{
+	rdsk_node_t *rn = arg;
+	libpc_handle_t *hdl = rn->rn_hdl;
+	struct stat64 statbuf;
+	nvlist_t *config;
+	char *bname, *dupname;
+	char *path = NULL;
+	char *devid = NULL;
+	uint64_t vdev_guid = 0;
+	int error;
+	int num_labels = 0;
+	int fd;
+
+	/*
+	 * Skip devices with well known prefixes there can be side effects
+	 * when opening devices which need to be avoided.
+	 *
+	 * hpet     - High Precision Event Timer
+	 * watchdog - Watchdog must be closed in a special way.
+	 */
+	dupname = zfs_strdup(hdl, rn->rn_name);
+	if (dupname == NULL)
+		return;
+	bname = basename(dupname);
+	error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
+	free(dupname);
+	if (error)
+		return;
+
+	/*
+	 * Ignore failed stats.  We only want regular files and block devices.
+	 */
+	if (stat64(rn->rn_name, &statbuf) != 0 ||
+	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
+		return;
+
+	/*
+	 * Preferentially open using O_DIRECT to bypass the block device
+	 * cache which may be stale for multipath devices.  An EINVAL errno
+	 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
+	 */
+	fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
+	if ((fd < 0) && (errno == EINVAL))
+		fd = open(rn->rn_name, O_RDONLY);
+
+	if ((fd < 0) && (errno == EACCES))
+		hdl->lpc_open_access_error = B_TRUE;
+
+	if (fd < 0)
+		return;
+
+	/*
+	 * This file is too small to hold a zpool
+	 */
+	if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	}
+
+	error = zpool_read_label(fd, &config, &num_labels);
+	if (error != 0) {
+		(void) close(fd);
+		return;
+	}
+
+	if (num_labels == 0) {
+		(void) close(fd);
+		nvlist_free(config);
+		return;
+	}
+
+	/*
+	 * Check that the vdev is for the expected guid.  Additional entries
+	 * are speculatively added based on the paths stored in the labels.
+	 * Entries with valid paths but incorrect guids must be removed.
+	 */
+	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
+		(void) close(fd);
+		nvlist_free(config);
+		return;
+	}
+
+	(void) close(fd);
+
+	rn->rn_config = config;
+	rn->rn_num_labels = num_labels;
+
+	/*
+	 * Add additional entries for paths described by this label.
+	 */
+	if (!rn->rn_labelpaths)
+		return;
+
+	if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
+		return;
+
+	/*
+	 * Allow devlinks to stabilize so all paths are available.
+	 */
+	zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);
+
+	if (path != NULL) {
+		char *name = zfs_strdup(hdl, path);
+
+		if (name != NULL)
+			zpool_open_func_add_path(rn, name, vdev_guid,
+			    IMPORT_ORDER_PREFERRED_1);
+	}
+
+	if (devid != NULL) {
+		char *name;
+
+		if (asprintf(&name, "%s%s", DEV_BYID_PATH, devid) == -1)
+			return;
+
+		zpool_open_func_add_path(rn, name, vdev_guid,
+		    IMPORT_ORDER_PREFERRED_2);
+	}
+}
+
+/*
+ * Add the device "path/name" to the slice cache with the given search
+ * order (offset by IMPORT_ORDER_SCAN_OFFSET).  Entries already present are
+ * discarded.  Allocation failures are silently ignored and simply leave
+ * the device out of the cache.
+ */
+static void
+zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *path, const char *name, int order)
+{
+	avl_index_t where;
+	rdsk_node_t *slice;
+
+	slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+	if (slice == NULL)
+		return;
+
+	if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
+		free(slice);
+		return;
+	}
+	slice->rn_vdev_guid = 0;
+	slice->rn_lock = lock;
+	slice->rn_avl = cache;
+	slice->rn_hdl = hdl;
+	slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
+	slice->rn_labelpaths = B_FALSE;
+
+	pthread_mutex_lock(lock);
+	if (avl_find(cache, slice, &where)) {
+		free(slice->rn_name);
+		free(slice);
+	} else {
+		avl_insert(cache, slice, where);
+	}
+	pthread_mutex_unlock(lock);
+}
+
+/*
+ * Add every entry of directory 'dir' (except "." and "..") to the slice
+ * cache with the given order.  A directory that does not exist is not an
+ * error.  Returns 0 on success or the errno from realpath()/opendir().
+ */
+static int
+zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *dir, int order)
+{
+	int error;
+	char path[MAXPATHLEN];
+	struct dirent64 *dp;
+	DIR *dirp;
+
+	if (realpath(dir, path) == NULL) {
+		error = errno;
+		if (error == ENOENT)
+			return (0);
+
+		/* Never pass strerror() output as the format string. */
+		zfs_error_aux(hdl, "%s", strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
+		return (error);
+	}
+
+	dirp = opendir(path);
+	if (dirp == NULL) {
+		error = errno;
+		zfs_error_aux(hdl, "%s", strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
+		return (error);
+	}
+
+	while ((dp = readdir64(dirp)) != NULL) {
+		const char *name = dp->d_name;
+		if (name[0] == '.' &&
+		    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
+			continue;
+
+		zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
+		    order);
+	}
+
+	(void) closedir(dirp);
+	return (0);
+}
+
+/*
+ * Add the single (non-directory) path 'dir' to the slice cache with the
+ * given order.  A missing parent directory is not an error.  Returns 0 on
+ * success, ENOMEM if the path cannot be duplicated, or the errno from
+ * realpath().
+ */
+static int
+zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *dir, int order)
+{
+	int error = 0;
+	char path[MAXPATHLEN];
+	char *d, *b;
+	char *dpath, *name;
+
+	/*
+	 * Separate the directory part and last part of the
+	 * path.  We do this so that we can get the realpath of
+	 * the directory.  We don't get the realpath on the
+	 * whole path because if it's a symlink, we want the
+	 * path of the symlink not where it points to.
+	 */
+	d = zfs_strdup(hdl, dir);
+	b = zfs_strdup(hdl, dir);
+	if (d == NULL || b == NULL) {
+		error = ENOMEM;
+		goto out;
+	}
+	dpath = dirname(d);
+	name = basename(b);
+
+	if (realpath(dpath, path) == NULL) {
+		error = errno;
+		if (error == ENOENT) {
+			error = 0;
+			goto out;
+		}
+
+		/* Never pass strerror() output as the format string. */
+		zfs_error_aux(hdl, "%s", strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
+		goto out;
+	}
+
+	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
+
+out:
+	free(b);
+	free(d);
+	return (error);
+}
+
+/*
+ * Scan a list of directories for zfs devices.
+ *
+ * Each of the 'dirs' entries in 'dir' may be a directory (all of its
+ * entries are added) or a single path; its index is used as the search
+ * order.  Entries that do not exist are skipped.  On success a newly
+ * allocated AVL tree is returned in *slice_cache and 0 is returned; on
+ * failure *slice_cache is NULL and an errno value is returned.
+ */
+static int
+zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t **slice_cache, char **dir, int dirs)
+{
+	avl_tree_t *cache;
+	rdsk_node_t *slice;
+	void *cookie;
+	int i, error;
+
+	*slice_cache = NULL;
+	cache = zfs_alloc(hdl, sizeof (avl_tree_t));
+	if (cache == NULL)
+		return (ENOMEM);
+	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
+	    offsetof(rdsk_node_t, rn_node));
+
+	for (i = 0; i < dirs; i++) {
+		struct stat sbuf;
+
+		if (stat(dir[i], &sbuf) != 0) {
+			error = errno;
+			if (error == ENOENT)
+				continue;
+
+			/* Never pass strerror() output as the format. */
+			zfs_error_aux(hdl, "%s", strerror(error));
+			(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
+			goto error;
+		}
+
+		/*
+		 * If dir[i] is a directory, we walk through it and add all
+		 * the entries to the cache.  If it's not a directory, we just
+		 * add it to the cache.
+		 */
+		if (S_ISDIR(sbuf.st_mode)) {
+			if ((error = zpool_find_import_scan_dir(hdl, lock,
+			    cache, dir[i], i)) != 0)
+				goto error;
+		} else {
+			if ((error = zpool_find_import_scan_path(hdl, lock,
+			    cache, dir[i], i)) != 0)
+				goto error;
+		}
+	}
+
+	*slice_cache = cache;
+	return (0);
+
+error:
+	cookie = NULL;
+	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
+		free(slice->rn_name);
+		free(slice);
+	}
+	free(cache);
+
+	return (error);
+}
+
+/*
+ * Default directories searched for pool member devices.  The array index
+ * doubles as the search order assigned to devices found under a directory
+ * (see zpool_find_import_scan() and zfs_path_order()), so earlier entries
+ * are preferred over later ones.
+ */
+static char *
+zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
+ "/dev/disk/by-vdev", /* Custom rules, use first if they exist */
+ "/dev/mapper", /* Use multipath devices before components */
+ "/dev/disk/by-partlabel", /* Single unique entry set by user */
+ "/dev/disk/by-partuuid", /* Generated partition uuid */
+ "/dev/disk/by-label", /* Custom persistent labels */
+ "/dev/disk/by-uuid", /* Single unique entry and persistent */
+ "/dev/disk/by-id", /* May be multiple entries and persistent */
+ "/dev/disk/by-path", /* Encodes physical location and persistent */
+ "/dev" /* UNSAFE device names will change */
+};
+
+/*
+ * Return the built-in list of import search directories and store the
+ * number of entries in *count.  The returned array is static storage.
+ */
+const char * const *
+zpool_default_search_paths(size_t *count)
+{
+	const char * const *paths =
+	    (const char * const *)zpool_default_import_path;
+
+	*count = DEFAULT_IMPORT_PATH_SIZE;
+
+	return (paths);
+}
+
+/*
+ * Given a full path to a device determine if that device appears in the
+ * import search path.  If it does return the first match and store the
+ * index in the passed 'order' variable, otherwise return an error.
+ *
+ * The search path is the colon separated ZPOOL_IMPORT_PATH environment
+ * variable when set, else zpool_default_import_path.  A directory matches
+ * when it is a leading prefix of 'name'.  Returns 0 on a match, ENOENT
+ * when nothing matches, or ENOMEM if the environment value cannot be
+ * duplicated; *order is only written on a match.
+ */
+static int
+zfs_path_order(char *name, int *order)
+{
+	int i, error = ENOENT;
+	char *env;
+
+	env = getenv("ZPOOL_IMPORT_PATH");
+	if (env != NULL) {
+		char *envdup, *dir, *state = NULL;
+
+		envdup = strdup(env);
+		if (envdup == NULL)
+			return (ENOMEM);
+
+		/*
+		 * strtok_r() keeps its position in 'state' rather than in
+		 * hidden static storage, so concurrent tokenizing elsewhere
+		 * in the process cannot disturb this walk.
+		 */
+		for (i = 0, dir = strtok_r(envdup, ":", &state); dir != NULL;
+		    i++, dir = strtok_r(NULL, ":", &state)) {
+			if (strncmp(name, dir, strlen(dir)) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+		free(envdup);
+	} else {
+		for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
+			if (strncmp(name, zpool_default_import_path[i],
+			    strlen(zpool_default_import_path[i])) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Use libblkid to quickly enumerate all known zfs devices.
+ *
+ * On success a newly allocated AVL tree holding one node per device whose
+ * blkid TYPE is "zfs_member" is returned in *slice_cache and 0 is returned.
+ * On failure *slice_cache is NULL and a non-zero value is returned.  A
+ * device whose node cannot be allocated is skipped.
+ */
+static int
+zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t **slice_cache)
+{
+	rdsk_node_t *slice;
+	blkid_cache cache;
+	blkid_dev_iterate iter;
+	blkid_dev dev;
+	avl_index_t where;
+	int error;
+
+	*slice_cache = NULL;
+
+	error = blkid_get_cache(&cache, NULL);
+	if (error != 0)
+		return (error);
+
+	error = blkid_probe_all_new(cache);
+	if (error != 0) {
+		blkid_put_cache(cache);
+		return (error);
+	}
+
+	iter = blkid_dev_iterate_begin(cache);
+	if (iter == NULL) {
+		blkid_put_cache(cache);
+		return (EINVAL);
+	}
+
+	error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
+	if (error != 0) {
+		blkid_dev_iterate_end(iter);
+		blkid_put_cache(cache);
+		return (error);
+	}
+
+	*slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
+	if (*slice_cache == NULL) {
+		blkid_dev_iterate_end(iter);
+		blkid_put_cache(cache);
+		return (ENOMEM);
+	}
+	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
+	    offsetof(rdsk_node_t, rn_node));
+
+	while (blkid_dev_next(iter, &dev) == 0) {
+		slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+		if (slice == NULL)
+			continue;
+
+		slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
+		if (slice->rn_name == NULL) {
+			free(slice);
+			continue;
+		}
+		slice->rn_vdev_guid = 0;
+		slice->rn_lock = lock;
+		slice->rn_avl = *slice_cache;
+		slice->rn_hdl = hdl;
+		slice->rn_labelpaths = B_TRUE;
+
+		/* Devices outside the search path get the default order. */
+		error = zfs_path_order(slice->rn_name, &slice->rn_order);
+		if (error == 0)
+			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
+		else
+			slice->rn_order = IMPORT_ORDER_DEFAULT;
+
+		pthread_mutex_lock(lock);
+		if (avl_find(*slice_cache, slice, &where)) {
+			free(slice->rn_name);
+			free(slice);
+		} else {
+			avl_insert(*slice_cache, slice, where);
+		}
+		pthread_mutex_unlock(lock);
+	}
+
+	blkid_dev_iterate_end(iter);
+	blkid_put_cache(cache);
+
+	return (0);
+}
+
+/*
+ * Find all pools stored on disk.  This includes partial pools which are
+ * not available to import.  When iarg->scan is set or iarg->paths is
+ * non-zero the given paths are scanned (zpool_default_import_path when no
+ * paths were given); otherwise libblkid is used to enumerate candidate
+ * devices.  poolname or guid (but not both) are provided by the caller
+ * when trying to import a specific pool.
+ *
+ * Returns the nvlist produced by get_configs(), or NULL on failure.
+ */
+static nvlist_t *
+zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg)
+{
+	nvlist_t *ret = NULL;
+	pool_list_t pools = { 0 };
+	pool_entry_t *pe, *penext;
+	vdev_entry_t *ve, *venext;
+	config_entry_t *ce, *cenext;
+	name_entry_t *ne, *nenext;
+	pthread_mutex_t lock;
+	avl_tree_t *cache;
+	rdsk_node_t *slice;
+	void *cookie;
+	tpool_t *t;
+
+	verify(iarg->poolname == NULL || iarg->guid == 0);
+	pthread_mutex_init(&lock, NULL);
+
+	/*
+	 * Locate pool member vdevs using libblkid or by directory scanning.
+	 * On success a newly allocated AVL tree which is populated with an
+	 * entry for each discovered vdev will be returned as the cache.
+	 * It's the callers responsibility to consume and destroy this tree.
+	 */
+	if (iarg->scan || iarg->paths != 0) {
+		int dirs = iarg->paths;
+		char **dir = iarg->path;
+
+		if (dirs == 0) {
+			dir = zpool_default_import_path;
+			dirs = DEFAULT_IMPORT_PATH_SIZE;
+		}
+
+		if (zpool_find_import_scan(hdl, &lock, &cache, dir,
+		    dirs) != 0) {
+			(void) pthread_mutex_destroy(&lock);
+			return (NULL);
+		}
+	} else {
+		if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) {
+			(void) pthread_mutex_destroy(&lock);
+			return (NULL);
+		}
+	}
+
+	/*
+	 * Create a thread pool to parallelize the process of reading and
+	 * validating labels, a large number of threads can be used due to
+	 * minimal contention.
+	 */
+	t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
+	for (slice = avl_first(cache); slice;
+	    (slice = avl_walk(cache, slice, AVL_AFTER)))
+		(void) tpool_dispatch(t, zpool_open_func, slice);
+
+	tpool_wait(t);
+	tpool_destroy(t);
+
+	/*
+	 * Process the cache filtering out any entries which are not
+	 * for the specified pool then adding matching label configs.
+	 */
+	cookie = NULL;
+	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
+		if (slice->rn_config != NULL) {
+			nvlist_t *config = slice->rn_config;
+			boolean_t matched = B_TRUE;
+			boolean_t aux = B_FALSE;
+			int fd;
+
+			/*
+			 * Check if it's a spare or l2cache device.  If it is,
+			 * we need to skip the name and guid check since they
+			 * don't exist on aux device label.
+			 */
+			if (iarg->poolname != NULL || iarg->guid != 0) {
+				uint64_t state;
+				aux = nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
+				    (state == POOL_STATE_SPARE ||
+				    state == POOL_STATE_L2CACHE);
+			}
+
+			if (iarg->poolname != NULL && !aux) {
+				char *pname;
+
+				matched = nvlist_lookup_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
+				    strcmp(iarg->poolname, pname) == 0;
+			} else if (iarg->guid != 0 && !aux) {
+				uint64_t this_guid;
+
+				matched = nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
+				    iarg->guid == this_guid;
+			}
+			if (matched) {
+				/*
+				 * Verify all remaining entries can be opened
+				 * exclusively.  This will prune all underlying
+				 * multipath devices which otherwise could
+				 * result in the vdev appearing as UNAVAIL.
+				 *
+				 * Under zdb, this step isn't required and
+				 * would prevent a zdb -e of active pools with
+				 * no cachefile.
+				 */
+				fd = open(slice->rn_name, O_RDONLY | O_EXCL);
+				if (fd >= 0 || iarg->can_be_active) {
+					if (fd >= 0)
+						close(fd);
+					add_config(hdl, &pools,
+					    slice->rn_name, slice->rn_order,
+					    slice->rn_num_labels, config);
+				}
+			}
+			nvlist_free(config);
+		}
+		free(slice->rn_name);
+		free(slice);
+	}
+	avl_destroy(cache);
+	free(cache);
+	pthread_mutex_destroy(&lock);
+
+	ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
+
+	/* Tear down the intermediate pool/vdev/config and name lists. */
+	for (pe = pools.pools; pe != NULL; pe = penext) {
+		penext = pe->pe_next;
+		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
+			venext = ve->ve_next;
+			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
+				cenext = ce->ce_next;
+				nvlist_free(ce->ce_config);
+				free(ce);
+			}
+			free(ve);
+		}
+		free(pe);
+	}
+
+	for (ne = pools.names; ne != NULL; ne = nenext) {
+		nenext = ne->ne_next;
+		free(ne->ne_name);
+		free(ne);
+	}
+
+	return (ret);
+}
+
+/*
+ * Given a cache file, return the contents as a list of importable pools.
+ * poolname or guid (but not both) are provided by the caller when trying
+ * to import a specific pool.
+ *
+ * Pools reported active by pool_active() are left out.  Returns NULL when
+ * the cache file cannot be read or unpacked, or when checking or refreshing
+ * any selected pool fails.
+ */
+static nvlist_t *
+zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile,
+    const char *poolname, uint64_t guid)
+{
+	struct stat64 statbuf;
+	nvlist_t *raw, *pools;
+	nvpair_t *elem = NULL;
+	char *buf;
+	int fd;
+
+	verify(poolname == NULL || guid == 0);
+
+	fd = open(cachefile, O_RDONLY);
+	if (fd < 0) {
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "failed to open cache file"));
+		return (NULL);
+	}
+
+	if (fstat64(fd, &statbuf) != 0) {
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		(void) close(fd);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
+		return (NULL);
+	}
+
+	buf = zfs_alloc(hdl, statbuf.st_size);
+	if (buf == NULL) {
+		(void) close(fd);
+		return (NULL);
+	}
+
+	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
+		(void) close(fd);
+		free(buf);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN,
+		    "failed to read cache file contents"));
+		return (NULL);
+	}
+
+	(void) close(fd);
+
+	if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
+		free(buf);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN,
+		    "invalid or corrupt cache file contents"));
+		return (NULL);
+	}
+
+	free(buf);
+
+	/*
+	 * Go through and get the current state of the pools and refresh their
+	 * state.
+	 */
+	if (nvlist_alloc(&pools, 0, 0) != 0) {
+		(void) no_memory(hdl);
+		nvlist_free(raw);
+		return (NULL);
+	}
+
+	while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
+		nvlist_t *src = fnvpair_value_nvlist(elem);
+		nvlist_t *dst;
+		char *name;
+		uint64_t this_guid;
+		boolean_t active;
+
+		/* Skip pools the caller did not ask for. */
+		name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
+		if (poolname != NULL && strcmp(poolname, name) != 0)
+			continue;
+
+		this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
+		if (guid != 0 && guid != this_guid)
+			continue;
+
+		if (pool_active(hdl, name, this_guid, &active) != 0)
+			goto fail;
+
+		if (active)
+			continue;
+
+		if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
+		    cachefile) != 0) {
+			(void) no_memory(hdl);
+			goto fail;
+		}
+
+		dst = refresh_config(hdl, src);
+		if (dst == NULL)
+			goto fail;
+
+		if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
+			(void) no_memory(hdl);
+			nvlist_free(dst);
+			goto fail;
+		}
+		nvlist_free(dst);
+	}
+
+	nvlist_free(raw);
+	return (pools);
+
+fail:
+	nvlist_free(raw);
+	nvlist_free(pools);
+	return (NULL);
+}
+
+/*
+ * Search for importable pools on behalf of a libzfs or libzpool consumer.
+ * 'hdl' is the consumer's opaque library handle and 'pco' its callback
+ * table.  The cache file named in 'import' is used when set; otherwise
+ * devices are searched.  If nothing was found, some device could not be
+ * opened for lack of permission, and the caller is not root, an
+ * EZFS_EACESS error is reported.  Returns the pool list (may be NULL).
+ */
+nvlist_t *
+zpool_search_import(void *hdl, importargs_t *import,
+    const pool_config_ops_t *pco)
+{
+	libpc_handle_t handle = { 0 };
+	nvlist_t *pools;
+	boolean_t nothing_found;
+
+	handle.lpc_lib_handle = hdl;
+	handle.lpc_ops = pco;
+	handle.lpc_printerr = B_TRUE;
+
+	verify(import->poolname == NULL || import->guid == 0);
+
+	if (import->cachefile == NULL) {
+		pools = zpool_find_import_impl(&handle, import);
+	} else {
+		pools = zpool_find_import_cached(&handle, import->cachefile,
+		    import->poolname, import->guid);
+	}
+
+	nothing_found = (pools == NULL || nvlist_empty(pools));
+	if (nothing_found && handle.lpc_open_access_error &&
+	    geteuid() != 0) {
+		(void) zfs_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN,
+		    "no pools found"));
+	}
+
+	return (pools);
+}
+
+/*
+ * Decide whether pool config 'cfg' is the pool named by 'tgt'.  When tgt
+ * parses (via strtoull, base auto-detected) to a non-zero number it is
+ * compared with the pool guid; otherwise it is compared with the pool name.
+ * A config lacking the relevant field never matches.
+ */
+static boolean_t
+pool_match(nvlist_t *cfg, char *tgt)
+{
+	uint64_t want_guid = strtoull(tgt, NULL, 0);
+	uint64_t have_guid;
+	char *pool_name;
+
+	if (want_guid == 0) {
+		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME,
+		    &pool_name) != 0)
+			return (B_FALSE);
+		return (strcmp(pool_name, tgt) == 0);
+	}
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID,
+	    &have_guid) != 0)
+		return (B_FALSE);
+	return (have_guid == want_guid);
+}
+
+/*
+ * Locate the single importable pool named by 'target' (a pool name or guid,
+ * optionally followed by a "/dataset" or "@snapshot" suffix, which is
+ * ignored) and return its config through *configp.
+ *
+ * Returns 0 on success, ENOENT when no pool matches, EINVAL when more than
+ * one pool matches, or ENOMEM if 'target' cannot be duplicated.  On success
+ * *configp is an independent copy that the caller owns and should release
+ * with nvlist_free(); on failure *configp is NULL.
+ */
+int
+zpool_find_config(void *hdl, const char *target, nvlist_t **configp,
+    importargs_t *args, const pool_config_ops_t *pco)
+{
+	nvlist_t *pools;
+	nvlist_t *match = NULL;
+	nvlist_t *config = NULL;
+	nvpair_t *elem = NULL;
+	char *sepp;
+	char *targetdup;
+	int count = 0;
+
+	*configp = NULL;
+
+	targetdup = strdup(target);
+	if (targetdup == NULL)
+		return (ENOMEM);
+
+	/* Only the pool component of the target is relevant here. */
+	if ((sepp = strpbrk(targetdup, "/@")) != NULL)
+		*sepp = '\0';
+
+	pools = zpool_search_import(hdl, args, pco);
+
+	while (pools != NULL &&
+	    (elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+		VERIFY0(nvpair_value_nvlist(elem, &config));
+		if (pool_match(config, targetdup)) {
+			count++;
+			/* Keep the first match; later ones only count. */
+			if (match == NULL)
+				match = config;
+		}
+	}
+	free(targetdup);
+
+	if (count != 1) {
+		nvlist_free(pools);
+		return (count == 0 ? ENOENT : EINVAL);
+	}
+
+	/*
+	 * 'match' points into 'pools'.  Hand back a private copy so the
+	 * search results can be released instead of leaked.
+	 */
+	*configp = fnvlist_dup(match);
+	nvlist_free(pools);
+
+	return (0);
+}
diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c
new file mode 100644
index 000000000..9a81011fc
--- /dev/null
+++ b/lib/libzutil/zutil_nicenum.c
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <libzutil.h>
+
+/*
+ * Convert a number to an appropriately human-readable output.
+ *
+ * num:    the value to format
+ * buf:    destination buffer; every write goes through snprintf() bounded
+ *         by 'buflen'
+ * format: selects the style
+ *	ZFS_NICENUM_1024	scale by 1024; suffixes "", K, M, G, T, P, E
+ *	ZFS_NICENUM_BYTES	as above, but unscaled values get a "B" suffix
+ *	ZFS_NICENUM_TIME	scale by 1000; suffixes ns, us, ms, s;
+ *				a value of 0 is written as "-"
+ *	ZFS_NICENUM_RAW		plain decimal
+ *	ZFS_NICENUM_RAWTIME	plain decimal; a value of 0 is written as "-"
+ * The lookup tables below only have entries for the first three styles and
+ * the two RAW styles return early, so no other value may be passed.
+ */
+void
+zfs_nicenum_format(uint64_t num, char *buf, size_t buflen,
+    enum zfs_nicenum_format format)
+{
+	uint64_t n = num;
+	uint64_t divisor;
+	int index = 0;
+	const char *u;
+	const char *units[3][7] = {
+	    [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"},
+	    [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"},
+	    [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"}
+	};
+
+	const int units_len[] = {[ZFS_NICENUM_1024] = 6,
+	    [ZFS_NICENUM_BYTES] = 6,
+	    [ZFS_NICENUM_TIME] = 4};
+
+	const int k_unit[] = {	[ZFS_NICENUM_1024] = 1024,
+	    [ZFS_NICENUM_BYTES] = 1024,
+	    [ZFS_NICENUM_TIME] = 1000};
+
+	if (format == ZFS_NICENUM_RAW) {
+		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
+		return;
+	} else if (format == ZFS_NICENUM_RAWTIME && num > 0) {
+		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
+		return;
+	} else if (format == ZFS_NICENUM_RAWTIME && num == 0) {
+		(void) snprintf(buf, buflen, "%s", "-");
+		return;
+	}
+
+	/* pick the largest unit that leaves at least one whole unit in 'n' */
+	while (n >= k_unit[format] && index < units_len[format]) {
+		n /= k_unit[format];
+		index++;
+	}
+
+	u = units[format][index];
+
+	/* k_unit^index, i.e. how many base units make up one 'u' */
+	divisor = (uint64_t)powl(k_unit[format], index);
+
+	/* Don't print zero latencies since they're invalid */
+	if ((format == ZFS_NICENUM_TIME) && (num == 0)) {
+		(void) snprintf(buf, buflen, "-");
+	} else if ((index == 0) || ((num % divisor) == 0)) {
+		/*
+		 * If this is an even multiple of the base, always display
+		 * without any decimal precision.
+		 */
+		(void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u);
+
+	} else {
+		double val = (double)num / divisor;
+
+		if (format == ZFS_NICENUM_TIME) {
+			/*
+			 * Don't print floating point values for time. Note,
+			 * we use floor() instead of round() here, since
+			 * round can result in undesirable results. For
+			 * example, if "num" is in the range of
+			 * 999500-999999, it will print out "1000us". This
+			 * doesn't happen if we use floor().
+			 */
+			(void) snprintf(buf, buflen, "%u%s",
+			    (unsigned int) floor(val), u);
+		} else {
+			/*
+			 * We want to choose a precision that reflects the
+			 * best choice for fitting in 5 characters. This can
+			 * get rather tricky when we have numbers that are
+			 * very close to an order of magnitude. For example,
+			 * when displaying 10239 (which is really 9.999K),
+			 * we want only a single place of precision for
+			 * 10.0K. We could develop some complex heuristics
+			 * for this, but it's much easier just to try each
+			 * combination in turn.
+			 */
+			int i;
+
+			for (i = 2; i >= 0; i--) {
+				if (snprintf(buf, buflen, "%.*f%s", i,
+				    val, u) <= 5)
+					break;
+			}
+		}
+	}
+}
+
+/*
+ * Format 'num' into 'buf' (bounded by 'buflen') as a human-readable count.
+ * This is zfs_nicenum_format() with the ZFS_NICENUM_1024 style.
+ */
+void
+zfs_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format style = ZFS_NICENUM_1024;
+
+	zfs_nicenum_format(num, buf, buflen, style);
+}
+
+/*
+ * Format a time value into 'buf' (bounded by 'buflen') as human-readable
+ * text.  This is zfs_nicenum_format() with the ZFS_NICENUM_TIME style.
+ * @num: Time in nanoseconds
+ */
+void
+zfs_nicetime(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format style = ZFS_NICENUM_TIME;
+
+	zfs_nicenum_format(num, buf, buflen, style);
+}
+
+/*
+ * Format 'num' into 'buf' (bounded by 'buflen') as a raw, unscaled number.
+ * This is zfs_nicenum_format() with the ZFS_NICENUM_RAW style; nothing is
+ * printed by this function itself.
+ */
+void
+zfs_niceraw(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format style = ZFS_NICENUM_RAW;
+
+	zfs_nicenum_format(num, buf, buflen, style);
+}
+
+/*
+ * Format a byte count into 'buf' (bounded by 'buflen') as human-readable
+ * text.  This is zfs_nicenum_format() with the ZFS_NICENUM_BYTES style.
+ */
+void
+zfs_nicebytes(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format style = ZFS_NICENUM_BYTES;
+
+	zfs_nicenum_format(num, buf, buflen, style);
+}
diff --git a/lib/libzutil/zutil_pool.c b/lib/libzutil/zutil_pool.c
new file mode 100644
index 000000000..734650f3c
--- /dev/null
+++ b/lib/libzutil/zutil_pool.c
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/nvpair.h>
+#include <sys/fs/zfs.h>
+
+#include <libzutil.h>
+
+/*
+ * Print one row of the DDT table for 'dds' on stdout.
+ *
+ * 'h' selects the row label: -1 produces "Total", any other value produces
+ * 1 << h formatted by zfs_nicenum().  Nothing is printed when 'dds' is NULL
+ * or its dds_blocks count is zero.
+ */
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+	char label[6];
+	char a_blocks[6], a_lsize[6], a_psize[6], a_dsize[6];
+	char r_blocks[6], r_lsize[6], r_psize[6], r_dsize[6];
+
+	if (dds == NULL)
+		return;
+	if (dds->dds_blocks == 0)
+		return;
+
+	if (h != -1)
+		zfs_nicenum(1ULL << h, label, sizeof (label));
+	else
+		(void) strcpy(label, "Total");
+
+	/* "allocated" columns */
+	zfs_nicenum(dds->dds_blocks, a_blocks, sizeof (a_blocks));
+	zfs_nicebytes(dds->dds_lsize, a_lsize, sizeof (a_lsize));
+	zfs_nicebytes(dds->dds_psize, a_psize, sizeof (a_psize));
+	zfs_nicebytes(dds->dds_dsize, a_dsize, sizeof (a_dsize));
+
+	/* "referenced" columns */
+	zfs_nicenum(dds->dds_ref_blocks, r_blocks, sizeof (r_blocks));
+	zfs_nicebytes(dds->dds_ref_lsize, r_lsize, sizeof (r_lsize));
+	zfs_nicebytes(dds->dds_ref_psize, r_psize, sizeof (r_psize));
+	zfs_nicebytes(dds->dds_ref_dsize, r_dsize, sizeof (r_dsize));
+
+	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+	    label,
+	    a_blocks, a_lsize, a_psize, a_dsize,
+	    r_blocks, r_lsize, r_psize, r_dsize);
+}
+
+/*
+ * Print the DDT histogram and the column totals on stdout.
+ *
+ * The output is a blank line, a four-line header, one row per entry of
+ * ddh->ddh_stat[0..63] (rows for empty buckets are skipped by
+ * dump_ddt_stat()), a "Total" row built from 'dds_total', and a closing
+ * blank line.
+ */
+void
+zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
+{
+	int bucket = 0;
+
+	(void) printf("\n");
+
+	/* column group titles and their underlines */
+	(void) printf("bucket "
+	    " allocated "
+	    " referenced \n");
+	(void) printf("______ "
+	    "______________________________ "
+	    "______________________________\n");
+
+	/* per-column titles and their separators */
+	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+	    "refcnt",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE");
+
+	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+	    "------",
+	    "------", "-----", "-----", "-----",
+	    "------", "-----", "-----", "-----");
+
+	/* one row per histogram bucket, labelled with 1 << bucket */
+	while (bucket < 64) {
+		dump_ddt_stat(&ddh->ddh_stat[bucket], bucket);
+		bucket++;
+	}
+
+	dump_ddt_stat(dds_total, -1);
+
+	(void) printf("\n");
+}
+
+/*
+ * Process the buffer of nvlists, unpacking and storing each nvlist record
+ * into 'records'. 'leftover' is set to the number of bytes that weren't
+ * processed as there wasn't a complete record.
+ *
+ * buf:        packed history data; each record is an 8-byte little-endian
+ *             length followed by that many bytes of packed nvlist
+ * bytes_read: number of valid bytes in 'buf'
+ * leftover:   written only on a successful (0) return
+ * records:    in/out array of unpacked nvlists; grown with realloc()
+ * numrecords: in/out count of valid entries in *records
+ *
+ * Returns 0 on success, or ENOMEM if a record fails to unpack or the array
+ * cannot be grown.  On an ENOMEM return, records unpacked so far remain in
+ * *records and are counted in *numrecords.
+ */
+int
+zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
+    nvlist_t ***records, uint_t *numrecords)
+{
+	uint64_t reclen;
+	nvlist_t *nv;
+	size_t i;
+	void *tmp;
+
+	while (bytes_read > sizeof (reclen)) {
+
+		/* get length of packed record (stored as little endian) */
+		for (i = 0, reclen = 0; i < sizeof (reclen); i++)
+			reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
+
+		/*
+		 * Stop at an incomplete record.  The loop condition
+		 * guarantees bytes_read > sizeof (reclen), so the
+		 * subtraction cannot underflow.  Comparing this way round
+		 * also cannot wrap for a huge stored length, which
+		 * "sizeof (reclen) + reclen" could.
+		 */
+		if (reclen > bytes_read - sizeof (reclen))
+			break;
+
+		/* unpack record */
+		if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
+			return (ENOMEM);
+		bytes_read -= sizeof (reclen) + reclen;
+		buf += sizeof (reclen) + reclen;
+
+		/*
+		 * add record to nvlist array
+		 *
+		 * The array is grown whenever the new count plus one
+		 * satisfies ISP2() (presumably the usual power-of-two
+		 * test), to twice the new count.
+		 */
+		(*numrecords)++;
+		if (ISP2(*numrecords + 1)) {
+			tmp = realloc(*records,
+			    *numrecords * 2 * sizeof (nvlist_t *));
+			if (tmp == NULL) {
+				/* the old array is still valid; undo the count */
+				nvlist_free(nv);
+				(*numrecords)--;
+				return (ENOMEM);
+			}
+			*records = tmp;
+		}
+		(*records)[*numrecords - 1] = nv;
+	}
+
+	*leftover = bytes_read;
+	return (0);
+}