aboutsummaryrefslogtreecommitdiffstats
path: root/lib/libzutil
diff options
context:
space:
mode:
authorDon Brady <[email protected]>2018-11-05 12:22:33 -0700
committerBrian Behlendorf <[email protected]>2018-11-05 11:22:33 -0800
commite89f1295d4faa88bb05a62c8dd5f781657db5955 (patch)
tree8e39dfe33c6849e00813e54ec95c09a24448a43a /lib/libzutil
parent6644e5bb6e1a6c25c5006c819abf93c7bb662e80 (diff)
Add libzutil for libzfs or libzpool consumers
Adds a libzutil for utility functions that are common to libzfs and libzpool consumers (most of what was in libzfs_import.c). This removes the need for utilities to link against both libzpool and libzfs. Reviewed-by: Matthew Ahrens <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Don Brady <[email protected]> Closes #8050
Diffstat (limited to 'lib/libzutil')
-rw-r--r--lib/libzutil/Makefile.am27
-rw-r--r--lib/libzutil/zutil_device_path.c625
-rw-r--r--lib/libzutil/zutil_import.c2389
-rw-r--r--lib/libzutil/zutil_nicenum.c157
-rw-r--r--lib/libzutil/zutil_pool.c145
5 files changed, 3343 insertions, 0 deletions
diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am
new file mode 100644
index 000000000..720b843ab
--- /dev/null
+++ b/lib/libzutil/Makefile.am
@@ -0,0 +1,27 @@
+# Build rules for libzutil.  The library is listed under noinst_LTLIBRARIES,
+# i.e. it is built as a libtool library but not installed on its own.
+include $(top_srcdir)/config/Rules.am
+
+# Suppress unused but set variable warnings often due to ASSERTs
+AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE)
+
+# Header search paths: the top-level include directory and libspl's headers.
+DEFAULT_INCLUDES += \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/lib/libspl/include
+
+noinst_LTLIBRARIES = libzutil.la
+
+# Source files compiled into libzutil.la.
+USER_C = \
+ zutil_device_path.c \
+ zutil_import.c \
+ zutil_nicenum.c \
+ zutil_pool.c
+
+# The sources are declared nodist_ here and listed in EXTRA_DIST below.
+nodist_libzutil_la_SOURCES = $(USER_C)
+
+# In-tree libraries that libzutil is linked against.
+libzutil_la_LIBADD = \
+ $(top_builddir)/lib/libavl/libavl.la \
+ $(top_builddir)/lib/libefi/libefi.la \
+ $(top_builddir)/lib/libtpool/libtpool.la
+
+# libm plus the blkid/udev link flags (presumably filled in by configure).
+libzutil_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV)
+
+EXTRA_DIST = $(USER_C)
diff --git a/lib/libzutil/zutil_device_path.c b/lib/libzutil/zutil_device_path.c
new file mode 100644
index 000000000..1dc0d4d1d
--- /dev/null
+++ b/lib/libzutil/zutil_device_path.c
@@ -0,0 +1,625 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/efi_partition.h>
+
+#include <libzutil.h>
+#ifdef HAVE_LIBUDEV
+#include <libudev.h>
+#endif
+
+/*
+ * Append a partition suffix to an otherwise fully qualified device path.
+ * This is used to generate the full path as it is stored in
+ * ZPOOL_CONFIG_PATH for whole disk devices:
+ *
+ *	- paths starting with UDISK_ROOT or ZVOL_ROOT get "-part1"
+ *	- other paths whose last character is a digit get "p1"
+ *	- all remaining paths get "1"
+ *
+ * 'path' must be a NUL-terminated string held in a buffer of 'max_len'
+ * bytes.  On success the new length of 'path' is returned.  If 'path' is
+ * empty or the buffer is too small, -1 is returned and 'path' is left
+ * unmodified.
+ */
+int
+zfs_append_partition(char *path, size_t max_len)
+{
+	size_t len = strlen(path);
+	size_t reserve;
+	const char *suffix;
+
+	/* An empty path has no last character to inspect. */
+	if (len == 0)
+		return (-1);
+
+	if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
+	    (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
+		suffix = "-part1";
+		reserve = 6;
+	} else {
+		/* ctype functions require an unsigned char value */
+		suffix = isdigit((unsigned char)path[len - 1]) ? "p1" : "1";
+		/* Room for the longer "p1" form is always required */
+		reserve = 2;
+	}
+
+	if (len + reserve >= max_len)
+		return (-1);
+
+	(void) strcat(path, suffix);
+
+	return ((int)(len + strlen(suffix)));
+}
+
+/*
+ * Given a shorthand device name check if a file by that name exists in any
+ * of the ZPOOL_IMPORT_PATH directories or, when that variable is not set,
+ * in any of the default search paths.  The first candidate that exists is
+ * left in the caller supplied 'path' buffer ('len' bytes) and 0 is returned.
+ *
+ * Candidates which do not fit in 'path' are skipped rather than probed in
+ * truncated form.  Returns ENOENT when no candidate exists and ENOMEM when
+ * the environment string cannot be duplicated.
+ */
+int
+zfs_resolve_shortname(const char *name, char *path, size_t len)
+{
+	int error = -1;
+	int n;
+	char *env;
+
+	env = getenv("ZPOOL_IMPORT_PATH");
+	errno = ENOENT;
+
+	if (env != NULL) {
+		char *envdup, *dir, *saveptr = NULL;
+
+		if ((envdup = strdup(env)) == NULL)
+			return (ENOMEM);
+
+		/* strtok_r() keeps its state local, unlike strtok() */
+		for (dir = strtok_r(envdup, ":", &saveptr);
+		    dir != NULL && error != 0;
+		    dir = strtok_r(NULL, ":", &saveptr)) {
+			n = snprintf(path, len, "%s/%s", dir, name);
+			if (n < 0 || (size_t)n >= len)
+				continue;
+			error = access(path, F_OK);
+		}
+		free(envdup);
+	} else {
+		const char * const *zpool_default_import_path;
+		size_t count, i;
+
+		zpool_default_import_path = zpool_default_search_paths(&count);
+
+		for (i = 0; i < count && error != 0; i++) {
+			n = snprintf(path, len, "%s/%s",
+			    zpool_default_import_path[i], name);
+			if (n < 0 || (size_t)n >= len)
+				continue;
+			error = access(path, F_OK);
+		}
+	}
+
+	return (error ? ENOENT : 0);
+}
+
+/*
+ * Given a shorthand device name look for a match against 'cmp_name'. This
+ * is done by checking all prefix expansions using either the default
+ * search paths or the ZPOOL_IMPORT_PATH environment variable. Proper
+ * partition suffixes will be appended if this is a whole disk.
+ *
+ * Returns 0 when a match is found, ENOENT when none of the expansions
+ * match, and ENOMEM when the environment string cannot be duplicated.
+ */
+static int
+zfs_strcmp_shortname(const char *name, const char *cmp_name, int wholedisk)
+{
+	int path_len, cmp_len, error = ENOENT;
+	char *dir, *env, *envdup = NULL, *saveptr = NULL;
+	char path_name[MAXPATHLEN];
+	const char * const *zpool_default_import_path;
+	size_t count, i = 0;
+
+	zpool_default_import_path = zpool_default_search_paths(&count);
+
+	cmp_len = strlen(cmp_name);
+	env = getenv("ZPOOL_IMPORT_PATH");
+
+	if (env != NULL) {
+		if ((envdup = strdup(env)) == NULL)
+			return (ENOMEM);
+		dir = strtok_r(envdup, ":", &saveptr);
+	} else {
+		/* Do not read element 0 of an empty default list */
+		dir = (count > 0) ? (char *)zpool_default_import_path[0] : NULL;
+	}
+
+	while (dir != NULL) {
+		/* Trim trailing directory slashes from ZPOOL_IMPORT_PATH */
+		if (env != NULL) {
+			size_t dir_len = strlen(dir);
+
+			/* Stop at the start so "/" cannot underflow */
+			while (dir_len > 0 && dir[dir_len - 1] == '/')
+				dir[--dir_len] = '\0';
+		}
+
+		path_len = snprintf(path_name, MAXPATHLEN, "%s/%s", dir, name);
+		if (wholedisk)
+			path_len = zfs_append_partition(path_name, MAXPATHLEN);
+
+		if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) {
+			error = 0;
+			break;
+		}
+
+		if (env != NULL) {
+			dir = strtok_r(NULL, ":", &saveptr);
+		} else if (++i < count) {
+			dir = (char *)zpool_default_import_path[i];
+		} else {
+			dir = NULL;
+		}
+	}
+
+	free(envdup);
+
+	return (error);
+}
+
+/*
+ * Given either a shorthand or fully qualified path name look for a match
+ * against 'cmp'. The passed name will be expanded as needed for comparison
+ * purposes and redundant slashes are stripped from 'cmp' to ensure an
+ * accurate match.
+ *
+ * Returns 0 on a match, ENOENT when the names differ, and ENOMEM when a
+ * working copy cannot be allocated or the partition suffix does not fit.
+ */
+int
+zfs_strcmp_pathname(const char *name, const char *cmp, int wholedisk)
+{
+	int path_len, cmp_len;
+	char path_name[MAXPATHLEN];
+	char cmp_name[MAXPATHLEN];
+	char *dir, *dup, *saveptr = NULL;
+
+	if ((dup = strdup(cmp)) == NULL)
+		return (ENOMEM);
+
+	/* Strip redundant slashes if one exists due to ZPOOL_IMPORT_PATH */
+	memset(cmp_name, 0, MAXPATHLEN);
+	for (dir = strtok_r(dup, "/", &saveptr); dir != NULL;
+	    dir = strtok_r(NULL, "/", &saveptr)) {
+		(void) strlcat(cmp_name, "/", sizeof (cmp_name));
+		(void) strlcat(cmp_name, dir, sizeof (cmp_name));
+	}
+	free(dup);
+
+	/* Shorthand names are expanded against every search directory */
+	if (name[0] != '/')
+		return (zfs_strcmp_shortname(name, cmp_name, wholedisk));
+
+	(void) strlcpy(path_name, name, MAXPATHLEN);
+	path_len = strlen(path_name);
+	cmp_len = strlen(cmp_name);
+
+	if (wholedisk) {
+		path_len = zfs_append_partition(path_name, MAXPATHLEN);
+		if (path_len == -1)
+			return (ENOMEM);
+	}
+
+	if ((path_len != cmp_len) || strcmp(path_name, cmp_name))
+		return (ENOENT);
+
+	return (0);
+}
+
+/*
+ * Allocate and return the underlying device name for a device mapper device.
+ * If a device mapper device maps to multiple devices, return the first device.
+ *
+ * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
+ * DM device (like /dev/disk/by-vdev/A0) are also allowed.
+ *
+ * Returns device name, or NULL on error or no match. If dm_name is not a DM
+ * device (no readable /sys/block/<name>/slaves/ directory) then return NULL.
+ *
+ * NOTE: The returned name string must be *freed*.
+ */
+static char *
+dm_get_underlying_path(const char *dm_name)
+{
+	DIR *dp = NULL;
+	struct dirent *ep;
+	char *realp;
+	char *slaves_dir = NULL;
+	char *path = NULL;
+	const char *dev_str;
+
+	if (dm_name == NULL)
+		return (NULL);
+
+	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
+	realp = realpath(dm_name, NULL);
+	if (realp == NULL)
+		return (NULL);
+
+	/*
+	 * If they preface 'dev' with a path (like "/dev") then strip it off.
+	 * We just want the 'dm-N' part.  Without any '/' the whole string
+	 * is already the device name.
+	 */
+	dev_str = strrchr(realp, '/');
+	dev_str = (dev_str != NULL) ? dev_str + 1 : realp;
+
+	/*
+	 * A dedicated pointer is used for the sysfs directory so that the
+	 * cleanup below never frees a pointer into 'realp'.  The contents
+	 * of the output pointer are undefined when asprintf() fails.
+	 */
+	if (asprintf(&slaves_dir, "/sys/block/%s/slaves/", dev_str) == -1) {
+		slaves_dir = NULL;
+		goto end;
+	}
+
+	dp = opendir(slaves_dir);
+	if (dp == NULL)
+		goto end;
+
+	/* Return first non-directory entry in /sys/block/dm-N/slaves/ */
+	while ((ep = readdir(dp)) != NULL) {
+		if (ep->d_type != DT_DIR) {	/* skip "." and ".." dirs */
+			if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
+				path = NULL;
+			break;
+		}
+	}
+
+end:
+	if (dp != NULL)
+		(void) closedir(dp);
+	free(slaves_dir);
+	free(realp);
+	return (path);
+}
+
+/*
+ * Report whether 'dev_name' is a device mapper or multipath device, i.e.
+ * whether dm_get_underlying_path() can resolve an underlying device for it.
+ * Returns 1 if so and 0 otherwise.
+ */
+int
+zfs_dev_is_dm(const char *dev_name)
+{
+	char *underlying = dm_get_underlying_path(dev_name);
+	int is_dm = (underlying != NULL);
+
+	/* Only the existence of the name matters, not its value */
+	free(underlying);
+
+	return (is_dm);
+}
+
+/*
+ * By "whole disk" we mean an entire physical disk (something we can
+ * label, toggle the write cache on, etc.) as opposed to the full
+ * capacity of a pseudo-device such as lofi or did.  The check acts as if
+ * the disk were about to be labeled: the device is opened read-only with
+ * O_DIRECT and an EFI label is allocated and initialized for it.
+ *
+ * Returns 1 when both steps succeed and 0 otherwise.
+ */
+int
+zfs_dev_is_whole_disk(const char *dev_name)
+{
+	struct dk_gpt *label;
+	int whole_disk = 0;
+	int fd;
+
+	fd = open(dev_name, O_RDONLY | O_DIRECT);
+	if (fd < 0)
+		return (0);
+
+	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) == 0) {
+		/* The label was only needed as a probe */
+		efi_free(label);
+		whole_disk = 1;
+	}
+
+	(void) close(fd);
+
+	return (whole_disk);
+}
+
+/*
+ * Lookup the underlying device for a device name
+ *
+ * The name may be a symlink to a device, a partition device, or a device
+ * mapper/multipath device.  Device mapper devices resolve to their first
+ * underlying device; every other name is resolved with realpath().  In
+ * both cases any partition suffix is then stripped, so a name which already
+ * is the underlying device is returned unchanged.
+ *
+ * For example:
+ *
+ * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 (symlink to sda)
+ *	returns: /dev/sda
+ *
+ * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
+ *	returns: /dev/sda (first device)
+ *
+ * 3. /dev/sda (already the underlying device)
+ *	returns: /dev/sda
+ *
+ * 4. /dev/dm-3 (mapped to /dev/sda)
+ *	returns: /dev/sda
+ *
+ * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 (symlink to sdb9)
+ *	returns: /dev/sdb
+ *
+ * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a (symlink to sda2)
+ *	returns: /dev/sda
+ *
+ * Returns underlying device name, or NULL on error or no match.
+ *
+ * NOTE: The returned name string must be *freed*.
+ */
+char *
+zfs_get_underlying_path(const char *dev_name)
+{
+	char *resolved;
+	char *stripped;
+
+	if (dev_name == NULL)
+		return (NULL);
+
+	resolved = dm_get_underlying_path(dev_name);
+
+	/* dev_name not a DM device, so just un-symlinkize it */
+	if (resolved == NULL)
+		resolved = realpath(dev_name, NULL);
+
+	if (resolved == NULL)
+		return (NULL);
+
+	stripped = zfs_strip_partition_path(resolved);
+	free(resolved);
+
+	return (stripped);
+}
+
+/*
+ * Given a dev name like "sda", return the full enclosure sysfs path to
+ * the disk.  You can also pass in the name with "/dev" prepended
+ * to it (like /dev/sda).
+ *
+ * For example, disk "sda" in enclosure slot 1:
+ *	dev:		"sda"
+ *	returns:	"/sys/class/enclosure/1:0:3:0/Slot 1"
+ *
+ * 'dev' must be a non-devicemapper device.
+ *
+ * The first entry of /sys/block/<dev>/device whose name contains
+ * "enclosure_device" decides the result.  NULL is returned when there is
+ * no such entry, when its link cannot be read, or on allocation failure.
+ *
+ * Returned string must be freed.
+ */
+char *
+zfs_get_enclosure_sysfs_path(const char *dev_name)
+{
+	DIR *dp = NULL;
+	struct dirent *ep;
+	char buf[MAXPATHLEN];
+	const char *slash;
+	char *devdir = NULL;
+	char *path = NULL;
+
+	if (dev_name == NULL)
+		return (NULL);
+
+	/* If they preface 'dev' with a path (like "/dev") then strip it off */
+	slash = strrchr(dev_name, '/');
+	if (slash != NULL)
+		dev_name = slash + 1;	/* +1 since we want the chr after '/' */
+
+	if (asprintf(&devdir, "/sys/block/%s/device", dev_name) == -1) {
+		/* If asprintf() fails, 'devdir' is undefined */
+		devdir = NULL;
+		goto end;
+	}
+
+	/* 'devdir' stays owned here so that it is freed at 'end' */
+	dp = opendir(devdir);
+	if (dp == NULL)
+		goto end;
+
+	/*
+	 * Look though all sysfs entries in /sys/block/<dev>/device for
+	 * the enclosure symlink.
+	 */
+	while ((ep = readdir(dp)) != NULL) {
+		char *link = NULL;
+		char *encl;
+		ssize_t size;
+
+		/* Ignore everything that's not our enclosure_device link */
+		if (strstr(ep->d_name, "enclosure_device") == NULL)
+			continue;
+
+		if (asprintf(&link, "%s/%s", devdir, ep->d_name) == -1)
+			break;
+
+		size = readlink(link, buf, sizeof (buf));
+		free(link);
+
+		/* Did readlink fail or crop the link name? */
+		if (size < 0 || (size_t)size >= sizeof (buf))
+			break;
+
+		/*
+		 * We got a valid link.  readlink() doesn't terminate strings
+		 * so we have to do it.
+		 */
+		buf[size] = '\0';
+
+		/*
+		 * Our link will look like:
+		 *
+		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
+		 *
+		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
+		 */
+		encl = strstr(buf, "enclosure");
+		if (encl == NULL)
+			break;
+
+		/* If asprintf() fails, 'path' is undefined */
+		if (asprintf(&path, "/sys/class/%s", encl) == -1)
+			path = NULL;
+
+		/* Stop at the first enclosure link so nothing is overwritten */
+		break;
+	}
+
+end:
+	free(devdir);
+
+	if (dp != NULL)
+		(void) closedir(dp);
+
+	return (path);
+}
+
+/*
+ * Remove partition suffix from a vdev path.  Partition suffixes may take three
+ * forms: "-partX", "pX", or "X", where X is a string of digits.  The second
+ * case only occurs when the suffix is preceded by a digit, i.e. "md0p0".  The
+ * third case only occurs when preceded by a string matching the regular
+ * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
+ *
+ * The input is never modified; a copy is returned, with the suffix cut off
+ * only when everything after the candidate suffix start is digits.  Returns
+ * NULL if the copy cannot be allocated.
+ *
+ * caller must free the returned string
+ */
+char *
+zfs_strip_partition(char *path)
+{
+	char *tmp = strdup(path);
+	char *part = NULL, *d = NULL;
+
+	if (tmp == NULL)
+		return (NULL);
+
+	/*
+	 * 'part' marks where the string would be cut and 'd' the first
+	 * character that must be a digit.  The ctype arguments are cast to
+	 * unsigned char, as required for bytes outside the ASCII range.
+	 */
+	if ((part = strstr(tmp, "-part")) != NULL && part != tmp) {
+		d = part + 5;
+	} else if ((part = strrchr(tmp, 'p')) != NULL &&
+	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
+		d = part + 1;
+	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
+	    tmp[1] == 'd') {
+		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
+	} else if (strncmp("xvd", tmp, 3) == 0) {
+		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
+	}
+
+	/* Only strip when the remainder is a non-empty run of digits */
+	if (part != NULL && d != NULL && *d != '\0') {
+		for (; isdigit((unsigned char)*d); d++) { }
+		if (*d == '\0')
+			*part = '\0';
+	}
+
+	return (tmp);
+}
+
+/*
+ * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname
+ *
+ * path:	/dev/sda1
+ * returns:	/dev/sda
+ *
+ * Only the component after the last '/' is examined; a path without any
+ * '/' is treated as a bare device name.  Returns NULL on allocation
+ * failure.
+ *
+ * Returned string must be freed.
+ */
+char *
+zfs_strip_partition_path(char *path)
+{
+	char *newpath = strdup(path);
+	char *sd_offset;
+	char *new_sd;
+
+	if (newpath == NULL)
+		return (NULL);
+
+	/* Point to "sda1" part of "/dev/sda1" (or the start if no '/') */
+	sd_offset = strrchr(newpath, '/');
+	sd_offset = (sd_offset != NULL) ? sd_offset + 1 : newpath;
+
+	/* Get our new name "sda" */
+	new_sd = zfs_strip_partition(sd_offset);
+	if (new_sd == NULL) {
+		free(newpath);
+		return (NULL);
+	}
+
+	/*
+	 * Paste the "sda" where "sda1" was.  The stripped name is never
+	 * longer than the original component, so it fits in place.
+	 */
+	(void) strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);
+
+	/* Free temporary "sda" */
+	free(new_sd);
+
+	return (newpath);
+}
+
+#ifdef HAVE_LIBUDEV
+/*
+ * A disk is considered a multipath whole disk when:
+ *	DEVNAME key value starts with "/dev/dm-"
+ *	ID_PART_TABLE_TYPE key does not exist or is not gpt
+ *	DM_UUID key exists
+ *
+ * The DM_NAME key is not examined.
+ */
+static boolean_t
+udev_mpath_whole_disk(struct udev_device *dev)
+{
+	const char *devname, *type, *uuid;
+
+	devname = udev_device_get_property_value(dev, "DEVNAME");
+	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
+	uuid = udev_device_get_property_value(dev, "DM_UUID");
+
+	/* Must be a device mapper node */
+	if (devname == NULL || strncmp(devname, "/dev/dm-", 8) != 0)
+		return (B_FALSE);
+
+	/* A gpt partition table disqualifies it */
+	if (type != NULL && strcmp(type, "gpt") == 0)
+		return (B_FALSE);
+
+	if (uuid == NULL)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Check if a disk is effectively a multipath whole disk
+ *
+ * 'path' is resolved with realpath() and must name a "dm-*" node; the
+ * matching udev block device is then checked with udev_mpath_whole_disk().
+ * Returns B_FALSE when the path cannot be resolved, is not a dm node, or
+ * udev cannot provide the device.
+ */
+boolean_t
+is_mpath_whole_disk(const char *path)
+{
+	struct udev *udev;
+	struct udev_device *dev;
+	char nodepath[MAXPATHLEN];
+	char *sysname;
+	boolean_t wholedisk = B_FALSE;
+
+	if (realpath(path, nodepath) == NULL)
+		return (B_FALSE);
+
+	/* realpath() yields an absolute path, so a '/' is always present */
+	sysname = strrchr(nodepath, '/') + 1;
+	if (strncmp(sysname, "dm-", 3) != 0)
+		return (B_FALSE);
+
+	if ((udev = udev_new()) == NULL)
+		return (B_FALSE);
+
+	dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname);
+	if (dev != NULL) {
+		wholedisk = udev_mpath_whole_disk(dev);
+		(void) udev_device_unref(dev);
+	}
+
+	/* Release the context obtained from udev_new() on every path */
+	(void) udev_unref(udev);
+
+	return (wholedisk);
+}
+#endif
diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c
new file mode 100644
index 000000000..f6e56fabf
--- /dev/null
+++ b/lib/libzutil/zutil_import.c
@@ -0,0 +1,2389 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright 2015 RackTop Systems.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * Pool import support functions.
+ *
+ * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
+ * these commands are expected to run in the global zone, we can assume
+ * that the devices are all readable when called.
+ *
+ * To import a pool, we rely on reading the configuration information from the
+ * ZFS label of each device. If we successfully read the label, then we
+ * organize the configuration information in the following hierarchy:
+ *
+ * pool guid -> toplevel vdev guid -> label txg
+ *
+ * Duplicate entries matching this same tuple will be discarded. Once we have
+ * examined every device, we pick the best label txg config for each toplevel
+ * vdev. We then arrange these toplevel vdevs into a complete pool config, and
+ * update any paths that have changed. Finally, we attempt to import the pool
+ * using our derived config, and record the results.
+ */
+
+#include <ctype.h>
+#include <devid.h>
+#include <dirent.h>
+#include <errno.h>
+#include <libintl.h>
+#include <libgen.h>
+#ifdef HAVE_LIBUDEV
+#include <libudev.h>
+#include <sched.h>
+#endif
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/dktp/fdisk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+
+#include <blkid/blkid.h>
+#include <thread_pool.h>
+#include <libzutil.h>
+#include <libnvpair.h>
+
+#define IMPORT_ORDER_PREFERRED_1 1
+#define IMPORT_ORDER_PREFERRED_2 2
+#define IMPORT_ORDER_SCAN_OFFSET 10
+#define IMPORT_ORDER_DEFAULT 100
+#define DEFAULT_IMPORT_PATH_SIZE 9
+
+#define EZFS_BADCACHE "invalid or missing cache file"
+#define EZFS_BADPATH "must be an absolute path"
+#define EZFS_NOMEM "out of memory"
+#define EZFS_EACESS "some devices require root privileges"
+
+/*
+ * Handle carrying the error reporting state used by the helpers in this
+ * file, plus a few fields whose use is not visible in this part of the file.
+ */
+typedef struct libpc_handle {
+ /* zfs_verror() writes to stderr only when this is set */
+ boolean_t lpc_printerr;
+ /* NOTE(review): presumably records a failed device open; confirm at use */
+ boolean_t lpc_open_access_error;
+ /* set by zfs_error_aux(); consumed and cleared by zfs_verror() */
+ boolean_t lpc_desc_active;
+ /* detail text formatted by zfs_error_aux(); overrides the generic error */
+ char lpc_desc[1024];
+ /* NOTE(review): callback table, not referenced in this part of the file */
+ const pool_config_ops_t *lpc_ops;
+ /* NOTE(review): opaque pointer, presumably passed to lpc_ops; confirm */
+ void *lpc_lib_handle;
+} libpc_handle_t;
+
+/*
+ * Format a detailed error description into the handle and mark it active,
+ * so that the next zfs_verror() call reports it instead of the generic
+ * error string.
+ */
+/*PRINTFLIKE2*/
+static void
+zfs_error_aux(libpc_handle_t *hdl, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, args);
+	va_end(args);
+
+	hdl->lpc_desc_active = B_TRUE;
+}
+
+/*
+ * Report an error as "<action>: <reason>" on stderr, where <action> is built
+ * from 'fmt'/'ap'.  <reason> is the description left by zfs_error_aux() if
+ * one is pending, otherwise 'error'.  The pending description is consumed
+ * either way; nothing is printed unless lpc_printerr is set.
+ */
+static void
+zfs_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap)
+{
+	char action[1024];
+	const char *reason = error;
+
+	(void) vsnprintf(action, sizeof (action), fmt, ap);
+
+	/* Drop any stale description, then mark it consumed */
+	if (!hdl->lpc_desc_active)
+		hdl->lpc_desc[0] = '\0';
+	hdl->lpc_desc_active = B_FALSE;
+
+	if (!hdl->lpc_printerr)
+		return;
+
+	if (hdl->lpc_desc[0] != '\0')
+		reason = hdl->lpc_desc;
+
+	(void) fprintf(stderr, "%s: %s\n", action, reason);
+}
+
+/*
+ * printf-style front end for zfs_verror().  Always returns -1 so callers
+ * can report and fail in a single statement.
+ */
+/*PRINTFLIKE3*/
+static int
+zfs_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	zfs_verror(hdl, error, fmt, args);
+	va_end(args);
+
+	return (-1);
+}
+
+/*
+ * Report 'error' for the fixed action string 'msg'.  The message is passed
+ * through a "%s" format so it is never interpreted as a format string.
+ * Returns whatever zfs_error_fmt() returns.
+ */
+static int
+zfs_error(libpc_handle_t *hdl, const char *error, const char *msg)
+{
+	int rc;
+
+	rc = zfs_error_fmt(hdl, error, "%s", msg);
+
+	return (rc);
+}
+
+/*
+ * Report an out of memory condition and terminate the process with exit
+ * status 1.  This function does not return.
+ */
+static int
+no_memory(libpc_handle_t *hdl)
+{
+	(void) zfs_error(hdl, EZFS_NOMEM, "internal error");
+
+	exit(1);
+}
+
+/*
+ * Allocate 'size' bytes of zeroed memory.  Allocation failure is handed to
+ * no_memory(), which exits the process.
+ */
+static void *
+zfs_alloc(libpc_handle_t *hdl, size_t size)
+{
+	void *buf = calloc(1, size);
+
+	if (buf == NULL)
+		(void) no_memory(hdl);
+
+	return (buf);
+}
+
+/*
+ * Duplicate 'str'.  Allocation failure is handed to no_memory(), which
+ * exits the process.
+ */
+static char *
+zfs_strdup(libpc_handle_t *hdl, const char *str)
+{
+	char *copy = strdup(str);
+
+	if (copy == NULL)
+		(void) no_memory(hdl);
+
+	return (copy);
+}
+
+/*
+ * Intermediate structures used to gather configuration information.
+ *
+ * They are singly linked lists mirroring the hierarchy described at the top
+ * of this file: pool guid -> toplevel vdev guid -> label txg.
+ */
+
+/* One label config for a toplevel vdev, keyed by its txg. */
+typedef struct config_entry {
+ uint64_t ce_txg;
+ nvlist_t *ce_config;
+ struct config_entry *ce_next;
+} config_entry_t;
+
+/* One toplevel vdev, keyed by guid, with the list of configs seen for it. */
+typedef struct vdev_entry {
+ uint64_t ve_guid;
+ config_entry_t *ve_configs;
+ struct vdev_entry *ve_next;
+} vdev_entry_t;
+
+/* One pool, keyed by guid, with the list of its toplevel vdevs. */
+typedef struct pool_entry {
+ uint64_t pe_guid;
+ vdev_entry_t *pe_vdevs;
+ struct pool_entry *pe_next;
+} pool_entry_t;
+
+/*
+ * A name associated with a guid.
+ * NOTE(review): ne_order and ne_num_labels are not used in this part of the
+ * file; presumably a preference rank and a label count -- confirm at use.
+ */
+typedef struct name_entry {
+ char *ne_name;
+ uint64_t ne_guid;
+ uint64_t ne_order;
+ uint64_t ne_num_labels;
+ struct name_entry *ne_next;
+} name_entry_t;
+
+/* Heads of the pool list and the name list. */
+typedef struct pool_list {
+ pool_entry_t *pools;
+ name_entry_t *names;
+} pool_list_t;
+
+#define ZVOL_ROOT "/dev/zvol"
+#define DEV_BYID_PATH "/dev/disk/by-id/"
+
+/*
+ * Linux persistent device strings for vdev labels
+ *
+ * based on libudev for consistency with libudev disk add/remove events
+ *
+ * Filled in by encode_device_strings(): vds_devid from
+ * zfs_device_get_devid() and vds_devphys from zfs_device_get_physical().
+ * An empty vds_devphys means no physical location string is available.
+ */
+
+typedef struct vdev_dev_strs {
+ char vds_devid[128];
+ char vds_devphys[128];
+} vdev_dev_strs_t;
+
+/*
+ * Obtain the persistent device id string (describes what)
+ *
+ * used by ZED vdev matching for auto-{online,expand,replace}
+ *
+ * The id is written to 'bufptr' ('buflen' bytes) and is, in order of
+ * preference:
+ *	- the bus specific /dev/disk/by-id/ link name, without the directory
+ *	- "dm-uuid-<DM_UUID>" for devices with no ID_BUS but a DM_UUID
+ *	- the /dev/zvol/... link for volumes
+ *	- the "nvme-" by-id link name for devices with an nvme parent
+ *
+ * Returns 0 on success and ENODATA when no identifier can be derived.
+ */
+int
+zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
+{
+	struct udev_list_entry *link;
+	const char *linkname;
+	const char *bus;
+	char prefix[MAXPATHLEN];
+	size_t prefix_len;
+
+	/* The bus based by-id path is preferred */
+	bus = udev_device_get_property_value(dev, "ID_BUS");
+
+	if (bus == NULL) {
+		const char *dm_uuid;
+
+		/*
+		 * For multipath nodes use the persistent uuid based identifier
+		 *
+		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
+		 */
+		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
+		if (dm_uuid != NULL) {
+			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
+			return (0);
+		}
+
+		/*
+		 * For volumes use the persistent /dev/zvol/dataset identifier
+		 */
+		for (link = udev_device_get_devlinks_list_entry(dev);
+		    link != NULL; link = udev_list_entry_get_next(link)) {
+			linkname = udev_list_entry_get_name(link);
+			if (strncmp(linkname, ZVOL_ROOT,
+			    strlen(ZVOL_ROOT)) == 0) {
+				(void) strlcpy(bufptr, linkname, buflen);
+				return (0);
+			}
+		}
+
+		/*
+		 * NVME 'by-id' symlinks are similar to bus case
+		 */
+		if (udev_device_get_parent_with_subsystem_devtype(dev,
+		    "nvme", NULL) == NULL)
+			return (ENODATA);
+
+		bus = "nvme";	/* continue with bus symlink search */
+	}
+
+	/*
+	 * locate the bus specific by-id link
+	 */
+	(void) snprintf(prefix, sizeof (prefix), "%s%s-", DEV_BYID_PATH, bus);
+	prefix_len = strlen(prefix);
+
+	for (link = udev_device_get_devlinks_list_entry(dev);
+	    link != NULL; link = udev_list_entry_get_next(link)) {
+		linkname = udev_list_entry_get_name(link);
+		if (strncmp(linkname, prefix, prefix_len) == 0) {
+			/* Store the link name without its directory */
+			(void) strlcpy(bufptr,
+			    linkname + strlen(DEV_BYID_PATH), buflen);
+			return (0);
+		}
+	}
+
+	return (ENODATA);
+}
+
+/*
+ * Obtain the persistent physical location string (describes where)
+ *
+ * used by ZED vdev matching for auto-{online,expand,replace}
+ *
+ * The first source that yields a value is copied to 'bufptr' ('buflen'
+ * bytes):
+ *	1. the ID_PATH property (normal disks)
+ *	2. the ID_VDEV property, which is setup via /etc/vdev_id.conf.
+ *	   Device mapper devices are virtual and don't have a physical
+ *	   path; without vdev_id.conf you cannot use multipath autoreplace
+ *	   with device mapper.
+ *	3. a /dev/zvol/... device link (ZFS volumes)
+ *	4. a /dev/disk/by-uuid device link (all other devices)
+ *
+ * Returns 0 on success and ENODATA when none of them is available.
+ */
+int
+zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
+{
+	static const char *const props[] = { "ID_PATH", "ID_VDEV" };
+	static const char *const link_prefixes[] = {
+		ZVOL_ROOT,
+		"/dev/disk/by-uuid"
+	};
+	struct udev_list_entry *link;
+	const char *value;
+	size_t i;
+
+	/* Property based locations; empty values count as missing */
+	for (i = 0; i < sizeof (props) / sizeof (props[0]); i++) {
+		value = udev_device_get_property_value(dev, props[i]);
+		if (value != NULL && value[0] != '\0') {
+			(void) strlcpy(bufptr, value, buflen);
+			return (0);
+		}
+	}
+
+	/* Device link based locations, one full pass per prefix */
+	for (i = 0; i < sizeof (link_prefixes) / sizeof (link_prefixes[0]);
+	    i++) {
+		size_t prefix_len = strlen(link_prefixes[i]);
+
+		for (link = udev_device_get_devlinks_list_entry(dev);
+		    link != NULL; link = udev_list_entry_get_next(link)) {
+			value = udev_list_entry_get_name(link);
+			if (strncmp(value, link_prefixes[i],
+			    prefix_len) == 0) {
+				(void) strlcpy(bufptr, value, buflen);
+				return (0);
+			}
+		}
+	}
+
+	return (ENODATA);
+}
+
+/*
+ * A disk is considered a multipath whole disk when:
+ *	DEVNAME key value starts with "/dev/dm-"
+ *	ID_PART_TABLE_TYPE key does not exist or is not gpt
+ *	DM_UUID key exists
+ *
+ * The DM_NAME key is not examined.
+ */
+static boolean_t
+udev_mpath_whole_disk(struct udev_device *dev)
+{
+	const char *devname, *type, *uuid;
+	int is_dm_node, has_gpt;
+
+	devname = udev_device_get_property_value(dev, "DEVNAME");
+	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
+	uuid = udev_device_get_property_value(dev, "DM_UUID");
+
+	is_dm_node = (devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0);
+	has_gpt = (type != NULL && strcmp(type, "gpt") == 0);
+
+	if (is_dm_node && !has_gpt && uuid != NULL)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Report whether udev has finished setting up 'dev'.  When the build has
+ * udev_device_get_is_initialized() its result is returned as-is; otherwise
+ * the presence of the DEVLINKS property is used and 1 or 0 is returned.
+ */
+static int
+udev_device_is_ready(struct udev_device *dev)
+{
+#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
+ return (udev_device_get_is_initialized(dev));
+#else
+ /* wait for DEVLINKS property to be initialized */
+ return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
+#endif
+}
+
+/*
+ * Wait up to timeout_ms for udev to set up the device node. The device is
+ * considered ready when libudev determines it has been initialized, all of
+ * the device links have been verified to exist, and it has been allowed to
+ * settle. At this point the device can be accessed reliably.
+ * Depending on the complexity of the udev rules this process could take
+ * several seconds.
+ *
+ * Without libudev only 'path' itself is polled with stat64().
+ *
+ * Returns 0 once the device is ready. With libudev, ENXIO is returned if no
+ * udev context can be created; otherwise the last status (ENODEV if the
+ * device never became ready) is returned when the timeout expires. Without
+ * libudev, a stat64() failure other than ENOENT returns that errno and a
+ * timeout returns ENODEV.
+ */
+int
+zpool_label_disk_wait(const char *path, int timeout_ms)
+{
+#ifdef HAVE_LIBUDEV
+ struct udev *udev;
+ struct udev_device *dev = NULL;
+ char nodepath[MAXPATHLEN];
+ char *sysname = NULL;
+ int ret = ENODEV;
+ /* how long everything must stay present before it counts as settled */
+ int settle_ms = 50;
+ /* polling interval; assumes MILLISEC is 1000 so usleep() gets usec */
+ long sleep_ms = 10;
+ hrtime_t start, settle;
+
+ if ((udev = udev_new()) == NULL)
+ return (ENXIO);
+
+ start = gethrtime();
+ settle = 0;
+
+ do {
+ /* Resolve the path once; retry until the node shows up */
+ if (sysname == NULL) {
+ if (realpath(path, nodepath) != NULL) {
+ sysname = strrchr(nodepath, '/') + 1;
+ } else {
+ (void) usleep(sleep_ms * MILLISEC);
+ /* jumps to the timeout check of the do/while */
+ continue;
+ }
+ }
+
+ dev = udev_device_new_from_subsystem_sysname(udev,
+ "block", sysname);
+ if ((dev != NULL) && udev_device_is_ready(dev)) {
+ struct udev_list_entry *links, *link = NULL;
+
+ ret = 0;
+ links = udev_device_get_devlinks_list_entry(dev);
+
+ /* Any missing link restarts the settle period */
+ udev_list_entry_foreach(link, links) {
+ struct stat64 statbuf;
+ const char *name;
+
+ name = udev_list_entry_get_name(link);
+ errno = 0;
+ if (stat64(name, &statbuf) == 0 && errno == 0)
+ continue;
+
+ settle = 0;
+ ret = ENODEV;
+ break;
+ }
+
+ if (ret == 0) {
+ /* first good pass starts the settle clock */
+ if (settle == 0) {
+ settle = gethrtime();
+ } else if (NSEC2MSEC(gethrtime() - settle) >=
+ settle_ms) {
+ /* unref here because break skips the one below */
+ udev_device_unref(dev);
+ break;
+ }
+ }
+ }
+
+ udev_device_unref(dev);
+ (void) usleep(sleep_ms * MILLISEC);
+
+ } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+ udev_unref(udev);
+
+ return (ret);
+#else
+ int settle_ms = 50;
+ long sleep_ms = 10;
+ hrtime_t start, settle;
+ struct stat64 statbuf;
+
+ start = gethrtime();
+ settle = 0;
+
+ do {
+ errno = 0;
+ if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
+ /* first success starts the settle clock */
+ if (settle == 0)
+ settle = gethrtime();
+ else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
+ return (0);
+ } else if (errno != ENOENT) {
+ /* ENOENT means "not there yet"; anything else is fatal */
+ return (errno);
+ }
+
+ usleep(sleep_ms * MILLISEC);
+ } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+ return (ENODEV);
+#endif /* HAVE_LIBUDEV */
+}
+
+/*
+ * Encode the persistent devices strings
+ * used for the vdev disk label
+ *
+ * 'path' is resolved to its device node and looked up through udev.  For
+ * whole disks (either 'wholedisk' is set or the device is a multipath whole
+ * disk) the device id is stored in ds->vds_devid and the optional physical
+ * location in ds->vds_devphys (an empty string when not available).
+ *
+ * Returns 0 on success, ENXIO if no udev context can be created, ENODEV if
+ * the device cannot be resolved, does not become ready, or is not a whole
+ * disk, and otherwise the error from zfs_device_get_devid().  Without
+ * libudev ENOENT is always returned.
+ */
+static int
+encode_device_strings(const char *path, vdev_dev_strs_t *ds,
+    boolean_t wholedisk)
+{
+#ifdef HAVE_LIBUDEV
+	struct udev *udev;
+	struct udev_device *dev = NULL;
+	char nodepath[MAXPATHLEN];
+	char *sysname;
+	int ret = ENODEV;
+	hrtime_t start;
+
+	if ((udev = udev_new()) == NULL)
+		return (ENXIO);
+
+	/* resolve path to a runtime device node instance */
+	if (realpath(path, nodepath) == NULL)
+		goto no_dev;
+
+	sysname = strrchr(nodepath, '/') + 1;
+
+	/*
+	 * Wait up to 3 seconds for udev to set up the device node context
+	 */
+	start = gethrtime();
+	do {
+		dev = udev_device_new_from_subsystem_sysname(udev, "block",
+		    sysname);
+		if (dev == NULL)
+			goto no_dev;
+		if (udev_device_is_ready(dev))
+			break;	/* udev ready */
+
+		udev_device_unref(dev);
+		dev = NULL;
+
+		if (NSEC2MSEC(gethrtime() - start) < 10)
+			(void) sched_yield();	/* yield/busy wait up to 10ms */
+		else
+			(void) usleep(10 * MILLISEC);
+
+	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
+
+	if (dev == NULL)
+		goto no_dev;
+
+	/*
+	 * Only whole disks require extra device strings.  'dev' is held at
+	 * this point, so leave through the label that releases it; 'ret'
+	 * is still ENODEV.
+	 */
+	if (!wholedisk && !udev_mpath_whole_disk(dev))
+		goto no_dev_ref;
+
+	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
+	if (ret != 0)
+		goto no_dev_ref;
+
+	/* physical location string (optional) */
+	if (zfs_device_get_physical(dev, ds->vds_devphys,
+	    sizeof (ds->vds_devphys)) != 0) {
+		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
+	}
+
+no_dev_ref:
+	udev_device_unref(dev);
+no_dev:
+	udev_unref(udev);
+
+	return (ret);
+#else
+	return (ENOENT);
+#endif
+}
+
+/*
+ * Refresh the persistent device strings of a leaf vdev config (Linux only)
+ *
+ * - only applies for a dedicated leaf vdev (aka whole disk)
+ * - updated during pool create|add|attach|import
+ * - used for device matching during auto-{online,expand,replace}
+ * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
+ * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
+ *
+ * single device node example:
+ *	devid:		'scsi-MG03SCA300_350000494a8cb3d67-part1'
+ *	phys_path:	'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
+ *
+ * multipath device node example:
+ *	devid:		'dm-uuid-mpath-35000c5006304de3f'
+ *
+ * The enclosure sysfs path, used for turning on enclosure LEDs, is stored
+ * as well when one exists:
+ *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
+ */
+void
+update_vdev_config_dev_strs(nvlist_t *nv)
+{
+	vdev_dev_strs_t strs;
+	char *optout, *vdev_type, *vdev_path;
+	uint64_t wholedisk = 0;
+	char *underlying, *enc_path;
+
+	/*
+	 * Legacy ZFS implementations may opt out of devid strings in the
+	 * vdev label, e.g.:
+	 *
+	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
+	 *
+	 * Older ZFS on Linux releases could not display pool config VDEV
+	 * names when a "devid" NVP was present in the pool's config; a pool
+	 * that originated on illumos carries such a value and
+	 * "zpool status" would fail while listing it.
+	 *
+	 * With ZFS_VDEV_DEVID_OPT_OUT set, "devid" values are stripped on
+	 * import and never added on zpool create|add.
+	 */
+	optout = getenv("ZFS_VDEV_DEVID_OPT_OUT");
+	if (optout != NULL && (strtoul(optout, NULL, 0) > 0 ||
+	    strncasecmp(optout, "YES", 3) == 0 ||
+	    strncasecmp(optout, "ON", 2) == 0)) {
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+		return;
+	}
+
+	/* Only disk vdevs that carry a path are of interest. */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type) != 0)
+		return;
+	if (strcmp(vdev_type, VDEV_TYPE_DISK) != 0)
+		return;
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vdev_path) != 0)
+		return;
+
+	/* A missing whole_disk entry leaves the default of 0 in place. */
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+
+	if (encode_device_strings(vdev_path, &strs,
+	    (boolean_t)wholedisk) != 0) {
+		/* No strings could be generated: drop any stale entries. */
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+		return;
+	}
+
+	/*
+	 * Store the freshly generated values in the config nvlist.  The
+	 * physical path is optional and is signalled by an empty string.
+	 */
+	(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, strs.vds_devid);
+	if (strs.vds_devphys[0] != '\0') {
+		(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+		    strs.vds_devphys);
+	}
+
+	/* Record the enclosure sysfs path if the disk sits in an enclosure. */
+	underlying = zfs_get_underlying_path(vdev_path);
+	enc_path = zfs_get_enclosure_sysfs_path(underlying);
+	if (enc_path != NULL) {
+		nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+		    enc_path);
+	} else {
+		nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+	}
+
+	free(underlying);
+	free(enc_path);
+}
+
+/*
+ * Walk the given vdev configuration and fix up the path and/or devid
+ * information of every leaf vdev that has an entry in 'names'.
+ *
+ * Returns 0 on success and -1 if a path could not be stored in the nvlist.
+ */
+static int
+fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)
+{
+	nvlist_t **kids;
+	uint_t nkids, i;
+	uint64_t guid;
+	name_entry_t *cand, *best = NULL;
+	char *path;
+
+	/* Interior vdev: descend into every child and stop at first error. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0) {
+		for (i = 0; i < nkids; i++) {
+			if (fix_paths(hdl, kids[i], names) != 0)
+				return (-1);
+		}
+		return (0);
+	}
+
+	/*
+	 * This is a leaf (file or disk) vdev.  Search the name list for
+	 * entries carrying its guid; if one is found the path is replaced
+	 * and the device strings are recalculated.
+	 *
+	 * Several names may map to one guid (overlapping partitions or
+	 * multiple paths to the same disk).  An entry whose name equals the
+	 * current ZPOOL_CONFIG_PATH wins outright.  Otherwise the entry
+	 * with more vdev labels is preferred, and among equals the one that
+	 * comes earliest in the ZPOOL_IMPORT_PATH search order.
+	 */
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		path = NULL;
+
+	for (cand = names; cand != NULL; cand = cand->ne_next) {
+		if (cand->ne_guid != guid)
+			continue;
+
+		/* No recorded path, or an exact path match: take it. */
+		if (path == NULL || strcmp(path, cand->ne_name) == 0) {
+			best = cand;
+			break;
+		}
+
+		/*
+		 * First candidate, more labels, or same number of labels
+		 * but earlier in the search order.
+		 */
+		if (best == NULL ||
+		    cand->ne_num_labels > best->ne_num_labels ||
+		    (cand->ne_num_labels == best->ne_num_labels &&
+		    cand->ne_order < best->ne_order))
+			best = cand;
+	}
+
+	if (best == NULL)
+		return (0);
+
+	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
+		return (-1);
+
+	/* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
+	update_vdev_config_dev_strs(nv);
+
+	return (0);
+}
+
+/*
+ * Record the label config read from 'path' in the list of known devices.
+ *
+ * Returns 0 on success (including labels that are deliberately ignored)
+ * and -1 when an allocation fails.
+ */
+static int
+add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
+    int order, int num_labels, nvlist_t *config)
+{
+	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
+	pool_entry_t *pool;
+	vdev_entry_t *top;
+	config_entry_t *cfg;
+	name_entry_t *entry;
+
+	/*
+	 * A hot spare that is not in use, or a level 2 cache device, only
+	 * contributes a guid -> path translation; nothing else is recorded
+	 * for it.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &state) == 0 &&
+	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0)
+		goto add_name;
+
+	/*
+	 * A valid config in which any of these fields is unreadable is a
+	 * half-initialized label.  vdev_label_init() writes a label with
+	 * txg == 0 so the device can be identified if the user refers to
+	 * the same disk later on; if pool creation then fails the label is
+	 * left in that state and must not be treated as part of a valid
+	 * pool.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0)
+		return (0);
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (0);
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+	    &top_guid) != 0)
+		return (0);
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 ||
+	    txg == 0)
+		return (0);
+
+	/*
+	 * Step one: locate the pool, adding it to the list of known pools
+	 * if this is the first time we see it.
+	 */
+	pool = pl->pools;
+	while (pool != NULL && pool->pe_guid != pool_guid)
+		pool = pool->pe_next;
+
+	if (pool == NULL) {
+		pool = zfs_alloc(hdl, sizeof (pool_entry_t));
+		if (pool == NULL)
+			return (-1);
+		pool->pe_guid = pool_guid;
+		pool->pe_next = pl->pools;
+		pl->pools = pool;
+	}
+
+	/*
+	 * Step two: locate the toplevel vdev within that pool, adding it
+	 * if it is missing.
+	 */
+	top = pool->pe_vdevs;
+	while (top != NULL && top->ve_guid != top_guid)
+		top = top->ve_next;
+
+	if (top == NULL) {
+		top = zfs_alloc(hdl, sizeof (vdev_entry_t));
+		if (top == NULL)
+			return (-1);
+		top->ve_guid = top_guid;
+		top->ve_next = pool->pe_vdevs;
+		pool->pe_vdevs = top;
+	}
+
+	/*
+	 * Step three: a config with the same transaction group is already
+	 * known and needs no action; anything else is added to the list of
+	 * known configs.
+	 */
+	cfg = top->ve_configs;
+	while (cfg != NULL && cfg->ce_txg != txg)
+		cfg = cfg->ce_next;
+
+	if (cfg == NULL) {
+		cfg = zfs_alloc(hdl, sizeof (config_entry_t));
+		if (cfg == NULL)
+			return (-1);
+		cfg->ce_txg = txg;
+		cfg->ce_config = fnvlist_dup(config);
+		cfg->ce_next = top->ve_configs;
+		top->ve_configs = cfg;
+	}
+
+	/*
+	 * The config is now on the list of known configs.  What remains is
+	 * the vdev guid -> path mapping, which lets the configuration be
+	 * fixed up as necessary before doing the import.
+	 */
+add_name:
+	entry = zfs_alloc(hdl, sizeof (name_entry_t));
+	if (entry == NULL)
+		return (-1);
+
+	entry->ne_name = zfs_strdup(hdl, path);
+	if (entry->ne_name == NULL) {
+		free(entry);
+		return (-1);
+	}
+
+	entry->ne_guid = vdev_guid;
+	entry->ne_order = order;
+	entry->ne_num_labels = num_labels;
+	entry->ne_next = pl->names;
+	pl->names = entry;
+
+	return (0);
+}
+
+/*
+ * Forward the "is this pool active?" query to the pco_pool_active
+ * callback registered in the handle.  The callback's return value is
+ * passed through unchanged; 'isactive' is handed to it as-is.
+ */
+static int
+pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,
+    boolean_t *isactive)
+{
+	ASSERT(hdl->lpc_ops->pco_pool_active != NULL);
+
+	return (hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,
+	    guid, isactive));
+}
+
+/*
+ * Hand 'tryconfig' to the pco_refresh_config callback registered in the
+ * handle and return whatever nvlist (or NULL) the callback produces.
+ */
+static nvlist_t *
+refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)
+{
+	nvlist_t *refreshed;
+
+	ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);
+
+	refreshed = hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,
+	    tryconfig);
+
+	return (refreshed);
+}
+
+/*
+ * Determine if the vdev id is a hole in the namespace.
+ *
+ * 'hole_array' holds 'holes' top-level vdev ids; B_TRUE is returned when
+ * 'id' is one of them.  An empty array (holes == 0) never matches.
+ */
+static boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+	uint_t c;	/* unsigned, to match the type of 'holes' */
+
+	for (c = 0; c < holes; c++) {
+
+		/* Top-level is a hole */
+		if (hole_array[c] == id)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Convert our list of pools into the definitive set of configurations. We
+ * start by picking the best config for each toplevel vdev. Once that's done,
+ * we assemble the toplevel vdevs into a full config for the pool. We make a
+ * pass to fix up any incorrect paths, and then add it to the main list to
+ * return to the user.
+ *
+ * When 'active_ok' is set the assembled config is added as-is. Otherwise
+ * pools reported active by pool_active(), and pools for which
+ * refresh_config() returns NULL, are skipped, and the refreshed config is
+ * what gets added. A non-NULL 'policy' is attached to the config under
+ * ZPOOL_LOAD_POLICY before it is refreshed.
+ *
+ * Returns an nvlist holding one config per pool, keyed by pool name, or
+ * NULL on failure. NOTE(review): the returned nvlist is allocated here, so
+ * the caller presumably owns and frees it -- confirm against the callers.
+ */
+static nvlist_t *
+get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
+ nvlist_t *policy)
+{
+ pool_entry_t *pe;
+ vdev_entry_t *ve;
+ config_entry_t *ce;
+ nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
+ nvlist_t **spares, **l2cache;
+ uint_t i, nspares, nl2cache;
+ boolean_t config_seen;
+ uint64_t best_txg;
+ char *name, *hostname = NULL;
+ uint64_t guid;
+ uint_t children = 0;
+ nvlist_t **child = NULL;
+ uint_t holes;
+ uint64_t *hole_array, max_id;
+ uint_t c;
+ boolean_t isactive;
+ uint64_t hostid;
+ nvlist_t *nvl;
+ boolean_t valid_top_config = B_FALSE;
+
+ if (nvlist_alloc(&ret, 0, 0) != 0)
+ goto nomem;
+
+ for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
+ uint64_t id, max_txg = 0;
+
+ if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
+ goto nomem;
+ config_seen = B_FALSE;
+
+ /*
+ * Iterate over all toplevel vdevs. Grab the pool configuration
+ * from the first one we find, and then go through the rest and
+ * add them as necessary to the 'vdevs' member of the config.
+ */
+ for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
+
+ /*
+ * Determine the best configuration for this vdev by
+ * selecting the config with the latest transaction
+ * group. 'tmp' is left pointing at that config.
+ */
+ best_txg = 0;
+ for (ce = ve->ve_configs; ce != NULL;
+ ce = ce->ce_next) {
+
+ if (ce->ce_txg > best_txg) {
+ tmp = ce->ce_config;
+ best_txg = ce->ce_txg;
+ }
+ }
+
+ /*
+ * We rely on the fact that the max txg for the
+ * pool will contain the most up-to-date information
+ * about the valid top-levels in the vdev namespace.
+ * Whenever a newer txg is seen, the vdev-children count
+ * and hole array copied so far are discarded and taken
+ * again from this newer config.
+ */
+ if (best_txg > max_txg) {
+ (void) nvlist_remove(config,
+ ZPOOL_CONFIG_VDEV_CHILDREN,
+ DATA_TYPE_UINT64);
+ (void) nvlist_remove(config,
+ ZPOOL_CONFIG_HOLE_ARRAY,
+ DATA_TYPE_UINT64_ARRAY);
+
+ max_txg = best_txg;
+ hole_array = NULL;
+ holes = 0;
+ max_id = 0;
+ valid_top_config = B_FALSE;
+
+ if (nvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
+ verify(nvlist_add_uint64(config,
+ ZPOOL_CONFIG_VDEV_CHILDREN,
+ max_id) == 0);
+ valid_top_config = B_TRUE;
+ }
+
+ if (nvlist_lookup_uint64_array(tmp,
+ ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
+ &holes) == 0) {
+ verify(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_HOLE_ARRAY,
+ hole_array, holes) == 0);
+ }
+ }
+
+ if (!config_seen) {
+ /*
+ * Copy the relevant pieces of data to the pool
+ * configuration:
+ *
+ * version
+ * pool guid
+ * name
+ * comment (if available)
+ * pool state
+ * hostid (if available)
+ * hostname (if available)
+ *
+ * 'hostid' and 'hostname' are also kept in locals
+ * so they can be put back after the config has
+ * been refreshed further below.
+ */
+ uint64_t state, version;
+ char *comment = NULL;
+
+ version = fnvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_VERSION);
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_VERSION, version);
+ guid = fnvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_POOL_GUID);
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_POOL_GUID, guid);
+ name = fnvlist_lookup_string(tmp,
+ ZPOOL_CONFIG_POOL_NAME);
+ fnvlist_add_string(config,
+ ZPOOL_CONFIG_POOL_NAME, name);
+
+ if (nvlist_lookup_string(tmp,
+ ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ fnvlist_add_string(config,
+ ZPOOL_CONFIG_COMMENT, comment);
+
+ state = fnvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_POOL_STATE);
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_POOL_STATE, state);
+
+ hostid = 0;
+ if (nvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_HOSTID, hostid);
+ hostname = fnvlist_lookup_string(tmp,
+ ZPOOL_CONFIG_HOSTNAME);
+ fnvlist_add_string(config,
+ ZPOOL_CONFIG_HOSTNAME, hostname);
+ }
+
+ config_seen = B_TRUE;
+ }
+
+ /*
+ * Add this top-level vdev to the child array, growing
+ * the array first when its id lies beyond the current
+ * end. The vdev tree is duplicated into slot 'id'.
+ */
+ verify(nvlist_lookup_nvlist(tmp,
+ ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
+ verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
+ &id) == 0);
+
+ if (id >= children) {
+ nvlist_t **newchild;
+
+ newchild = zfs_alloc(hdl, (id + 1) *
+ sizeof (nvlist_t *));
+ if (newchild == NULL)
+ goto nomem;
+
+ for (c = 0; c < children; c++)
+ newchild[c] = child[c];
+
+ free(child);
+ child = newchild;
+ children = id + 1;
+ }
+ if (nvlist_dup(nvtop, &child[id], 0) != 0)
+ goto nomem;
+
+ }
+
+ /*
+ * If we have information about all the top-levels then
+ * clean up the nvlist which we've constructed. This
+ * means removing any extraneous devices that are
+ * beyond the valid range or adding devices to the end
+ * of our array which appear to be missing.
+ */
+ if (valid_top_config) {
+ if (max_id < children) {
+ for (c = max_id; c < children; c++)
+ nvlist_free(child[c]);
+ children = max_id;
+ } else if (max_id > children) {
+ nvlist_t **newchild;
+
+ newchild = zfs_alloc(hdl, (max_id) *
+ sizeof (nvlist_t *));
+ if (newchild == NULL)
+ goto nomem;
+
+ for (c = 0; c < children; c++)
+ newchild[c] = child[c];
+
+ free(child);
+ child = newchild;
+ children = max_id;
+ }
+ }
+
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &guid) == 0);
+
+ /*
+ * The vdev namespace may contain holes as a result of
+ * device removal. We must add them back into the vdev
+ * tree before we process any missing devices.
+ */
+ if (holes > 0) {
+ ASSERT(valid_top_config);
+
+ for (c = 0; c < children; c++) {
+ nvlist_t *holey;
+
+ if (child[c] != NULL ||
+ !vdev_is_hole(hole_array, holes, c))
+ continue;
+
+ if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
+ 0) != 0)
+ goto nomem;
+
+ /*
+ * Holes in the namespace are treated as
+ * "hole" top-level vdevs and have a
+ * special flag set on them.
+ */
+ if (nvlist_add_string(holey,
+ ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) != 0 ||
+ nvlist_add_uint64(holey,
+ ZPOOL_CONFIG_ID, c) != 0 ||
+ nvlist_add_uint64(holey,
+ ZPOOL_CONFIG_GUID, 0ULL) != 0) {
+ nvlist_free(holey);
+ goto nomem;
+ }
+ child[c] = holey;
+ }
+ }
+
+ /*
+ * Look for any missing top-level vdevs. If this is the case,
+ * create a faked up 'missing' vdev as a placeholder. We cannot
+ * simply compress the child array, because the kernel performs
+ * certain checks to make sure the vdev IDs match their location
+ * in the configuration.
+ */
+ for (c = 0; c < children; c++) {
+ if (child[c] == NULL) {
+ nvlist_t *missing;
+ if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
+ 0) != 0)
+ goto nomem;
+ if (nvlist_add_string(missing,
+ ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_MISSING) != 0 ||
+ nvlist_add_uint64(missing,
+ ZPOOL_CONFIG_ID, c) != 0 ||
+ nvlist_add_uint64(missing,
+ ZPOOL_CONFIG_GUID, 0ULL) != 0) {
+ nvlist_free(missing);
+ goto nomem;
+ }
+ child[c] = missing;
+ }
+ }
+
+ /*
+ * Put all of this pool's top-level vdevs into a root vdev.
+ * Once the array has been added to the root vdev, our own
+ * copies of the children are freed and the array is reset
+ * for the next pool.
+ */
+ if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
+ goto nomem;
+ if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) != 0 ||
+ nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
+ nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
+ nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ child, children) != 0) {
+ nvlist_free(nvroot);
+ goto nomem;
+ }
+
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+ children = 0;
+ child = NULL;
+
+ /*
+ * Go through and fix up any paths and/or devids based on our
+ * known list of vdev GUID -> path mappings.
+ */
+ if (fix_paths(hdl, nvroot, pl->names) != 0) {
+ nvlist_free(nvroot);
+ goto nomem;
+ }
+
+ /*
+ * Add the root vdev to this pool's configuration.
+ */
+ if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ nvroot) != 0) {
+ nvlist_free(nvroot);
+ goto nomem;
+ }
+ nvlist_free(nvroot);
+
+ /*
+ * zdb uses this path to report on active pools that were
+ * imported or created using -R.
+ */
+ if (active_ok)
+ goto add_pool;
+
+ /*
+ * Determine if this pool is currently active, in which case we
+ * can't actually import it.
+ */
+ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &name) == 0);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &guid) == 0);
+
+ if (pool_active(hdl, name, guid, &isactive) != 0)
+ goto error;
+
+ if (isactive) {
+ nvlist_free(config);
+ config = NULL;
+ continue;
+ }
+
+ if (policy != NULL) {
+ if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
+ policy) != 0)
+ goto nomem;
+ }
+
+ if ((nvl = refresh_config(hdl, config)) == NULL) {
+ nvlist_free(config);
+ config = NULL;
+ continue;
+ }
+
+ nvlist_free(config);
+ config = nvl;
+
+ /*
+ * Go through and update the paths for spares, now that we have
+ * them.
+ */
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ for (i = 0; i < nspares; i++) {
+ if (fix_paths(hdl, spares[i], pl->names) != 0)
+ goto nomem;
+ }
+ }
+
+ /*
+ * Update the paths for l2cache devices.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ for (i = 0; i < nl2cache; i++) {
+ if (fix_paths(hdl, l2cache[i], pl->names) != 0)
+ goto nomem;
+ }
+ }
+
+ /*
+ * Restore the original information read from the actual label.
+ * 'config' is now the nvlist returned by refresh_config(), so
+ * its hostid/hostname are replaced with the values captured
+ * from the label earlier (a hostid of 0 means none was found).
+ */
+ (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
+ DATA_TYPE_UINT64);
+ (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
+ DATA_TYPE_STRING);
+ if (hostid != 0) {
+ verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
+ hostid) == 0);
+ verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
+ hostname) == 0);
+ }
+
+add_pool:
+ /*
+ * Add this pool to the list of configs.
+ */
+ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &name) == 0);
+
+ if (nvlist_add_nvlist(ret, name, config) != 0)
+ goto nomem;
+
+ nvlist_free(config);
+ config = NULL;
+ }
+
+ return (ret);
+
+nomem:
+ (void) no_memory(hdl);
+error:
+ nvlist_free(config);
+ nvlist_free(ret);
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+
+ return (NULL);
+}
+
+/*
+ * Compute the byte offset of label 'l' on a device of 'size' bytes.
+ * Labels in the first half of the label set are laid out from the start
+ * of the device; the remaining ones occupy the last VDEV_LABELS
+ * label-sized slots before 'size'.
+ */
+static uint64_t
+label_offset(uint64_t size, int l)
+{
+	uint64_t offset;
+
+	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
+
+	offset = l * sizeof (vdev_label_t);
+	if (l >= VDEV_LABELS / 2)
+		offset += size - VDEV_LABELS * sizeof (vdev_label_t);
+
+	return (offset);
+}
+
+/*
+ * Given a file descriptor, read the label information and return an nvlist
+ * describing the configuration, if there is one.  The number of valid
+ * labels found will be returned in num_labels when non-NULL.
+ *
+ * The first valid label becomes the returned config; later labels only
+ * count toward num_labels when their vdev guid matches that first label.
+ * '*config' is NULL when no valid label was found.
+ *
+ * Returns 0 on success, including the case where the device cannot be
+ * stat'ed (no config, zero labels), and -1 if the label buffer cannot be
+ * allocated.  '*config' and '*num_labels' are defined on every return.
+ */
+int
+zpool_read_label(int fd, nvlist_t **config, int *num_labels)
+{
+	struct stat64 statbuf;
+	int l, count = 0;
+	vdev_label_t *label;
+	nvlist_t *expected_config = NULL;
+	uint64_t expected_guid = 0, size;
+	int error;
+
+	/* Give both outputs a defined value before any early return. */
+	*config = NULL;
+	if (num_labels != NULL)
+		*num_labels = 0;
+
+	if (fstat64_blk(fd, &statbuf) == -1)
+		return (0);
+	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+
+	error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
+	if (error)
+		return (-1);
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		uint64_t state, guid, txg;
+		nvlist_t *cur = NULL;
+
+		/* Unreadable or short label: try the next one. */
+		if (pread64(fd, label, sizeof (vdev_label_t),
+		    label_offset(size, l)) != sizeof (vdev_label_t))
+			continue;
+
+		/*
+		 * Unpack into a local so the caller's pointer never refers
+		 * to a list that has already been freed.
+		 */
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), &cur, 0) != 0)
+			continue;
+
+		if (nvlist_lookup_uint64(cur, ZPOOL_CONFIG_GUID,
+		    &guid) != 0 || guid == 0) {
+			nvlist_free(cur);
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(cur, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state > POOL_STATE_L2CACHE) {
+			nvlist_free(cur);
+			continue;
+		}
+
+		/* Spares and l2cache devices are exempt from the txg check. */
+		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+		    (nvlist_lookup_uint64(cur, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0)) {
+			nvlist_free(cur);
+			continue;
+		}
+
+		if (expected_guid) {
+			/* Only labels agreeing with the first one count. */
+			if (expected_guid == guid)
+				count++;
+
+			nvlist_free(cur);
+		} else {
+			expected_config = cur;
+			expected_guid = guid;
+			count++;
+		}
+	}
+
+	if (num_labels != NULL)
+		*num_labels = count;
+
+	free(label);
+	*config = expected_config;
+
+	return (0);
+}
+
+typedef struct rdsk_node {
+ char *rn_name; /* Full path to device */
+ int rn_order; /* Preferred order (low to high) */
+ int rn_num_labels; /* Number of valid labels */
+ uint64_t rn_vdev_guid; /* Expected vdev guid when set */
+ libpc_handle_t *rn_hdl; /* Handle used for allocation and errors */
+ nvlist_t *rn_config; /* Label config */
+ avl_tree_t *rn_avl; /* Slice cache this node belongs to */
+ avl_node_t rn_node; /* Linkage within rn_avl */
+ pthread_mutex_t *rn_lock; /* Held around rn_avl find/insert */
+ boolean_t rn_labelpaths; /* Also add the paths named in the label */
+} rdsk_node_t;
+
+/*
+ * AVL comparator for the slice cache: order by vdev guid first and by full
+ * path second.  Keying on both allows several entries to share one path,
+ * which is needed because multiple block devices may carry labels that
+ * name the same ZPOOL_CONFIG_PATH yet have different vdev guids; every
+ * such entry must be cached.  This happens with overwritten pool labels,
+ * devices visible from multiple hosts, and multipath devices.
+ */
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+	const rdsk_node_t *lhs = arg1;
+	const rdsk_node_t *rhs = arg2;
+	int cmp;
+
+	cmp = AVL_CMP(lhs->rn_vdev_guid, rhs->rn_vdev_guid);
+	if (cmp != 0)
+		return (cmp);
+
+	return (AVL_ISIGN(strcmp(lhs->rn_name, rhs->rn_name)));
+}
+
+/*
+ * Return B_TRUE when 'dev' names a watchdog device: either exactly
+ * "watchdog" or "watchdog" immediately followed by a digit (and anything
+ * after that).
+ */
+static boolean_t
+is_watchdog_dev(char *dev)
+{
+	static const char prefix[] = "watchdog";
+	const size_t plen = sizeof (prefix) - 1;
+
+	/* Every match starts with the "watchdog" prefix. */
+	if (strncmp(dev, prefix, plen) != 0)
+		return (B_FALSE);
+
+	/* For 'watchdog' dev */
+	if (dev[plen] == '\0')
+		return (B_TRUE);
+
+	/*
+	 * For 'watchdog<digit><whatever>'.  The cast keeps isdigit() defined
+	 * for bytes that are negative as a plain char.
+	 */
+	if (isdigit((unsigned char)dev[plen]))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Recursively search the vdev tree rooted at 'nvroot' for the leaf whose
+ * guid equals 'vdev_guid'.  When it is found, '*path' and '*devid' are
+ * pointed at its ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID strings; either
+ * is left untouched if the leaf lacks that entry.  The strings belong to
+ * the nvlist and are not copied.
+ *
+ * Returns 0, or the first non-zero value returned by a recursive call.
+ * 'hdl' and 'pool_guid' are only passed along to the recursion.
+ */
+static int
+label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
+    uint64_t vdev_guid, char **path, char **devid)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	uint64_t guid;
+	char *val;
+	int error;
+
+	/* Reject a missing tree before handing it to any lookup. */
+	if (nvroot == NULL)
+		return (0);
+
+	/* Interior vdev: descend into each child. */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			error = label_paths_impl(hdl, child[c],
+			    pool_guid, vdev_guid, path, devid);
+			if (error)
+				return (error);
+		}
+		return (0);
+	}
+
+	/* Leaf vdev: ignore it unless it is the one we are looking for. */
+	error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
+	if ((error != 0) || (guid != vdev_guid))
+		return (0);
+
+	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
+	if (error == 0)
+		*path = val;
+
+	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
+	if (error == 0)
+		*devid = val;
+
+	return (0);
+}
+
+/*
+ * Given a disk label, fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
+ * of the vdev the label describes and store them in '*path' and '*devid'.
+ * Both are set to NULL first and stay NULL when the value is absent.  The
+ * returned pointers are only valid as long as 'label' remains valid.
+ *
+ * Returns ENOENT when the label has no vdev tree, pool guid or vdev guid;
+ * otherwise the result of label_paths_impl().
+ */
+static int
+label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid)
+{
+	nvlist_t *tree;
+	uint64_t pguid;
+	uint64_t vguid;
+
+	*path = NULL;
+	*devid = NULL;
+
+	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0)
+		return (ENOENT);
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0)
+		return (ENOENT);
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vguid) != 0)
+		return (ENOENT);
+
+	return (label_paths_impl(hdl, tree, pguid, vguid, path, devid));
+}
+
+static void zpool_open_func(void *arg);
+
+/*
+ * Helper for zpool_open_func(): speculatively add a cache entry for a path
+ * that was found in rn's label and probe it right away.
+ *
+ * 'name' must be heap allocated and ownership passes to this function; a
+ * NULL 'name' (failed allocation in the caller) is ignored.  The entry is
+ * dropped again when an identical one is already cached.
+ */
+static void
+zpool_open_func_add_path(rdsk_node_t *rn, char *name, uint64_t vdev_guid,
+    int order)
+{
+	libpc_handle_t *hdl = rn->rn_hdl;
+	rdsk_node_t *slice;
+	avl_index_t where;
+
+	if (name == NULL)
+		return;
+
+	if ((slice = zfs_alloc(hdl, sizeof (rdsk_node_t))) == NULL) {
+		free(name);
+		return;
+	}
+
+	slice->rn_name = name;
+	slice->rn_vdev_guid = vdev_guid;
+	slice->rn_avl = rn->rn_avl;
+	slice->rn_hdl = hdl;
+	slice->rn_order = order;
+	slice->rn_labelpaths = B_FALSE;
+
+	pthread_mutex_lock(rn->rn_lock);
+	if (avl_find(rn->rn_avl, slice, &where)) {
+		pthread_mutex_unlock(rn->rn_lock);
+		free(slice->rn_name);
+		free(slice);
+	} else {
+		avl_insert(rn->rn_avl, slice, where);
+		pthread_mutex_unlock(rn->rn_lock);
+		/* Probe outside the lock; labelpaths is off, so no recursion */
+		zpool_open_func(slice);
+	}
+}
+
+/*
+ * Probe the device or file named by the rdsk_node_t passed in 'arg'.
+ *
+ * When a valid label with the expected vdev guid is found, the label
+ * config and the number of valid labels are stored in the node.  If the
+ * node has rn_labelpaths set, the path and devid recorded in the label are
+ * additionally added to the cache and probed.  Every failure leaves the
+ * node without a config.
+ */
+static void
+zpool_open_func(void *arg)
+{
+	rdsk_node_t *rn = arg;
+	libpc_handle_t *hdl = rn->rn_hdl;
+	struct stat64 statbuf;
+	nvlist_t *config;
+	char *bname, *dupname;
+	uint64_t vdev_guid = 0;
+	int error;
+	int num_labels = 0;
+	int fd;
+
+	/*
+	 * Skip devices with well known prefixes there can be side effects
+	 * when opening devices which need to be avoided.
+	 *
+	 * hpet     - High Precision Event Timer
+	 * watchdog - Watchdog must be closed in a special way.
+	 */
+	dupname = zfs_strdup(hdl, rn->rn_name);
+	if (dupname == NULL)
+		return;
+	bname = basename(dupname);
+	error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
+	free(dupname);
+	if (error)
+		return;
+
+	/*
+	 * Ignore failed stats.  We only want regular files and block devices.
+	 */
+	if (stat64(rn->rn_name, &statbuf) != 0 ||
+	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
+		return;
+
+	/*
+	 * Preferentially open using O_DIRECT to bypass the block device
+	 * cache which may be stale for multipath devices.  An EINVAL errno
+	 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
+	 */
+	fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
+	if ((fd < 0) && (errno == EINVAL))
+		fd = open(rn->rn_name, O_RDONLY);
+
+	/* Remember permission problems so they can be reported later. */
+	if ((fd < 0) && (errno == EACCES))
+		hdl->lpc_open_access_error = B_TRUE;
+
+	if (fd < 0)
+		return;
+
+	/*
+	 * This file is too small to hold a zpool
+	 */
+	if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	}
+
+	error = zpool_read_label(fd, &config, &num_labels);
+	if (error != 0) {
+		(void) close(fd);
+		return;
+	}
+
+	if (num_labels == 0) {
+		(void) close(fd);
+		nvlist_free(config);
+		return;
+	}
+
+	/*
+	 * Check that the vdev is for the expected guid.  Additional entries
+	 * are speculatively added based on the paths stored in the labels.
+	 * Entries with valid paths but incorrect guids must be removed.
+	 */
+	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
+		(void) close(fd);
+		nvlist_free(config);
+		return;
+	}
+
+	(void) close(fd);
+
+	rn->rn_config = config;
+	rn->rn_num_labels = num_labels;
+
+	/*
+	 * Add additional entries for paths described by this label.
+	 */
+	if (rn->rn_labelpaths) {
+		char *path = NULL;
+		char *devid = NULL;
+
+		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
+			return;
+
+		/*
+		 * Allow devlinks to stabilize so all paths are available.
+		 */
+		zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);
+
+		if (path != NULL) {
+			zpool_open_func_add_path(rn, zfs_strdup(hdl, path),
+			    vdev_guid, IMPORT_ORDER_PREFERRED_1);
+		}
+
+		if (devid != NULL) {
+			char *name;
+
+			if (asprintf(&name, "%s%s", DEV_BYID_PATH,
+			    devid) == -1)
+				return;
+
+			zpool_open_func_add_path(rn, name, vdev_guid,
+			    IMPORT_ORDER_PREFERRED_2);
+		}
+	}
+}
+
+/*
+ * Add an entry for "<path>/<name>" to the slice cache unless an identical
+ * one is already there.  Entries added by a scan carry no expected vdev
+ * guid and their order is offset by IMPORT_ORDER_SCAN_OFFSET.  Allocation
+ * failures are not reported; the entry is simply not added.
+ */
+static void
+zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *path, const char *name, int order)
+{
+	avl_index_t where;
+	rdsk_node_t *slice;
+
+	slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+	if (slice == NULL)
+		return;
+
+	if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
+		free(slice);
+		return;
+	}
+	slice->rn_vdev_guid = 0;
+	slice->rn_lock = lock;
+	slice->rn_avl = cache;
+	slice->rn_hdl = hdl;
+	slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
+	slice->rn_labelpaths = B_FALSE;
+
+	/* Lookup and insert must happen under the same lock hold. */
+	pthread_mutex_lock(lock);
+	if (avl_find(cache, slice, &where)) {
+		free(slice->rn_name);
+		free(slice);
+	} else {
+		avl_insert(cache, slice, where);
+	}
+	pthread_mutex_unlock(lock);
+}
+
+/*
+ * Add every entry of directory 'dir' (other than "." and "..") to the
+ * slice cache.  A directory that does not exist is silently skipped.
+ *
+ * Returns 0 on success, or the errno from realpath()/opendir() after
+ * recording an EZFS_BADPATH error on the handle.
+ */
+static int
+zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *dir, int order)
+{
+	char resolved[MAXPATHLEN];
+	struct dirent64 *ent;
+	DIR *handle;
+	int err;
+
+	if (realpath(dir, resolved) == NULL) {
+		err = errno;
+		if (err == ENOENT)
+			return (0);
+
+		zfs_error_aux(hdl, strerror(err));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
+		return (err);
+	}
+
+	if ((handle = opendir(resolved)) == NULL) {
+		err = errno;
+		zfs_error_aux(hdl, strerror(err));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), resolved);
+		return (err);
+	}
+
+	for (ent = readdir64(handle); ent != NULL; ent = readdir64(handle)) {
+		const char *entry = ent->d_name;
+
+		/* Skip the self and parent links. */
+		if (strcmp(entry, ".") == 0 || strcmp(entry, "..") == 0)
+			continue;
+
+		zpool_find_import_scan_add_slice(hdl, lock, cache, resolved,
+		    entry, order);
+	}
+
+	(void) closedir(handle);
+	return (0);
+}
+
+/*
+ * Add the single, non-directory path 'dir' to the slice cache.  A path
+ * whose parent directory does not exist is silently skipped.
+ *
+ * Returns 0 on success, ENOMEM if the path cannot be duplicated, or the
+ * errno from realpath() after recording an EZFS_BADPATH error on the
+ * handle.
+ */
+static int
+zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *dir, int order)
+{
+	int error = 0;
+	char path[MAXPATHLEN];
+	char *d, *b;
+	char *dpath, *name;
+
+	/*
+	 * Separate the directory part and last part of the
+	 * path. We do this so that we can get the realpath of
+	 * the directory. We don't get the realpath on the
+	 * whole path because if it's a symlink, we want the
+	 * path of the symlink not where it points to.
+	 *
+	 * dirname() and basename() may modify their argument, hence the
+	 * two private copies.
+	 */
+	d = zfs_strdup(hdl, dir);
+	b = zfs_strdup(hdl, dir);
+	if (d == NULL || b == NULL) {
+		error = ENOMEM;
+		goto out;
+	}
+	dpath = dirname(d);
+	name = basename(b);
+
+	if (realpath(dpath, path) == NULL) {
+		error = errno;
+		if (error == ENOENT) {
+			error = 0;
+			goto out;
+		}
+
+		zfs_error_aux(hdl, strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
+		goto out;
+	}
+
+	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
+
+out:
+	free(b);
+	free(d);
+	return (error);
+}
+
+/*
+ * Scan a list of directories for zfs devices.
+ *
+ * Each of the 'dirs' entries in 'dir' is either a directory, whose
+ * entries are all added to a newly created slice cache, or a single path
+ * that is added by itself.  Entries that do not exist are skipped.  The
+ * index of the entry is passed on as the scan order of what it adds.
+ *
+ * On success 0 is returned and '*slice_cache' holds the new cache.  On
+ * failure the cache is torn down, '*slice_cache' is NULL, and an errno
+ * value is returned (ENOMEM if the cache itself cannot be allocated).
+ */
+static int
+zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t **slice_cache, char **dir, int dirs)
+{
+	avl_tree_t *cache;
+	rdsk_node_t *slice;
+	void *cookie;
+	int i, error;
+
+	*slice_cache = NULL;
+	cache = zfs_alloc(hdl, sizeof (avl_tree_t));
+	if (cache == NULL)
+		return (ENOMEM);
+	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
+	    offsetof(rdsk_node_t, rn_node));
+
+	for (i = 0; i < dirs; i++) {
+		struct stat sbuf;
+
+		if (stat(dir[i], &sbuf) != 0) {
+			error = errno;
+			if (error == ENOENT)
+				continue;
+
+			zfs_error_aux(hdl, strerror(error));
+			(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
+			goto error;
+		}
+
+		/*
+		 * If dir[i] is a directory, we walk through it and add all
+		 * the entries to the cache. If it's not a directory, we just
+		 * add it to the cache.
+		 */
+		if (S_ISDIR(sbuf.st_mode)) {
+			if ((error = zpool_find_import_scan_dir(hdl, lock,
+			    cache, dir[i], i)) != 0)
+				goto error;
+		} else {
+			if ((error = zpool_find_import_scan_path(hdl, lock,
+			    cache, dir[i], i)) != 0)
+				goto error;
+		}
+	}
+
+	*slice_cache = cache;
+	return (0);
+
+error:
+	/* Release every node gathered so far, then the tree itself. */
+	cookie = NULL;
+	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
+		free(slice->rn_name);
+		free(slice);
+	}
+	free(cache);
+
+	return (error);
+}
+
+/*
+ * Built-in list of locations searched for pool member devices. It is used
+ * by zpool_find_import_impl() when the caller supplies no paths, and by
+ * zfs_path_order() when ZPOOL_IMPORT_PATH is not set in the environment.
+ * An entry's array index is what those callers record as a device's
+ * search order, so entries listed earlier get a lower order value.
+ */
+static char *
+zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
+ "/dev/disk/by-vdev", /* Custom rules, use first if they exist */
+ "/dev/mapper", /* Use multipath devices before components */
+ "/dev/disk/by-partlabel", /* Single unique entry set by user */
+ "/dev/disk/by-partuuid", /* Generated partition uuid */
+ "/dev/disk/by-label", /* Custom persistent labels */
+ "/dev/disk/by-uuid", /* Single unique entry and persistent */
+ "/dev/disk/by-id", /* May be multiple entries and persistent */
+ "/dev/disk/by-path", /* Encodes physical location and persistent */
+ "/dev" /* UNSAFE device names will change */
+};
+
+/*
+ * Expose the built-in device search path table to callers.
+ *
+ * The number of entries is stored in *count and a read-only view of
+ * zpool_default_import_path[] is returned.
+ */
+const char * const *
+zpool_default_search_paths(size_t *count)
+{
+	const char * const *paths =
+	    (const char * const *)zpool_default_import_path;
+
+	*count = DEFAULT_IMPORT_PATH_SIZE;
+	return (paths);
+}
+
+/*
+ * Given a full path to a device determine if that device appears in the
+ * import search path. If it does return the first match and store the
+ * index in the passed 'order' variable, otherwise return an error.
+ *
+ * The search path is the colon separated ZPOOL_IMPORT_PATH environment
+ * variable when it is set, otherwise zpool_default_import_path[]. A
+ * match is a plain string-prefix comparison of 'name' against each entry.
+ *
+ * Returns 0 on a match, ENOENT when nothing matches, or ENOMEM if the
+ * environment value could not be duplicated. '*order' is only written
+ * on a match.
+ */
+static int
+zfs_path_order(char *name, int *order)
+{
+	int i, error = ENOENT;
+	char *dir, *env, *envdup;
+	char *saveptr = NULL;
+
+	env = getenv("ZPOOL_IMPORT_PATH");
+	if (env != NULL) {
+		/* Tokenizing writes into the string, so work on a copy. */
+		envdup = strdup(env);
+		if (envdup == NULL)
+			return (ENOMEM);
+
+		/*
+		 * strtok_r() keeps its position in 'saveptr' rather than
+		 * in hidden static state, so this stays correct even if
+		 * another thread is tokenizing at the same time.
+		 */
+		for (i = 0, dir = strtok_r(envdup, ":", &saveptr);
+		    dir != NULL;
+		    i++, dir = strtok_r(NULL, ":", &saveptr)) {
+			if (strncmp(name, dir, strlen(dir)) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+		free(envdup);
+	} else {
+		for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
+			if (strncmp(name, zpool_default_import_path[i],
+			    strlen(zpool_default_import_path[i])) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Use libblkid to quickly enumerate all known zfs devices.
+ *
+ * hdl         - library handle used for allocation and error reporting
+ * lock        - mutex stored in every entry and held while the tree is
+ *               modified
+ * slice_cache - out: newly allocated AVL tree with one entry per device
+ *               that blkid reports as TYPE "zfs_member"; NULL on failure
+ *
+ * Returns 0 on success. On failure a non-zero value is returned (the
+ * libblkid return code, EINVAL or ENOMEM) and nothing is left allocated.
+ */
+static int
+zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t **slice_cache)
+{
+	rdsk_node_t *slice;
+	blkid_cache cache;
+	blkid_dev_iterate iter;
+	blkid_dev dev;
+	avl_tree_t *tree;
+	avl_index_t where;
+	void *cookie;
+	int error;
+
+	*slice_cache = NULL;
+
+	error = blkid_get_cache(&cache, NULL);
+	if (error != 0)
+		return (error);
+
+	error = blkid_probe_all_new(cache);
+	if (error != 0) {
+		blkid_put_cache(cache);
+		return (error);
+	}
+
+	iter = blkid_dev_iterate_begin(cache);
+	if (iter == NULL) {
+		blkid_put_cache(cache);
+		return (EINVAL);
+	}
+
+	error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
+	if (error != 0) {
+		blkid_dev_iterate_end(iter);
+		blkid_put_cache(cache);
+		return (error);
+	}
+
+	/*
+	 * Build the tree privately and only publish it through
+	 * '*slice_cache' once it is complete, so a failure part way
+	 * through never hands the caller a half-built tree.
+	 */
+	tree = zfs_alloc(hdl, sizeof (avl_tree_t));
+	if (tree == NULL) {
+		blkid_dev_iterate_end(iter);
+		blkid_put_cache(cache);
+		return (ENOMEM);
+	}
+	avl_create(tree, slice_cache_compare, sizeof (rdsk_node_t),
+	    offsetof(rdsk_node_t, rn_node));
+
+	/* 'error' is 0 here and only changes on allocation failure. */
+	while (blkid_dev_next(iter, &dev) == 0) {
+		slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+		if (slice == NULL) {
+			error = ENOMEM;
+			break;
+		}
+
+		/* NOTE(review): assumes zfs_strdup() can return NULL. */
+		slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
+		if (slice->rn_name == NULL) {
+			free(slice);
+			error = ENOMEM;
+			break;
+		}
+		slice->rn_vdev_guid = 0;
+		slice->rn_lock = lock;
+		slice->rn_avl = tree;
+		slice->rn_hdl = hdl;
+		slice->rn_labelpaths = B_TRUE;
+
+		/*
+		 * Devices found under a known search path are ordered by
+		 * that path's index; anything else gets the default order.
+		 */
+		if (zfs_path_order(slice->rn_name, &slice->rn_order) == 0)
+			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
+		else
+			slice->rn_order = IMPORT_ORDER_DEFAULT;
+
+		pthread_mutex_lock(lock);
+		if (avl_find(tree, slice, &where)) {
+			/* Already present, drop the duplicate. */
+			free(slice->rn_name);
+			free(slice);
+		} else {
+			avl_insert(tree, slice, where);
+		}
+		pthread_mutex_unlock(lock);
+	}
+
+	blkid_dev_iterate_end(iter);
+	blkid_put_cache(cache);
+
+	if (error != 0) {
+		/* Nothing else references the tree yet; tear it down. */
+		cookie = NULL;
+		while ((slice = avl_destroy_nodes(tree, &cookie)) != NULL) {
+			free(slice->rn_name);
+			free(slice);
+		}
+		avl_destroy(tree);
+		free(tree);
+		return (error);
+	}
+
+	*slice_cache = tree;
+	return (0);
+}
+
+/*
+ * Find all pools stored on disk. This includes partial pools which are
+ * not available to import.
+ *
+ * Member devices are located by directory scanning when iarg->scan is set
+ * or iarg->paths is non-zero (the default search path is used if no paths
+ * were given), and through libblkid otherwise. iarg->poolname or
+ * iarg->guid (but not both) are provided by the caller when trying to
+ * import a specific pool.
+ *
+ * Returns the nvlist built by get_configs(), or NULL if device discovery
+ * failed.
+ */
+static nvlist_t *
+zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg)
+{
+	nvlist_t *ret = NULL;
+	pool_list_t pools = { 0 };
+	pool_entry_t *pe, *penext;
+	vdev_entry_t *ve, *venext;
+	config_entry_t *ce, *cenext;
+	name_entry_t *ne, *nenext;
+	pthread_mutex_t lock;
+	avl_tree_t *cache;
+	rdsk_node_t *slice;
+	void *cookie;
+	tpool_t *t;
+
+	verify(iarg->poolname == NULL || iarg->guid == 0);
+	pthread_mutex_init(&lock, NULL);
+
+	/*
+	 * Locate pool member vdevs using libblkid or by directory scanning.
+	 * On success a newly allocated AVL tree which is populated with an
+	 * entry for each discovered vdev will be returned as the cache.
+	 * It's the caller's responsibility to consume and destroy this tree.
+	 * On failure the mutex initialized above must still be destroyed.
+	 */
+	if (iarg->scan || iarg->paths != 0) {
+		int dirs = iarg->paths;
+		char **dir = iarg->path;
+
+		if (dirs == 0) {
+			dir = zpool_default_import_path;
+			dirs = DEFAULT_IMPORT_PATH_SIZE;
+		}
+
+		if (zpool_find_import_scan(hdl, &lock, &cache, dir,
+		    dirs) != 0) {
+			pthread_mutex_destroy(&lock);
+			return (NULL);
+		}
+	} else {
+		if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) {
+			pthread_mutex_destroy(&lock);
+			return (NULL);
+		}
+	}
+
+	/*
+	 * Create a thread pool to parallelize the process of reading and
+	 * validating labels, a large number of threads can be used due to
+	 * minimal contention.
+	 */
+	t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
+	for (slice = avl_first(cache); slice;
+	    (slice = avl_walk(cache, slice, AVL_AFTER)))
+		(void) tpool_dispatch(t, zpool_open_func, slice);
+
+	tpool_wait(t);
+	tpool_destroy(t);
+
+	/*
+	 * Process the cache filtering out any entries which are not
+	 * for the specified pool then adding matching label configs.
+	 */
+	cookie = NULL;
+	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
+		if (slice->rn_config != NULL) {
+			nvlist_t *config = slice->rn_config;
+			boolean_t matched = B_TRUE;
+			boolean_t aux = B_FALSE;
+			int fd;
+
+			/*
+			 * Check if it's a spare or l2cache device. If it is,
+			 * we need to skip the name and guid check since they
+			 * don't exist on aux device label.
+			 */
+			if (iarg->poolname != NULL || iarg->guid != 0) {
+				uint64_t state;
+				aux = nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
+				    (state == POOL_STATE_SPARE ||
+				    state == POOL_STATE_L2CACHE);
+			}
+
+			if (iarg->poolname != NULL && !aux) {
+				char *pname;
+
+				matched = nvlist_lookup_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
+				    strcmp(iarg->poolname, pname) == 0;
+			} else if (iarg->guid != 0 && !aux) {
+				uint64_t this_guid;
+
+				matched = nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
+				    iarg->guid == this_guid;
+			}
+			if (matched) {
+				/*
+				 * Verify all remaining entries can be opened
+				 * exclusively. This will prune all underlying
+				 * multipath devices which otherwise could
+				 * result in the vdev appearing as UNAVAIL.
+				 *
+				 * Under zdb, this step isn't required and
+				 * would prevent a zdb -e of active pools with
+				 * no cachefile.
+				 */
+				fd = open(slice->rn_name, O_RDONLY | O_EXCL);
+				if (fd >= 0 || iarg->can_be_active) {
+					if (fd >= 0)
+						close(fd);
+					add_config(hdl, &pools,
+					    slice->rn_name, slice->rn_order,
+					    slice->rn_num_labels, config);
+				}
+			}
+			nvlist_free(config);
+		}
+		free(slice->rn_name);
+		free(slice);
+	}
+	avl_destroy(cache);
+	free(cache);
+	pthread_mutex_destroy(&lock);
+
+	ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
+
+	/* Release the intermediate pool/vdev/config lists. */
+	for (pe = pools.pools; pe != NULL; pe = penext) {
+		penext = pe->pe_next;
+		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
+			venext = ve->ve_next;
+			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
+				cenext = ce->ce_next;
+				nvlist_free(ce->ce_config);
+				free(ce);
+			}
+			free(ve);
+		}
+		free(pe);
+	}
+
+	/* Release the device name list. */
+	for (ne = pools.names; ne != NULL; ne = nenext) {
+		nenext = ne->ne_next;
+		free(ne->ne_name);
+		free(ne);
+	}
+
+	return (ret);
+}
+
+/*
+ * Load the pool configurations stored in a cache file and return the ones
+ * that can be imported as an nvlist keyed by the names used in the file.
+ *
+ * At most one of 'poolname' and 'guid' may be given; when one is, only
+ * the matching pool is considered. Pools that pool_active() reports as
+ * active are left out. Each remaining config has the cache file path
+ * recorded in it and is passed through refresh_config() before being
+ * added to the result.
+ *
+ * Returns NULL on any failure.
+ */
+static nvlist_t *
+zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile,
+    const char *poolname, uint64_t guid)
+{
+	struct stat64 statbuf;
+	nvlist_t *raw, *pools;
+	nvlist_t *src, *dst;
+	nvpair_t *elem;
+	boolean_t active;
+	uint64_t this_guid;
+	char *name;
+	char *buf;
+	int fd;
+
+	verify(poolname == NULL || guid == 0);
+
+	fd = open(cachefile, O_RDONLY);
+	if (fd < 0) {
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "failed to open cache file"));
+		return (NULL);
+	}
+
+	if (fstat64(fd, &statbuf) != 0) {
+		/* Record errno before close() has a chance to change it. */
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		(void) close(fd);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
+		return (NULL);
+	}
+
+	buf = zfs_alloc(hdl, statbuf.st_size);
+	if (buf == NULL) {
+		(void) close(fd);
+		return (NULL);
+	}
+
+	/* The whole file has to arrive in a single read. */
+	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
+		(void) close(fd);
+		free(buf);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN,
+		    "failed to read cache file contents"));
+		return (NULL);
+	}
+
+	(void) close(fd);
+
+	if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
+		free(buf);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN,
+		    "invalid or corrupt cache file contents"));
+		return (NULL);
+	}
+
+	free(buf);
+
+	/*
+	 * Walk the cached configs, refreshing the state of each pool that
+	 * passes the name/guid filter and is not active.
+	 */
+	if (nvlist_alloc(&pools, 0, 0) != 0) {
+		(void) no_memory(hdl);
+		nvlist_free(raw);
+		return (NULL);
+	}
+
+	for (elem = nvlist_next_nvpair(raw, NULL); elem != NULL;
+	    elem = nvlist_next_nvpair(raw, elem)) {
+		src = fnvpair_value_nvlist(elem);
+
+		name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
+		if (poolname != NULL && strcmp(poolname, name) != 0)
+			continue;
+
+		this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
+		if (guid != 0 && guid != this_guid)
+			continue;
+
+		if (pool_active(hdl, name, this_guid, &active) != 0)
+			goto fail;
+
+		if (active)
+			continue;
+
+		if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
+		    cachefile) != 0) {
+			(void) no_memory(hdl);
+			goto fail;
+		}
+
+		dst = refresh_config(hdl, src);
+		if (dst == NULL)
+			goto fail;
+
+		if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
+			(void) no_memory(hdl);
+			nvlist_free(dst);
+			goto fail;
+		}
+		/* 'dst' is released whether or not the add succeeded. */
+		nvlist_free(dst);
+	}
+
+	nvlist_free(raw);
+	return (pools);
+
+fail:
+	nvlist_free(raw);
+	nvlist_free(pools);
+	return (NULL);
+}
+
+/*
+ * Search for importable pools as described by 'import'.
+ *
+ * When import->cachefile is set the pools are read from that cache file,
+ * otherwise devices are discovered on disk. 'hdl' and 'pco' are stored in
+ * a temporary libpc handle that the search routines use. At most one of
+ * import->poolname and import->guid may be set.
+ *
+ * If nothing was found, the handle recorded an open access error and the
+ * effective uid is not 0, an EZFS_EACESS error is reported. The search
+ * result (possibly NULL or empty) is returned either way.
+ */
+nvlist_t *
+zpool_search_import(void *hdl, importargs_t *import,
+    const pool_config_ops_t *pco)
+{
+	libpc_handle_t handle = { 0 };
+	nvlist_t *pools;
+
+	verify(import->poolname == NULL || import->guid == 0);
+
+	handle.lpc_printerr = B_TRUE;
+	handle.lpc_ops = pco;
+	handle.lpc_lib_handle = hdl;
+
+	if (import->cachefile == NULL) {
+		pools = zpool_find_import_impl(&handle, import);
+	} else {
+		pools = zpool_find_import_cached(&handle, import->cachefile,
+		    import->poolname, import->guid);
+	}
+
+	/* Something was found, nothing further to report. */
+	if (pools != NULL && !nvlist_empty(pools))
+		return (pools);
+
+	if (handle.lpc_open_access_error && geteuid() != 0) {
+		(void) zfs_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN,
+		    "no pools found"));
+	}
+
+	return (pools);
+}
+
+/*
+ * Decide whether pool config 'cfg' is the pool named by 'tgt'.
+ *
+ * 'tgt' is first parsed with strtoull() (base auto-detected). A non-zero
+ * result is compared against the config's pool guid; otherwise 'tgt' is
+ * compared against the config's pool name. A config that lacks the
+ * needed property never matches.
+ */
+static boolean_t
+pool_match(nvlist_t *cfg, char *tgt)
+{
+	uint64_t want_guid = strtoull(tgt, NULL, 0);
+	uint64_t cfg_guid;
+	char *cfg_name;
+
+	if (want_guid == 0) {
+		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME,
+		    &cfg_name) != 0)
+			return (B_FALSE);
+		return (strcmp(cfg_name, tgt) == 0);
+	}
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &cfg_guid) != 0)
+		return (B_FALSE);
+	return (cfg_guid == want_guid);
+}
+
+/*
+ * Locate the configuration of the single pool identified by 'target'.
+ *
+ * hdl, args, pco - passed through to zpool_search_import()
+ * target         - pool name or guid; anything from the first '/' or '@'
+ *                  onward (a dataset or snapshot suffix) is ignored
+ * configp        - out: on success a private copy of the matching pool
+ *                  config, which the caller releases with nvlist_free();
+ *                  NULL on failure
+ *
+ * Returns 0 on success, ENOENT if no pool matches, EINVAL if more than
+ * one pool matches, or ENOMEM on allocation failure.
+ */
+int
+zpool_find_config(void *hdl, const char *target, nvlist_t **configp,
+    importargs_t *args, const pool_config_ops_t *pco)
+{
+	nvlist_t *pools;
+	nvlist_t *match = NULL;
+	nvlist_t *config = NULL;
+	char *sepp;
+	char *targetdup;
+	int count = 0;
+	int error = 0;
+
+	*configp = NULL;
+
+	targetdup = strdup(target);
+	if (targetdup == NULL)
+		return (ENOMEM);
+
+	/* Keep only the pool component of the target. */
+	if ((sepp = strpbrk(targetdup, "/@")) != NULL)
+		*sepp = '\0';
+
+	pools = zpool_search_import(hdl, args, pco);
+
+	if (pools != NULL) {
+		nvpair_t *elem = NULL;
+		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+			VERIFY0(nvpair_value_nvlist(elem, &config));
+			if (pool_match(config, targetdup)) {
+				count++;
+				/* Remember only the first match. */
+				if (match == NULL)
+					match = config;
+			}
+		}
+	}
+
+	if (count == 0) {
+		error = ENOENT;
+	} else if (count > 1) {
+		error = EINVAL;
+	} else if (nvlist_dup(match, configp, 0) != 0) {
+		/*
+		 * 'match' points into 'pools', so the caller is handed a
+		 * copy and the search result can be freed below instead
+		 * of being leaked.
+		 */
+		*configp = NULL;
+		error = ENOMEM;
+	}
+
+	if (pools != NULL)
+		nvlist_free(pools);
+	free(targetdup);
+
+	return (error);
+}
diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c
new file mode 100644
index 000000000..9a81011fc
--- /dev/null
+++ b/lib/libzutil/zutil_nicenum.c
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <libzutil.h>
+
+/*
+ * Convert a number to an appropriately human-readable output.
+ *
+ * num    - value to format
+ * buf    - destination buffer
+ * buflen - size of 'buf' in bytes; output is truncated to fit
+ * format - ZFS_NICENUM_1024 and ZFS_NICENUM_BYTES scale by 1024 (the
+ *          latter prints "B" for unscaled values), ZFS_NICENUM_TIME
+ *          treats 'num' as nanoseconds and scales by 1000 up to seconds,
+ *          ZFS_NICENUM_RAW prints the number unscaled, and
+ *          ZFS_NICENUM_RAWTIME does the same but prints "-" for zero.
+ */
+void
+zfs_nicenum_format(uint64_t num, char *buf, size_t buflen,
+    enum zfs_nicenum_format format)
+{
+	uint64_t n = num;
+	uint64_t divisor = 1;
+	int index = 0;
+	const char *u;
+	const char *units[3][7] = {
+	    [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"},
+	    [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"},
+	    [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"}
+	};
+
+	/*
+	 * Highest index that may be used in each units[] row. For time
+	 * that is 3 ("s"); a larger limit lets values of 1000 seconds or
+	 * more be printed with the "?" placeholder as their unit.
+	 */
+	const int units_len[] = {[ZFS_NICENUM_1024] = 6,
+	    [ZFS_NICENUM_BYTES] = 6,
+	    [ZFS_NICENUM_TIME] = 3};
+
+	const int k_unit[] = {	[ZFS_NICENUM_1024] = 1024,
+	    [ZFS_NICENUM_BYTES] = 1024,
+	    [ZFS_NICENUM_TIME] = 1000};
+
+	double val;
+	int i;
+
+	/* The raw formats do no scaling and need none of the tables. */
+	if (format == ZFS_NICENUM_RAW) {
+		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
+		return;
+	}
+	if (format == ZFS_NICENUM_RAWTIME) {
+		if (num > 0) {
+			(void) snprintf(buf, buflen, "%llu",
+			    (u_longlong_t)num);
+		} else {
+			(void) snprintf(buf, buflen, "%s", "-");
+		}
+		return;
+	}
+
+	/*
+	 * Scale down while tracking the divisor as an exact integer
+	 * (at most 1024^6 == 2^60), which avoids going through floating
+	 * point powl() to rebuild it later.
+	 */
+	while (n >= k_unit[format] && index < units_len[format]) {
+		n /= k_unit[format];
+		divisor *= k_unit[format];
+		index++;
+	}
+
+	u = units[format][index];
+
+	/* Don't print zero latencies since they're invalid */
+	if ((format == ZFS_NICENUM_TIME) && (num == 0)) {
+		(void) snprintf(buf, buflen, "-");
+	} else if ((index == 0) || ((num % divisor) == 0)) {
+		/*
+		 * If this is an even multiple of the base, always display
+		 * without any decimal precision.
+		 */
+		(void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u);
+
+	} else {
+		/*
+		 * We want to choose a precision that reflects the best choice
+		 * for fitting in 5 characters. This can get rather tricky when
+		 * we have numbers that are very close to an order of magnitude.
+		 * For example, when displaying 10239 (which is really 9.999K),
+		 * we want only a single place of precision for 10.0K. We could
+		 * develop some complex heuristics for this, but it's much
+		 * easier just to try each combination in turn.
+		 */
+		val = (double)num / divisor;
+
+		for (i = 2; i >= 0; i--) {
+			/*
+			 * Don't print floating point values for time. Note,
+			 * we use floor() instead of round() here, since
+			 * round can result in undesirable results. For
+			 * example, if "num" is in the range of
+			 * 999500-999999, it will print out "1000us". This
+			 * doesn't happen if we use floor().
+			 */
+			if (format == ZFS_NICENUM_TIME) {
+				if (snprintf(buf, buflen, "%llu%s",
+				    (u_longlong_t)floor(val), u) <= 5)
+					break;
+
+			} else {
+				if (snprintf(buf, buflen, "%.*f%s", i,
+				    val, u) <= 5)
+					break;
+			}
+		}
+	}
+}
+
+/*
+ * Format 'num' into 'buf' (at most 'buflen' bytes) using the
+ * ZFS_NICENUM_1024 format of zfs_nicenum_format().
+ */
+void
+zfs_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format fmt = ZFS_NICENUM_1024;
+
+	zfs_nicenum_format(num, buf, buflen, fmt);
+}
+
+/*
+ * Format a time value into 'buf' (at most 'buflen' bytes) using the
+ * ZFS_NICENUM_TIME format of zfs_nicenum_format().
+ * @num: Time in nanoseconds
+ */
+void
+zfs_nicetime(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format fmt = ZFS_NICENUM_TIME;
+
+	zfs_nicenum_format(num, buf, buflen, fmt);
+}
+
+/*
+ * Format 'num' into 'buf' (at most 'buflen' bytes) as a plain, unscaled
+ * number using the ZFS_NICENUM_RAW format of zfs_nicenum_format().
+ */
+void
+zfs_niceraw(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format fmt = ZFS_NICENUM_RAW;
+
+	zfs_nicenum_format(num, buf, buflen, fmt);
+}
+
+/*
+ * Format a byte count into 'buf' (at most 'buflen' bytes) using the
+ * ZFS_NICENUM_BYTES format of zfs_nicenum_format().
+ */
+void
+zfs_nicebytes(uint64_t num, char *buf, size_t buflen)
+{
+	const enum zfs_nicenum_format fmt = ZFS_NICENUM_BYTES;
+
+	zfs_nicenum_format(num, buf, buflen, fmt);
+}
diff --git a/lib/libzutil/zutil_pool.c b/lib/libzutil/zutil_pool.c
new file mode 100644
index 000000000..734650f3c
--- /dev/null
+++ b/lib/libzutil/zutil_pool.c
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/nvpair.h>
+#include <sys/fs/zfs.h>
+
+#include <libzutil.h>
+
+/*
+ * Print one row of the DDT table to stdout.
+ *
+ * dds - statistics for the row; nothing is printed when it is NULL or
+ *       records zero blocks
+ * h   - histogram bucket index, shown as the reference count 2^h, or -1
+ *       for the "Total" row
+ */
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+	char label[6];
+	char a_blocks[6], a_lsize[6], a_psize[6], a_dsize[6];
+	char r_blocks[6], r_lsize[6], r_psize[6], r_dsize[6];
+
+	if (dds == NULL)
+		return;
+	if (dds->dds_blocks == 0)
+		return;
+
+	/* First column: the bucket's reference count or the total tag. */
+	if (h != -1)
+		zfs_nicenum(1ULL << h, label, sizeof (label));
+	else
+		(void) strcpy(label, "Total");
+
+	/* Allocated columns. */
+	zfs_nicenum(dds->dds_blocks, a_blocks, sizeof (a_blocks));
+	zfs_nicebytes(dds->dds_lsize, a_lsize, sizeof (a_lsize));
+	zfs_nicebytes(dds->dds_psize, a_psize, sizeof (a_psize));
+	zfs_nicebytes(dds->dds_dsize, a_dsize, sizeof (a_dsize));
+
+	/* Referenced columns. */
+	zfs_nicenum(dds->dds_ref_blocks, r_blocks, sizeof (r_blocks));
+	zfs_nicebytes(dds->dds_ref_lsize, r_lsize, sizeof (r_lsize));
+	zfs_nicebytes(dds->dds_ref_psize, r_psize, sizeof (r_psize));
+	zfs_nicebytes(dds->dds_ref_dsize, r_dsize, sizeof (r_dsize));
+
+	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+	    label,
+	    a_blocks, a_lsize, a_psize, a_dsize,
+	    r_blocks, r_lsize, r_psize, r_dsize);
+}
+
+/*
+ * Write the DDT histogram to stdout: a blank line, the table header, one
+ * row for each of the 64 histogram buckets that holds any blocks, the
+ * totals row taken from 'dds_total', and a closing blank line.
+ */
+void
+zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
+{
+	int bucket = 0;
+
+	/* Leading blank line followed by the two banner lines. */
+	(void) printf("\n"
+	    "bucket "
+	    " allocated "
+	    " referenced \n"
+	    "______ "
+	    "______________________________ "
+	    "______________________________\n");
+
+	/* Column titles and the rule beneath them. */
+	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+	    "refcnt",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE");
+	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+	    "------",
+	    "------", "-----", "-----", "-----",
+	    "------", "-----", "-----", "-----");
+
+	while (bucket < 64) {
+		dump_ddt_stat(&ddh->ddh_stat[bucket], bucket);
+		bucket++;
+	}
+
+	dump_ddt_stat(dds_total, -1);
+
+	(void) printf("\n");
+}
+
+/*
+ * Process the buffer of nvlists, unpacking and storing each nvlist record
+ * into 'records'. 'leftover' is set to the number of bytes that weren't
+ * processed as there wasn't a complete record.
+ *
+ * buf        - packed records, each preceded by a 64-bit little endian
+ *              length
+ * bytes_read - number of valid bytes in 'buf'
+ * records    - in/out: array of unpacked records, grown with realloc()
+ * numrecords - in/out: number of entries in '*records'
+ *
+ * Returns 0 on success. Returns ENOMEM if a record cannot be unpacked or
+ * the array cannot be grown; '*leftover' is not written in that case and
+ * records unpacked before the failure stay in '*records'.
+ */
+int
+zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
+    nvlist_t ***records, uint_t *numrecords)
+{
+	uint64_t reclen;
+	nvlist_t *nv;
+	size_t i;
+	void *tmp;
+
+	while (bytes_read > sizeof (reclen)) {
+
+		/* get length of packed record (stored as little endian) */
+		for (i = 0, reclen = 0; i < sizeof (reclen); i++)
+			reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
+
+		/*
+		 * Stop at an incomplete record. The length is compared
+		 * against the bytes remaining after the length field
+		 * rather than computing sizeof (reclen) + reclen, which
+		 * wraps for a corrupt length near UINT64_MAX and would let
+		 * an oversized record through.
+		 */
+		if (reclen > bytes_read - sizeof (reclen))
+			break;
+
+		/* unpack record */
+		if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
+			return (ENOMEM);
+		bytes_read -= sizeof (reclen) + reclen;
+		buf += sizeof (reclen) + reclen;
+
+		/*
+		 * add record to nvlist array; the array is regrown to twice
+		 * the new count each time count + 1 reaches a power of two
+		 */
+		(*numrecords)++;
+		if (ISP2(*numrecords + 1)) {
+			tmp = realloc(*records,
+			    *numrecords * 2 * sizeof (nvlist_t *));
+			if (tmp == NULL) {
+				nvlist_free(nv);
+				(*numrecords)--;
+				return (ENOMEM);
+			}
+			*records = tmp;
+		}
+		(*records)[*numrecords - 1] = nv;
+	}
+
+	*leftover = bytes_read;
+	return (0);
+}