From e89f1295d4faa88bb05a62c8dd5f781657db5955 Mon Sep 17 00:00:00 2001 From: Don Brady <don.brady@delphix.com> Date: Mon, 5 Nov 2018 12:22:33 -0700 Subject: Add libzutil for libzfs or libzpool consumers Adds a libzutil for utility functions that are common to libzfs and libzpool consumers (most of what was in libzfs_import.c). This removes the need for utilities to link against both libzpool and libzfs. Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Don Brady <don.brady@delphix.com> Closes #8050 --- cmd/mount_zfs/mount_zfs.c | 1 + cmd/zdb/Makefile.am | 1 - cmd/zdb/zdb.c | 13 +- cmd/zed/agents/zfs_mod.c | 1 + cmd/zed/zed_disk_event.c | 1 + cmd/zfs/zfs_main.c | 1 + cmd/zhack/Makefile.am | 1 - cmd/zhack/zhack.c | 13 +- cmd/zinject/Makefile.am | 3 +- cmd/zinject/translate.c | 126 +- cmd/zinject/zinject.c | 2 + cmd/zpool/zpool_iter.c | 1 + cmd/zpool/zpool_main.c | 45 +- cmd/zpool/zpool_vdev.c | 1 + cmd/ztest/Makefile.am | 1 - cmd/ztest/ztest.c | 8 +- configure.ac | 1 + include/Makefile.am | 1 + include/libzfs.h | 102 -- include/libzutil.h | 150 +++ include/sys/zfs_context.h | 1 - include/sys/zfs_ioctl.h | 1 + lib/Makefile.am | 2 +- lib/libzfs/Makefile.am | 7 +- lib/libzfs/libzfs_dataset.c | 2 +- lib/libzfs/libzfs_import.c | 2377 ++----------------------------------- lib/libzfs/libzfs_iter.c | 1 + lib/libzfs/libzfs_pool.c | 403 +------ lib/libzfs/libzfs_sendrecv.c | 1 + lib/libzfs/libzfs_status.c | 66 +- lib/libzfs/libzfs_util.c | 335 +----- lib/libzpool/Makefile.am | 5 +- lib/libzpool/util.c | 155 +-- lib/libzutil/Makefile.am | 27 + lib/libzutil/zutil_device_path.c | 625 ++++++++++ lib/libzutil/zutil_import.c | 2389 ++++++++++++++++++++++++++++++++++++++ lib/libzutil/zutil_nicenum.c | 157 +++ lib/libzutil/zutil_pool.c | 145 +++ module/zfs/zio_inject.c | 67 ++ 39 files changed, 3784 insertions(+), 3455 deletions(-) create mode 100644 include/libzutil.h create mode 100644 lib/libzutil/Makefile.am create mode 100644 lib/libzutil/zutil_device_path.c create mode 100644 lib/libzutil/zutil_import.c create mode 100644 lib/libzutil/zutil_nicenum.c create mode 100644 lib/libzutil/zutil_pool.c diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c index d0d65fb94..a9b1e166b 100644 --- a/cmd/mount_zfs/mount_zfs.c +++ b/cmd/mount_zfs/mount_zfs.c @@ -31,6 +31,7 @@ #include <sys/mntent.h> #include <sys/stat.h> #include <libzfs.h> +#include <libzutil.h> #include <locale.h> #include <getopt.h> #include <fcntl.h> diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am index 70b60bfaf..1fa7ec651 100644 --- a/cmd/zdb/Makefile.am +++ b/cmd/zdb/Makefile.am @@ -16,5 +16,4 @@ zdb_SOURCES = \ zdb_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la \ $(top_builddir)/lib/libzpool/libzpool.la diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 737217538..52a671557 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -67,7 +67,9 @@ #include <sys/dsl_crypt.h> #include <sys/dsl_scan.h> #include <zfs_comutil.h> -#include <libzfs.h> + +#include <libnvpair.h> +#include <libzutil.h> #include "zdb.h" @@ -106,7 +108,6 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); uint64_t *zopt_object = NULL; static unsigned zopt_objects = 0; -libzfs_handle_t *g_zfs; uint64_t max_inflight = 1000; static int leaked_objects = 0; static range_tree_t *mos_refd_objs; @@ -5996,10 +5997,6 @@ main(int argc, char **argv) spa_load_verify_dryrun = B_TRUE; kernel_init(FREAD); - if ((g_zfs = libzfs_init()) == 
NULL) { - (void) fprintf(stderr, "%s", libzfs_error_init(errno)); - return (1); - } if (dump_all) verbose = MAX(verbose, 1); @@ -6078,7 +6075,8 @@ main(int argc, char **argv) args.path = searchdirs; args.can_be_active = B_TRUE; - error = zpool_tryimport(g_zfs, target_pool, &cfg, &args); + error = zpool_find_config(NULL, target_pool, &cfg, &args, + &libzpool_config_ops); if (error == 0) { @@ -6228,7 +6226,6 @@ main(int argc, char **argv) dump_debug_buffer(); - libzfs_fini(g_zfs); kernel_fini(); return (error); diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index ca7128c88..db9c4c4b7 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -73,6 +73,7 @@ #include <fcntl.h> #include <libnvpair.h> #include <libzfs.h> +#include <libzutil.h> #include <limits.h> #include <stddef.h> #include <stdlib.h> diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 996b911c5..174d24523 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -21,6 +21,7 @@ #include <libnvpair.h> #include <libudev.h> #include <libzfs.h> +#include <libzutil.h> #include <pthread.h> #include <stdlib.h> #include <string.h> diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index dcfb6e80a..6e0a6d5bc 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -66,6 +66,7 @@ #include <libzfs_core.h> #include <zfs_prop.h> #include <zfs_deleg.h> +#include <libzutil.h> #include <libuutil.h> #ifdef HAVE_IDMAP #include <aclutils.h> diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am index f720e8286..6e3e706ec 100644 --- a/cmd/zhack/Makefile.am +++ b/cmd/zhack/Makefile.am @@ -11,5 +11,4 @@ zhack_SOURCES = \ zhack_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la \ $(top_builddir)/lib/libzpool/libzpool.la diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index 296a7fe75..57e497f62 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -48,12 +48,11 @@ #include <sys/zio_compress.h> #include <sys/zfeature.h> #include <sys/dmu_tx.h> -#include <libzfs.h> +#include <libzutil.h> extern boolean_t zfeature_checks_disable; const char cmdname[] = "zhack"; -libzfs_handle_t *g_zfs; static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; @@ -128,20 +127,17 @@ zhack_import(char *target, boolean_t readonly) int error; kernel_init(readonly ? 
FREAD : (FREAD | FWRITE)); - g_zfs = libzfs_init(); - ASSERT(g_zfs != NULL); dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); g_readonly = readonly; - g_importargs.unique = B_TRUE; g_importargs.can_be_active = readonly; g_pool = strdup(target); - error = zpool_tryimport(g_zfs, target, &config, &g_importargs); + error = zpool_find_config(NULL, target, &config, &g_importargs, + &libzpool_config_ops); if (error) - fatal(NULL, FTAG, "cannot import '%s': %s", target, - libzfs_error_description(g_zfs)); + fatal(NULL, FTAG, "cannot import '%s'", target); props = NULL; if (readonly) { @@ -529,7 +525,6 @@ main(int argc, char **argv) "changes may not be committed to disk\n"); } - libzfs_fini(g_zfs); kernel_fini(); return (rv); diff --git a/cmd/zinject/Makefile.am b/cmd/zinject/Makefile.am index b709a2f5a..ab7f4de12 100644 --- a/cmd/zinject/Makefile.am +++ b/cmd/zinject/Makefile.am @@ -13,5 +13,4 @@ zinject_SOURCES = \ zinject_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la \ - $(top_builddir)/lib/libzpool/libzpool.la + $(top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index 4b3169e88..700961b06 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -25,8 +25,6 @@ #include <libzfs.h> -#include <sys/zfs_context.h> - #include <errno.h> #include <fcntl.h> #include <stdarg.h> @@ -49,9 +47,6 @@ #include "zinject.h" -extern void kernel_init(int); -extern void kernel_fini(void); - static int debug; static void @@ -161,51 +156,32 @@ parse_pathname(const char *inpath, char *dataset, char *relpath, } /* - * Convert from a (dataset, path) pair into a (objset, object) pair. Note that - * we grab the object number from the inode number, since looking this up via - * libzpool is a real pain. + * Convert from a dataset to a objset id. Note that + * we grab the object number from the inode number. */ -/* ARGSUSED */ static int -object_from_path(const char *dataset, const char *path, struct stat64 *statbuf, - zinject_record_t *record) +object_from_path(const char *dataset, uint64_t object, zinject_record_t *record) { - objset_t *os; - int err; - - /* - * Before doing any libzpool operations, call sync() to ensure that the - * on-disk state is consistent with the in-core state. - */ - sync(); + zfs_handle_t *zhp; - err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, B_FALSE, FTAG, &os); - if (err != 0) { - (void) fprintf(stderr, "cannot open dataset '%s': %s\n", - dataset, strerror(err)); + if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) return (-1); - } - record->zi_objset = dmu_objset_id(os); - record->zi_object = statbuf->st_ino; + record->zi_objset = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + record->zi_object = object; - dmu_objset_disown(os, B_FALSE, FTAG); + zfs_close(zhp); return (0); } /* - * Calculate the real range based on the type, level, and range given. + * Intialize the range based on the type, level, and range given. */ static int -calculate_range(const char *dataset, err_type_t type, int level, char *range, +initialize_range(err_type_t type, int level, char *range, zinject_record_t *record) { - objset_t *os = NULL; - dnode_t *dn = NULL; - int err; - int ret = -1; - /* * Determine the numeric range from the string. 
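[Note on the translate.c hunks above: with the libzpool dependency gone, zinject no longer calls sync() and dmu_objset_own() to map a file to an (objset, object) pair; it takes the object number from the file's inode and the objset id from the ZFS_PROP_OBJSETID dataset property via libzfs. A condensed, minimal sketch of that new userland path; the helper name is hypothetical, the calls are the ones used in the hunks above:

    /*
     * Condensed view of the new lookup: object number from the file's
     * inode, objset id from the dataset property -- no libzpool, no
     * sync(), no dmu_objset_own().
     */
    static int
    file_to_objset_object(libzfs_handle_t *hdl, const char *dataset,
        const char *file, uint64_t *objset, uint64_t *object)
    {
            struct stat64 sb;
            zfs_handle_t *zhp;

            if (stat64(file, &sb) != 0)
                    return (-1);
            if ((zhp = zfs_open(hdl, dataset, ZFS_TYPE_DATASET)) == NULL)
                    return (-1);
            *objset = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
            *object = sb.st_ino;
            zfs_close(zhp);
            return (0);
    }
]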
*/ @@ -233,7 +209,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range, (void) fprintf(stderr, "invalid range '%s': must be " "a numeric range of the form 'start[,end]'\n", range); - goto out; + return (-1); } } @@ -253,7 +229,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range, if (range != NULL) { (void) fprintf(stderr, "range cannot be specified when " "type is 'dnode'\n"); - goto out; + return (-1); } record->zi_start = record->zi_object * sizeof (dnode_phys_t); @@ -262,76 +238,9 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range, break; } - /* - * Get the dnode associated with object, so we can calculate the block - * size. - */ - if ((err = dmu_objset_own(dataset, DMU_OST_ANY, - B_TRUE, B_FALSE, FTAG, &os)) != 0) { - (void) fprintf(stderr, "cannot open dataset '%s': %s\n", - dataset, strerror(err)); - goto out; - } - - if (record->zi_object == 0) { - dn = DMU_META_DNODE(os); - } else { - err = dnode_hold(os, record->zi_object, FTAG, &dn); - if (err != 0) { - (void) fprintf(stderr, "failed to hold dnode " - "for object %llu\n", - (u_longlong_t)record->zi_object); - goto out; - } - } - - - ziprintf("data shift: %d\n", (int)dn->dn_datablkshift); - ziprintf(" ind shift: %d\n", (int)dn->dn_indblkshift); - - /* - * Translate range into block IDs. - */ - if (record->zi_start != 0 || record->zi_end != -1ULL) { - record->zi_start >>= dn->dn_datablkshift; - record->zi_end >>= dn->dn_datablkshift; - } - - /* - * Check level, and then translate level 0 blkids into ranges - * appropriate for level of indirection. - */ record->zi_level = level; - if (level > 0) { - ziprintf("level 0 blkid range: [%llu, %llu]\n", - record->zi_start, record->zi_end); - - if (level >= dn->dn_nlevels) { - (void) fprintf(stderr, "level %d exceeds max level " - "of object (%d)\n", level, dn->dn_nlevels - 1); - goto out; - } - - if (record->zi_start != 0 || record->zi_end != 0) { - int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (; level > 0; level--) { - record->zi_start >>= shift; - record->zi_end >>= shift; - } - } - } - - ret = 0; -out: - if (dn) { - if (dn != DMU_META_DNODE(os)) - dnode_rele(dn, FTAG); - } - if (os) - dmu_objset_disown(os, B_FALSE, FTAG); - - return (ret); + return (0); } int @@ -343,8 +252,6 @@ translate_record(err_type_t type, const char *object, const char *range, struct stat64 statbuf; int ret = -1; - kernel_init(FREAD); - debug = (getenv("ZINJECT_DEBUG") != NULL); ziprintf("translating: %s\n", object); @@ -396,16 +303,16 @@ translate_record(err_type_t type, const char *object, const char *range, /* * Convert (dataset, file) into (objset, object) */ - if (object_from_path(dataset, path, &statbuf, record) != 0) + if (object_from_path(dataset, statbuf.st_ino, record) != 0) goto err; ziprintf("raw objset: %llu\n", record->zi_objset); ziprintf("raw object: %llu\n", record->zi_object); /* - * For the given object, calculate the real (type, level, range) + * For the given object, intialize the range in bytes */ - if (calculate_range(dataset, type, level, (char *)range, record) != 0) + if (initialize_range(type, level, (char *)range, record) != 0) goto err; ziprintf(" objset: %llu\n", record->zi_objset); @@ -427,7 +334,6 @@ translate_record(err_type_t type, const char *object, const char *range, ret = 0; err: - kernel_fini(); return (ret); } diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index ee15ff8a4..54740104a 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -561,6 +561,7 @@ 
register_handler(const char *pool, int flags, zinject_record_t *record, if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to add handler: %s\n", + errno == EDOM ? "block level exceeds max level of object" : strerror(errno)); return (1); } @@ -853,6 +854,7 @@ main(int argc, char **argv) break; case 'r': range = optarg; + flags |= ZINJECT_CALC_RANGE; break; case 's': dur_secs = 1; diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 019f0b136..9927a9deb 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -36,6 +36,7 @@ #include <thread_pool.h> #include <libzfs.h> +#include <libzutil.h> #include <sys/zfs_context.h> #include <sys/wait.h> diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 4845956e5..67ec23d47 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -64,6 +64,7 @@ #include <math.h> #include <libzfs.h> +#include <libzutil.h> #include "zpool_util.h" #include "zfs_comutil.h" @@ -2533,6 +2534,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, return (ret); } +typedef struct target_exists_args { + const char *poolname; + uint64_t poolguid; +} target_exists_args_t; + +static int +name_or_guid_exists(zpool_handle_t *zhp, void *data) +{ + target_exists_args_t *args = data; + nvlist_t *config = zpool_get_config(zhp, NULL); + int found = 0; + + if (config == NULL) + return (0); + + if (args->poolname != NULL) { + char *pool_name; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pool_name) == 0); + if (strcmp(pool_name, args->poolname) == 0) + found = 1; + } else { + uint64_t pool_guid; + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0); + if (pool_guid == args->poolguid) + found = 1; + } + zpool_close(zhp); + + return (found); +} /* * zpool checkpoint <pool> * checkpoint --discard <pool> @@ -2685,6 +2720,7 @@ zpool_do_import(int argc, char **argv) boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; + boolean_t pool_exists = B_FALSE; uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; @@ -2892,7 +2928,8 @@ zpool_do_import(int argc, char **argv) /* * User specified a name or guid. Ensure it's unique. 
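[Note on the zpool_main.c hunk above: the uniqueness check that used to be requested through importargs_t.unique is now done in zpool itself with the name_or_guid_exists() callback. This leans on the zpool_iter() contract -- as I read libzfs, it invokes the callback for each imported pool, stops as soon as the callback returns nonzero, and propagates that value, and the callback is responsible for closing the handle (name_or_guid_exists() does exactly that). A trivial illustrative callback:

    static int
    print_pool(zpool_handle_t *zhp, void *unused)
    {
            (void) puts(zpool_get_name(zhp));
            zpool_close(zhp);       /* the callback owns the handle */
            return (0);             /* 0 == keep iterating */
    }

    /* ... */
    (void) zpool_iter(g_zfs, print_pool, NULL);
]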
*/ - idata.unique = B_TRUE; + target_exists_args_t search = {searchname, searchguid}; + pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* @@ -2928,9 +2965,9 @@ zpool_do_import(int argc, char **argv) idata.scan = do_scan; idata.policy = policy; - pools = zpool_search_import(g_zfs, &idata); + pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); - if (pools != NULL && idata.exists && + if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), @@ -2939,7 +2976,7 @@ zpool_do_import(int argc, char **argv) "<pool | id> <newpool>' to give it a new name\n"), "zpool import"); err = 1; - } else if (pools == NULL && idata.exists) { + } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 2d9c1d028..5134553a5 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -69,6 +69,7 @@ #include <fcntl.h> #include <libintl.h> #include <libnvpair.h> +#include <libzutil.h> #include <limits.h> #include <sys/spa.h> #include <scsi/scsi.h> diff --git a/cmd/ztest/Makefile.am b/cmd/ztest/Makefile.am index 489d8b547..55af41680 100644 --- a/cmd/ztest/Makefile.am +++ b/cmd/ztest/Makefile.am @@ -20,7 +20,6 @@ ztest_SOURCES = \ ztest_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libzfs/libzfs.la \ $(top_builddir)/lib/libzpool/libzpool.la ztest_LDADD += -lm diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 14e8a5e27..03c62fc6f 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -128,7 +128,7 @@ #include <sys/fs/zfs.h> #include <zfs_fletcher.h> #include <libnvpair.h> -#include <libzfs.h> +#include <libzutil.h> #include <sys/crypto/icp.h> #ifdef __GLIBC__ #include <execinfo.h> /* for backtrace() */ @@ -7065,7 +7065,6 @@ make_random_props(void) static void ztest_import(ztest_shared_t *zs) { - libzfs_handle_t *hdl; importargs_t args = { 0 }; spa_t *spa; nvlist_t *cfg = NULL; @@ -7080,14 +7079,14 @@ ztest_import(ztest_shared_t *zs) VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(FREAD | FWRITE); - hdl = libzfs_init(); searchdirs[0] = ztest_opts.zo_dir; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_FALSE; - error = zpool_tryimport(hdl, name, &cfg, &args); + error = zpool_find_config(NULL, name, &cfg, &args, + &libzpool_config_ops); if (error) (void) fatal(0, "No pools found\n"); @@ -7097,7 +7096,6 @@ ztest_import(ztest_shared_t *zs) 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); - libzfs_fini(hdl); kernel_fini(); if (!ztest_opts.zo_mmp_test) { diff --git a/configure.ac b/configure.ac index 59c8c5f7f..028cae338 100644 --- a/configure.ac +++ b/configure.ac @@ -93,6 +93,7 @@ AC_CONFIG_FILES([ lib/libefi/Makefile lib/libicp/Makefile lib/libnvpair/Makefile + lib/libzutil/Makefile lib/libtpool/Makefile lib/libunicode/Makefile lib/libuutil/Makefile diff --git a/include/Makefile.am b/include/Makefile.am index 5f13505f2..bac47d98d 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -16,6 +16,7 @@ USER_H = \ $(top_srcdir)/include/libzfs.h \ $(top_srcdir)/include/libzfs_core.h \ $(top_srcdir)/include/libzfs_impl.h \ + $(top_srcdir)/include/libzutil.h \ $(top_srcdir)/include/thread_pool.h EXTRA_DIST = $(COMMON_H) $(USER_H) diff --git a/include/libzfs.h b/include/libzfs.h index 
a8e3c9c40..d34658055 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -54,25 +54,6 @@ extern "C" { #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN -/* - * Default device paths - */ -#define DISK_ROOT "/dev" -#define UDISK_ROOT "/dev/disk" -#define ZVOL_ROOT "/dev/zvol" - -/* - * Default wait time for a device name to be created. - */ -#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ - -#define IMPORT_ORDER_PREFERRED_1 1 -#define IMPORT_ORDER_PREFERRED_2 2 -#define IMPORT_ORDER_SCAN_OFFSET 10 -#define IMPORT_ORDER_DEFAULT 100 -#define DEFAULT_IMPORT_PATH_SIZE 9 -extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE]; - /* * libzfs errors */ @@ -298,15 +279,9 @@ extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); -extern int zpool_label_disk_wait(char *, int); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); extern uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); -int zfs_dev_is_dm(char *dev_name); -int zfs_dev_is_whole_disk(char *dev_name); -char *zfs_get_underlying_path(char *dev_name); -char *zfs_get_enclosure_sysfs_path(char *dev_name); - const char *zpool_get_state_str(zpool_handle_t *); /* @@ -386,7 +361,6 @@ extern zpool_status_t zpool_get_status(zpool_handle_t *, char **, zpool_errata_t *); extern zpool_status_t zpool_import_status(nvlist_t *, char **, zpool_errata_t *); -extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); /* * Statistics and configuration functions. @@ -407,32 +381,6 @@ extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); extern void zpool_print_unsup_feat(nvlist_t *config); -/* - * Search for pools to import - */ - -typedef struct importargs { - char **path; /* a list of paths to search */ - int paths; /* number of paths to search */ - char *poolname; /* name of a pool to find */ - uint64_t guid; /* guid of a pool to find */ - char *cachefile; /* cachefile to use for import */ - int can_be_active : 1; /* can the pool be active? */ - int unique : 1; /* does 'poolname' already exist? */ - int exists : 1; /* set on return if pool already exists */ - int scan : 1; /* prefer scanning to libblkid cache */ - nvlist_t *policy; /* load policy (max txg, rewind, etc.) 
*/ -} importargs_t; - -extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); -extern int zpool_tryimport(libzfs_handle_t *hdl, char *target, - nvlist_t **configp, importargs_t *args); - -/* legacy pool search routines */ -extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); -extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, - char *, uint64_t); - /* * Miscellaneous pool functions */ @@ -451,8 +399,6 @@ extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, int name_flags); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); -extern int zpool_history_unpack(char *, uint64_t, uint64_t *, - nvlist_t ***, uint_t *); extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, int); extern int zpool_events_clear(libzfs_handle_t *, int *); @@ -780,10 +726,6 @@ extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); extern boolean_t zfs_bookmark_exists(const char *path); -extern int zfs_append_partition(char *path, size_t max_len); -extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); -extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk); -extern int zfs_path_order(char *path, int *order); /* * Mount support functions. @@ -819,33 +761,6 @@ extern int zfs_unshareall(zfs_handle_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); -/* - * Formats for iostat numbers. Examples: "12K", "30ms", "4B", "2321234", "-". - * - * ZFS_NICENUM_1024: Print kilo, mega, tera, peta, exa.. - * ZFS_NICENUM_BYTES: Print single bytes ("13B"), kilo, mega, tera... - * ZFS_NICENUM_TIME: Print nanosecs, microsecs, millisecs, seconds... - * ZFS_NICENUM_RAW: Print the raw number without any formatting - * ZFS_NICENUM_RAWTIME: Same as RAW, but print dashes ('-') for zero. - */ -enum zfs_nicenum_format { - ZFS_NICENUM_1024 = 0, - ZFS_NICENUM_BYTES = 1, - ZFS_NICENUM_TIME = 2, - ZFS_NICENUM_RAW = 3, - ZFS_NICENUM_RAWTIME = 4 -}; - -/* - * Utility function to convert a number to a human-readable form. - */ -extern void zfs_nicenum(uint64_t, char *, size_t); -extern void zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, - enum zfs_nicenum_format type); - - -extern void zfs_nicetime(uint64_t, char *, size_t); -extern void zfs_nicebytes(uint64_t, char *, size_t); extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* @@ -874,7 +789,6 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, /* * Label manipulation. 
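[Note: the zfs_nicenum*() formatters and the zfs_nicenum_format enum removed from libzfs.h here reappear unchanged in the new libzutil.h below, so existing callers only need the new include. A minimal usage sketch; the exact output string is what I'd expect from the 1024-based format, not something this patch states:

    #include <libzutil.h>

    char buf[32];

    zfs_nicenum_format(12288, buf, sizeof (buf), ZFS_NICENUM_1024);
    /* buf now holds a human-readable form such as "12K" */
]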
*/ -extern int zpool_read_label(int, nvlist_t **, int *); extern int zpool_clear_label(int); /* @@ -893,22 +807,6 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); -/* - * Support for Linux libudev derived persistent device strings - */ -extern boolean_t is_mpath_whole_disk(const char *); -extern void update_vdev_config_dev_strs(nvlist_t *); -extern char *zfs_strip_partition(char *); -extern char *zfs_strip_partition_path(char *); - -#ifdef HAVE_LIBUDEV -struct udev_device; - -extern boolean_t udev_is_mpath(struct udev_device *dev); -extern int zfs_device_get_devid(struct udev_device *, char *, size_t); -extern int zfs_device_get_physical(struct udev_device *, char *, size_t); -#endif - extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *); #ifdef __cplusplus diff --git a/include/libzutil.h b/include/libzutil.h new file mode 100644 index 000000000..39fc5554b --- /dev/null +++ b/include/libzutil.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _LIBZUTIL_H +#define _LIBZUTIL_H + +#include <sys/nvpair.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Default wait time for a device name to be created. + */ +#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ + + +/* + * Pool Config Operations + * + * These are specific to the library libzfs or libzpool instance. + */ +typedef nvlist_t *refresh_config_func_t(void *, nvlist_t *); + +typedef int pool_active_func_t(void *, const char *, uint64_t, boolean_t *); + +typedef const struct pool_config_ops { + refresh_config_func_t *pco_refresh_config; + pool_active_func_t *pco_pool_active; +} pool_config_ops_t; + +/* + * An instance of pool_config_ops_t is expected in the caller's binary. + */ +extern const pool_config_ops_t libzfs_config_ops; +extern const pool_config_ops_t libzpool_config_ops; + +typedef struct importargs { + char **path; /* a list of paths to search */ + int paths; /* number of paths to search */ + const char *poolname; /* name of a pool to find */ + uint64_t guid; /* guid of a pool to find */ + const char *cachefile; /* cachefile to use for import */ + boolean_t can_be_active; /* can the pool be active? */ + boolean_t scan; /* prefer scanning to libblkid cache */ + nvlist_t *policy; /* load policy (max txg, rewind, etc.) 
*/ +} importargs_t; + +extern nvlist_t *zpool_search_import(void *, importargs_t *, + const pool_config_ops_t *); +extern int zpool_find_config(void *, const char *, nvlist_t **, importargs_t *, + const pool_config_ops_t *); + +extern const char * const * zpool_default_search_paths(size_t *count); +extern int zpool_read_label(int, nvlist_t **, int *); +extern int zpool_label_disk_wait(const char *, int); + +struct udev_device; + +extern int zfs_device_get_devid(struct udev_device *, char *, size_t); +extern int zfs_device_get_physical(struct udev_device *, char *, size_t); +extern void update_vdev_config_dev_strs(nvlist_t *); + +/* + * Default device paths + */ +#define DISK_ROOT "/dev" +#define UDISK_ROOT "/dev/disk" +#define ZVOL_ROOT "/dev/zvol" + +extern int zfs_append_partition(char *path, size_t max_len); +extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); + +extern char *zfs_strip_partition(char *); +extern char *zfs_strip_partition_path(char *); + +extern int zfs_strcmp_pathname(const char *, const char *, int); + +extern int zfs_dev_is_dm(const char *); +extern int zfs_dev_is_whole_disk(const char *); +extern char *zfs_get_underlying_path(const char *); +extern char *zfs_get_enclosure_sysfs_path(const char *); + +#ifdef HAVE_LIBUDEV +extern boolean_t is_mpath_whole_disk(const char *); +#else +#define is_mpath_whole_disk(path) (B_FALSE); +#endif + +/* + * Formats for iostat numbers. Examples: "12K", "30ms", "4B", "2321234", "-". + * + * ZFS_NICENUM_1024: Print kilo, mega, tera, peta, exa.. + * ZFS_NICENUM_BYTES: Print single bytes ("13B"), kilo, mega, tera... + * ZFS_NICENUM_TIME: Print nanosecs, microsecs, millisecs, seconds... + * ZFS_NICENUM_RAW: Print the raw number without any formatting + * ZFS_NICENUM_RAWTIME: Same as RAW, but print dashes ('-') for zero. + */ +enum zfs_nicenum_format { + ZFS_NICENUM_1024 = 0, + ZFS_NICENUM_BYTES = 1, + ZFS_NICENUM_TIME = 2, + ZFS_NICENUM_RAW = 3, + ZFS_NICENUM_RAWTIME = 4 +}; + +/* + * Convert a number to a human-readable form. 
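[Note: the zdb, zhack, and ztest hunks earlier in this patch all converge on the same calling pattern for the new zpool_find_config() entry point declared above. A condensed sketch; the pool name and search path are placeholders, and it assumes kernel_init(FREAD) has already run, as in those hunks:

    importargs_t args = { 0 };
    nvlist_t *cfg = NULL;
    char *searchdirs[] = { "/dev" };        /* placeholder path */

    args.path = searchdirs;
    args.paths = 1;
    args.can_be_active = B_TRUE;

    /*
     * The first argument is the opaque handle passed back to the
     * pool_config_ops_t callbacks; the libzpool callers above all
     * pass NULL.
     */
    if (zpool_find_config(NULL, "tank", &cfg, &args,
        &libzpool_config_ops) != 0)
            return (1);     /* pool not found */
]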
+ */ +extern void zfs_nicebytes(uint64_t, char *, size_t); +extern void zfs_nicenum(uint64_t, char *, size_t); +extern void zfs_nicenum_format(uint64_t, char *, size_t, + enum zfs_nicenum_format); +extern void zfs_nicetime(uint64_t, char *, size_t); + +#define nicenum(num, buf, size) zfs_nicenum(num, buf, size) + +extern void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); +extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, + uint_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZUTIL_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 11c048c23..3637cc617 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -663,7 +663,6 @@ extern void random_init(void); extern void random_fini(void); struct spa; -extern void nicenum(uint64_t num, char *buf, size_t); extern void show_pool_stats(struct spa *); extern int set_global_var(char *arg); diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index b63ceffac..a552fad37 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -372,6 +372,7 @@ typedef struct zinject_record { #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +#define ZINJECT_CALC_RANGE 0x8 #define ZEVENT_NONE 0x0 #define ZEVENT_NONBLOCK 0x1 diff --git a/lib/Makefile.am b/lib/Makefile.am index e1833b842..8dff773df 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1,6 +1,6 @@ # NB: GNU Automake Manual, Chapter 8.3.5: Libtool Convenience Libraries # These six libraries are intermediary build components. -SUBDIRS = libavl libefi libicp libshare libspl libtpool libunicode +SUBDIRS = libavl libefi libicp libshare libspl libtpool libzutil libunicode # These four libraries, which are installed as the final build product, # incorporate the six convenience libraries given above. diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index da40c96ce..34933e627 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -55,14 +55,13 @@ nodist_libzfs_la_SOURCES = \ $(KERNEL_C) libzfs_la_LIBADD = \ - $(top_builddir)/lib/libefi/libefi.la \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libshare/libshare.la \ - $(top_builddir)/lib/libtpool/libtpool.la \ $(top_builddir)/lib/libuutil/libuutil.la \ - $(top_builddir)/lib/libzfs_core/libzfs_core.la + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(top_builddir)/lib/libzutil/libzutil.la -libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBSSL) +libzfs_la_LIBADD += -lm $(LIBSSL) libzfs_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = $(libzfs_pc_DATA) $(USER_C) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index c84ed5bda..e79a936f9 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -36,7 +36,6 @@ #include <ctype.h> #include <errno.h> #include <libintl.h> -#include <math.h> #include <stdio.h> #include <stdlib.h> #include <strings.h> @@ -61,6 +60,7 @@ #include <sys/zap.h> #include <sys/dsl_crypt.h> #include <libzfs.h> +#include <libzutil.h> #include "zfs_namecheck.h" #include "zfs_prop.h" diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 634c076b8..44a46d1ce 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -26,2374 +26,149 @@ * Copyright (c) 2016, Intel Corporation. */ -/* - * Pool import support functions. - * - * To import a pool, we rely on reading the configuration information from the - * ZFS label of each device. 
If we successfully read the label, then we - * organize the configuration information in the following hierarchy: - * - * pool guid -> toplevel vdev guid -> label txg - * - * Duplicate entries matching this same tuple will be discarded. Once we have - * examined every device, we pick the best label txg config for each toplevel - * vdev. We then arrange these toplevel vdevs into a complete pool config, and - * update any paths that have changed. Finally, we attempt to import the pool - * using our derived config, and record the results. - */ - -#include <ctype.h> #include <devid.h> -#include <dirent.h> #include <errno.h> #include <libintl.h> #include <libgen.h> -#ifdef HAVE_LIBUDEV -#include <libudev.h> -#include <sched.h> -#endif #include <stddef.h> #include <stdlib.h> #include <string.h> #include <sys/stat.h> #include <unistd.h> -#include <fcntl.h> -#include <sys/vtoc.h> -#include <sys/dktp/fdisk.h> -#include <sys/efi_partition.h> -#include <thread_pool.h> #include <sys/vdev_impl.h> -#include <blkid/blkid.h> -#include "libzfs.h" -#include "libzfs_impl.h" #include <libzfs.h> +#include <libzfs_impl.h> +#include <libzutil.h> /* - * Intermediate structures used to gather configuration information. - */ -typedef struct config_entry { - uint64_t ce_txg; - nvlist_t *ce_config; - struct config_entry *ce_next; -} config_entry_t; - -typedef struct vdev_entry { - uint64_t ve_guid; - config_entry_t *ve_configs; - struct vdev_entry *ve_next; -} vdev_entry_t; - -typedef struct pool_entry { - uint64_t pe_guid; - vdev_entry_t *pe_vdevs; - struct pool_entry *pe_next; -} pool_entry_t; - -typedef struct name_entry { - char *ne_name; - uint64_t ne_guid; - uint64_t ne_order; - uint64_t ne_num_labels; - struct name_entry *ne_next; -} name_entry_t; - -typedef struct pool_list { - pool_entry_t *pools; - name_entry_t *names; -} pool_list_t; - -#define DEV_BYID_PATH "/dev/disk/by-id/" - -/* - * Linux persistent device strings for vdev labels - * - * based on libudev for consistency with libudev disk add/remove events - */ -#ifdef HAVE_LIBUDEV - -typedef struct vdev_dev_strs { - char vds_devid[128]; - char vds_devphys[128]; -} vdev_dev_strs_t; - -/* - * Obtain the persistent device id string (describes what) - * - * used by ZED vdev matching for auto-{online,expand,replace} + * Returns true if the named pool matches the given GUID. 
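[Note: pool_active() and refresh_config() below survive as the libzfs-side halves of libzfs_config_ops, wired up further down in this file. Any other consumer can supply its own pair; a hypothetical, do-nothing example of the shape libzutil expects:

    static nvlist_t *
    my_refresh_config(void *handle, nvlist_t *tryconfig)
    {
            /* hypothetical: hand back an unmodified copy */
            return (fnvlist_dup(tryconfig));
    }

    static int
    my_pool_active(void *handle, const char *name, uint64_t guid,
        boolean_t *isactive)
    {
            *isactive = B_FALSE;    /* hypothetical: never active */
            return (0);
    }

    static const pool_config_ops_t my_config_ops = {
            .pco_refresh_config = my_refresh_config,
            .pco_pool_active = my_pool_active,
    };
]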
*/ -int -zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +static int +pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid, + boolean_t *isactive) { - struct udev_list_entry *entry; - const char *bus; - char devbyid[MAXPATHLEN]; - - /* The bus based by-id path is preferred */ - bus = udev_device_get_property_value(dev, "ID_BUS"); - - if (bus == NULL) { - const char *dm_uuid; - - /* - * For multipath nodes use the persistent uuid based identifier - * - * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f - */ - dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); - if (dm_uuid != NULL) { - (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); - return (0); - } - - /* - * For volumes use the persistent /dev/zvol/dataset identifier - */ - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - const char *name; - - name = udev_list_entry_get_name(entry); - if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { - (void) strlcpy(bufptr, name, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); - } + zpool_handle_t *zhp; + uint64_t theguid; - /* - * NVME 'by-id' symlinks are similar to bus case - */ - struct udev_device *parent; + if (zpool_open_silent(hdl, name, &zhp) != 0) + return (-1); - parent = udev_device_get_parent_with_subsystem_devtype(dev, - "nvme", NULL); - if (parent != NULL) - bus = "nvme"; /* continue with bus symlink search */ - else - return (ENODATA); + if (zhp == NULL) { + *isactive = B_FALSE; + return (0); } - /* - * locate the bus specific by-id link - */ - (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - const char *name; + verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID, + &theguid) == 0); - name = udev_list_entry_get_name(entry); - if (strncmp(name, devbyid, strlen(devbyid)) == 0) { - name += strlen(DEV_BYID_PATH); - (void) strlcpy(bufptr, name, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); - } + zpool_close(zhp); - return (ENODATA); + *isactive = (theguid == guid); + return (0); } -/* - * Obtain the persistent physical location string (describes where) - * - * used by ZED vdev matching for auto-{online,expand,replace} - */ -int -zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +static nvlist_t * +refresh_config(libzfs_handle_t *hdl, nvlist_t *config) { - const char *physpath = NULL; - struct udev_list_entry *entry; + nvlist_t *nvl; + zfs_cmd_t zc = {"\0"}; + int err, dstbuf_size; - /* - * Normal disks use ID_PATH for their physical path. - */ - physpath = udev_device_get_property_value(dev, "ID_PATH"); - if (physpath != NULL && strlen(physpath) > 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } + if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) + return (NULL); - /* - * Device mapper devices are virtual and don't have a physical - * path. For them we use ID_VDEV instead, which is setup via the - * /etc/vdev_id.conf file. ID_VDEV provides a persistent path - * to a virtual device. If you don't have vdev_id.conf setup, - * you cannot use multipath autoreplace with device mapper. 
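[Note: these udev helpers are removed here but re-exported through libzutil.h (see the zfs_device_get_devid()/zfs_device_get_physical() declarations above), so ZED keeps using them unchanged. A self-contained sketch of resolving a block device's persistent devid; the helper name and sysname are illustrative:

    #include <errno.h>
    #include <libudev.h>
    #include <libzutil.h>

    static int
    devid_for_sysname(const char *sysname, char *buf, size_t len)
    {
            struct udev *udev;
            struct udev_device *dev;
            int err = ENXIO;

            if ((udev = udev_new()) == NULL)
                    return (ENXIO);
            dev = udev_device_new_from_subsystem_sysname(udev,
                "block", sysname);      /* e.g. "sda" */
            if (dev != NULL) {
                    err = zfs_device_get_devid(dev, buf, len);
                    udev_device_unref(dev);
            }
            udev_unref(udev);
            return (err);
    }
]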
- */ - physpath = udev_device_get_property_value(dev, "ID_VDEV"); - if (physpath != NULL && strlen(physpath) > 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } + dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4); - /* - * For ZFS volumes use the persistent /dev/zvol/dataset identifier - */ - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - physpath = udev_list_entry_get_name(entry); - if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); - } - entry = udev_list_entry_get_next(entry); + if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); } - /* - * For all other devices fallback to using the by-uuid name. - */ - entry = udev_device_get_devlinks_list_entry(dev); - while (entry != NULL) { - physpath = udev_list_entry_get_name(entry); - if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { - (void) strlcpy(bufptr, physpath, buflen); - return (0); + while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT, + &zc)) != 0 && errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); } - entry = udev_list_entry_get_next(entry); } - return (ENODATA); -} - -boolean_t -udev_is_mpath(struct udev_device *dev) -{ - return udev_device_get_property_value(dev, "DM_UUID") && - udev_device_get_property_value(dev, "MPATH_SBIN_PATH"); -} - -/* - * A disk is considered a multipath whole disk when: - * DEVNAME key value has "dm-" - * DM_NAME key value has "mpath" prefix - * DM_UUID key exists - * ID_PART_TABLE_TYPE key does not exist or is not gpt - */ -static boolean_t -udev_mpath_whole_disk(struct udev_device *dev) -{ - const char *devname, *type, *uuid; - - devname = udev_device_get_property_value(dev, "DEVNAME"); - type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); - uuid = udev_device_get_property_value(dev, "DM_UUID"); - - if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && - ((type == NULL) || (strcmp(type, "gpt") != 0)) && - (uuid != NULL)) { - return (B_TRUE); + if (err) { + zcmd_free_nvlists(&zc); + return (NULL); } - return (B_FALSE); -} - -/* - * Check if a disk is effectively a multipath whole disk - */ -boolean_t -is_mpath_whole_disk(const char *path) -{ - struct udev *udev; - struct udev_device *dev = NULL; - char nodepath[MAXPATHLEN]; - char *sysname; - boolean_t wholedisk = B_FALSE; - - if (realpath(path, nodepath) == NULL) - return (B_FALSE); - sysname = strrchr(nodepath, '/') + 1; - if (strncmp(sysname, "dm-", 3) != 0) - return (B_FALSE); - if ((udev = udev_new()) == NULL) - return (B_FALSE); - if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", - sysname)) == NULL) { - udev_device_unref(dev); - return (B_FALSE); + if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); } - wholedisk = udev_mpath_whole_disk(dev); - - udev_device_unref(dev); - return (wholedisk); -} - -static int -udev_device_is_ready(struct udev_device *dev) -{ -#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED - return (udev_device_get_is_initialized(dev)); -#else - /* wait for DEVLINKS property to be initialized */ - return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); -#endif + zcmd_free_nvlists(&zc); + return (nvl); } -/* - * Wait up to timeout_ms for udev to set up the device node. 
The device is - * considered ready when libudev determines it has been initialized, all of - * the device links have been verified to exist, and it has been allowed to - * settle. At this point the device the device can be accessed reliably. - * Depending on the complexity of the udev rules this process could take - * several seconds. - */ -int -zpool_label_disk_wait(char *path, int timeout_ms) +static nvlist_t * +refresh_config_libzfs(void *handle, nvlist_t *tryconfig) { - struct udev *udev; - struct udev_device *dev = NULL; - char nodepath[MAXPATHLEN]; - char *sysname = NULL; - int ret = ENODEV; - int settle_ms = 50; - long sleep_ms = 10; - hrtime_t start, settle; - - if ((udev = udev_new()) == NULL) - return (ENXIO); - - start = gethrtime(); - settle = 0; - - do { - if (sysname == NULL) { - if (realpath(path, nodepath) != NULL) { - sysname = strrchr(nodepath, '/') + 1; - } else { - (void) usleep(sleep_ms * MILLISEC); - continue; - } - } - - dev = udev_device_new_from_subsystem_sysname(udev, - "block", sysname); - if ((dev != NULL) && udev_device_is_ready(dev)) { - struct udev_list_entry *links, *link = NULL; - - ret = 0; - links = udev_device_get_devlinks_list_entry(dev); - - udev_list_entry_foreach(link, links) { - struct stat64 statbuf; - const char *name; - - name = udev_list_entry_get_name(link); - errno = 0; - if (stat64(name, &statbuf) == 0 && errno == 0) - continue; - - settle = 0; - ret = ENODEV; - break; - } - - if (ret == 0) { - if (settle == 0) { - settle = gethrtime(); - } else if (NSEC2MSEC(gethrtime() - settle) >= - settle_ms) { - udev_device_unref(dev); - break; - } - } - } - - udev_device_unref(dev); - (void) usleep(sleep_ms * MILLISEC); - - } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); - - udev_unref(udev); - - return (ret); + return (refresh_config((libzfs_handle_t *)handle, tryconfig)); } -/* - * Encode the persistent devices strings - * used for the vdev disk label - */ static int -encode_device_strings(const char *path, vdev_dev_strs_t *ds, - boolean_t wholedisk) +pool_active_libzfs(void *handle, const char *name, uint64_t guid, + boolean_t *isactive) { - struct udev *udev; - struct udev_device *dev = NULL; - char nodepath[MAXPATHLEN]; - char *sysname; - int ret = ENODEV; - hrtime_t start; - - if ((udev = udev_new()) == NULL) - return (ENXIO); - - /* resolve path to a runtime device node instance */ - if (realpath(path, nodepath) == NULL) - goto no_dev; - - sysname = strrchr(nodepath, '/') + 1; - - /* - * Wait up to 3 seconds for udev to set up the device node context - */ - start = gethrtime(); - do { - dev = udev_device_new_from_subsystem_sysname(udev, "block", - sysname); - if (dev == NULL) - goto no_dev; - if (udev_device_is_ready(dev)) - break; /* udev ready */ - - udev_device_unref(dev); - dev = NULL; - - if (NSEC2MSEC(gethrtime() - start) < 10) - (void) sched_yield(); /* yield/busy wait up to 10ms */ - else - (void) usleep(10 * MILLISEC); - - } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); - - if (dev == NULL) - goto no_dev; - - /* - * Only whole disks require extra device strings - */ - if (!wholedisk && !udev_mpath_whole_disk(dev)) - goto no_dev; - - ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); - if (ret != 0) - goto no_dev_ref; - - /* physical location string (optional) */ - if (zfs_device_get_physical(dev, ds->vds_devphys, - sizeof (ds->vds_devphys)) != 0) { - ds->vds_devphys[0] = '\0'; /* empty string --> not available */ - } - -no_dev_ref: - udev_device_unref(dev); -no_dev: - udev_unref(udev); - - return 
(ret); + return (pool_active((libzfs_handle_t *)handle, name, guid, isactive)); } +const pool_config_ops_t libzfs_config_ops = { + .pco_refresh_config = refresh_config_libzfs, + .pco_pool_active = pool_active_libzfs, +}; + /* - * Update a leaf vdev's persistent device strings (Linux only) - * - * - only applies for a dedicated leaf vdev (aka whole disk) - * - updated during pool create|add|attach|import - * - used for matching device matching during auto-{online,expand,replace} - * - stored in a leaf disk config label (i.e. alongside 'path' NVP) - * - these strings are currently not used in kernel (i.e. for vdev_disk_open) - * - * single device node example: - * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' - * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' - * - * multipath device node example: - * devid: 'dm-uuid-mpath-35000c5006304de3f' - * - * We also store the enclosure sysfs path for turning on enclosure LEDs - * (if applicable): - * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + * Return the offset of the given label. */ -void -update_vdev_config_dev_strs(nvlist_t *nv) -{ - vdev_dev_strs_t vds; - char *env, *type, *path; - uint64_t wholedisk = 0; - char *upath, *spath; - - /* - * For the benefit of legacy ZFS implementations, allow - * for opting out of devid strings in the vdev label. - * - * example use: - * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer - * - * explanation: - * Older ZFS on Linux implementations had issues when attempting to - * display pool config VDEV names if a "devid" NVP value is present - * in the pool's config. - * - * For example, a pool that originated on illumos platform would - * have a devid value in the config and "zpool status" would fail - * when listing the config. - * - * A pool can be stripped of any "devid" values on import or - * prevented from adding them on zpool create|add by setting - * ZFS_VDEV_DEVID_OPT_OUT. 
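[Note on label_offset()/zpool_clear_label() above: label_offset() encodes the usual ZFS label layout -- VDEV_LABELS is 4, with two labels at the front of the device and two at the back. Assuming the customary 256 KB vdev_label_t, a device whose size aligns down to S carries labels at:

    /*
     *   l=0 -> 0
     *   l=1 -> 256K
     *   l=2 -> S - 512K
     *   l=3 -> S - 256K
     *
     * zpool_clear_label() zeroes exactly these four regions, one
     * sizeof (vdev_label_t) pwrite64() per label.
     */
]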
- */ - env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); - if (env && (strtoul(env, NULL, 0) > 0 || - !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); - return; - } - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || - strcmp(type, VDEV_TYPE_DISK) != 0) { - return; - } - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) - return; - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); - - /* - * Update device string values in config nvlist - */ - if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { - (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); - if (vds.vds_devphys[0] != '\0') { - (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, - vds.vds_devphys); - } - - /* Add enclosure sysfs path (if disk is in an enclosure) */ - upath = zfs_get_underlying_path(path); - spath = zfs_get_enclosure_sysfs_path(upath); - if (spath) - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, - spath); - else - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); - - free(upath); - free(spath); - } else { - /* clear out any stale entries */ - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); - } -} -#else - -boolean_t -is_mpath_whole_disk(const char *path) +static uint64_t +label_offset(uint64_t size, int l) { - return (B_FALSE); + ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); } /* - * Wait up to timeout_ms for udev to set up the device node. The device is - * considered ready when the provided path have been verified to exist and - * it has been allowed to settle. At this point the device the device can - * be accessed reliably. Depending on the complexity of the udev rules thisi - * process could take several seconds. + * Given a file descriptor, clear (zero) the label information. This function + * is used in the appliance stack as part of the ZFS sysevent module and + * to implement the "zpool labelclear" command. */ int -zpool_label_disk_wait(char *path, int timeout_ms) +zpool_clear_label(int fd) { - int settle_ms = 50; - long sleep_ms = 10; - hrtime_t start, settle; struct stat64 statbuf; + int l; + vdev_label_t *label; + uint64_t size; - start = gethrtime(); - settle = 0; - - do { - errno = 0; - if ((stat64(path, &statbuf) == 0) && (errno == 0)) { - if (settle == 0) - settle = gethrtime(); - else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) - return (0); - } else if (errno != ENOENT) { - return (errno); - } - - usleep(sleep_ms * MILLISEC); - } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); - - return (ENODEV); -} - -void -update_vdev_config_dev_strs(nvlist_t *nv) -{ -} - -#endif /* HAVE_LIBUDEV */ - -/* - * Go through and fix up any path and/or devid information for the given vdev - * configuration. - */ -static int -fix_paths(nvlist_t *nv, name_entry_t *names) -{ - nvlist_t **child; - uint_t c, children; - uint64_t guid; - name_entry_t *ne, *best; - char *path; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (fix_paths(child[c], names) != 0) - return (-1); - return (0); - } - - /* - * This is a leaf (file or disk) vdev. 
In either case, go through - * the name list and see if we find a matching guid. If so, replace - * the path and see if we can calculate a new devid. - * - * There may be multiple names associated with a particular guid, in - * which case we have overlapping partitions or multiple paths to the - * same disk. In this case we prefer to use the path name which - * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we - * use the lowest order device which corresponds to the first match - * while traversing the ZPOOL_IMPORT_PATH search path. - */ - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) - path = NULL; - - best = NULL; - for (ne = names; ne != NULL; ne = ne->ne_next) { - if (ne->ne_guid == guid) { - if (path == NULL) { - best = ne; - break; - } - - if ((strlen(path) == strlen(ne->ne_name)) && - strncmp(path, ne->ne_name, strlen(path)) == 0) { - best = ne; - break; - } - - if (best == NULL) { - best = ne; - continue; - } - - /* Prefer paths with move vdev labels. */ - if (ne->ne_num_labels > best->ne_num_labels) { - best = ne; - continue; - } - - /* Prefer paths earlier in the search order. */ - if (ne->ne_num_labels == best->ne_num_labels && - ne->ne_order < best->ne_order) { - best = ne; - continue; - } - } - } - - if (best == NULL) + if (fstat64_blk(fd, &statbuf) == -1) return (0); + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) + if ((label = calloc(1, sizeof (vdev_label_t))) == NULL) return (-1); - /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */ - update_vdev_config_dev_strs(nv); - - return (0); -} - -/* - * Add the given configuration to the list of known devices. - */ -static int -add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path, - int order, int num_labels, nvlist_t *config) -{ - uint64_t pool_guid, vdev_guid, top_guid, txg, state; - pool_entry_t *pe; - vdev_entry_t *ve; - config_entry_t *ce; - name_entry_t *ne; - - /* - * If this is a hot spare not currently in use or level 2 cache - * device, add it to the list of names to translate, but don't do - * anything else. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &state) == 0 && - (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && - nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { - if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) - return (-1); - - if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { - free(ne); - return (-1); - } - ne->ne_guid = vdev_guid; - ne->ne_order = order; - ne->ne_num_labels = num_labels; - ne->ne_next = pl->names; - pl->names = ne; - - return (0); - } - - /* - * If we have a valid config but cannot read any of these fields, then - * it means we have a half-initialized label. In vdev_label_init() - * we write a label with txg == 0 so that we can identify the device - * in case the user refers to the same disk later on. If we fail to - * create the pool, we'll be left with a label in this state - * which should not be considered part of a valid pool. 
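[Note: the fix_paths() selection order being carried over to libzutil is, concretely: an exact ZPOOL_CONFIG_PATH match wins outright, then the entry with the most vdev labels, then the lowest search order. A hypothetical tie-break for one guid seen under two names:

    /*
     *   name                             labels  order
     *   /dev/sdb1                           2      3
     *   /dev/disk/by-id/ata-X-part1         4      1
     *
     * With no exact path match, the by-id name wins on label count
     * (4 > 2) before search order is even consulted.
     */
]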
- */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pool_guid) != 0 || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, - &vdev_guid) != 0 || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, - &top_guid) != 0 || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0) { - return (0); - } - - /* - * First, see if we know about this pool. If not, then add it to the - * list of known pools. - */ - for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { - if (pe->pe_guid == pool_guid) - break; - } - - if (pe == NULL) { - if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) { + for (l = 0; l < VDEV_LABELS; l++) { + if (pwrite64(fd, label, sizeof (vdev_label_t), + label_offset(size, l)) != sizeof (vdev_label_t)) { + free(label); return (-1); } - pe->pe_guid = pool_guid; - pe->pe_next = pl->pools; - pl->pools = pe; } - /* - * Second, see if we know about this toplevel vdev. Add it if its - * missing. - */ - for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { - if (ve->ve_guid == top_guid) - break; - } - - if (ve == NULL) { - if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { - return (-1); - } - ve->ve_guid = top_guid; - ve->ve_next = pe->pe_vdevs; - pe->pe_vdevs = ve; - } - - /* - * Third, see if we have a config with a matching transaction group. If - * so, then we do nothing. Otherwise, add it to the list of known - * configs. - */ - for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { - if (ce->ce_txg == txg) - break; - } - - if (ce == NULL) { - if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) { - return (-1); - } - ce->ce_txg = txg; - ce->ce_config = fnvlist_dup(config); - ce->ce_next = ve->ve_configs; - ve->ve_configs = ce; - } - - /* - * At this point we've successfully added our config to the list of - * known configs. The last thing to do is add the vdev guid -> path - * mappings so that we can fix up the configuration as necessary before - * doing the import. - */ - if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) - return (-1); - - if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { - free(ne); - return (-1); - } - - ne->ne_guid = vdev_guid; - ne->ne_order = order; - ne->ne_num_labels = num_labels; - ne->ne_next = pl->names; - pl->names = ne; - - return (0); -} - -/* - * Returns true if the named pool matches the given GUID. 
- */ -static int -pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid, - boolean_t *isactive) -{ - zpool_handle_t *zhp; - uint64_t theguid; - - if (zpool_open_silent(hdl, name, &zhp) != 0) - return (-1); - - if (zhp == NULL) { - *isactive = B_FALSE; - return (0); - } - - verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID, - &theguid) == 0); - - zpool_close(zhp); - - *isactive = (theguid == guid); - return (0); -} - -static nvlist_t * -refresh_config(libzfs_handle_t *hdl, nvlist_t *config) -{ - nvlist_t *nvl; - zfs_cmd_t zc = {"\0"}; - int err, dstbuf_size; - - if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) - return (NULL); - - dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4); - - if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) { - zcmd_free_nvlists(&zc); - return (NULL); - } - - while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT, - &zc)) != 0 && errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (NULL); - } - } - - if (err) { - zcmd_free_nvlists(&zc); - return (NULL); - } - - if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) { - zcmd_free_nvlists(&zc); - return (NULL); - } - - zcmd_free_nvlists(&zc); - return (nvl); -} - -/* - * Determine if the vdev id is a hole in the namespace. - */ -boolean_t -vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) -{ - int c; - - for (c = 0; c < holes; c++) { - - /* Top-level is a hole */ - if (hole_array[c] == id) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Convert our list of pools into the definitive set of configurations. We - * start by picking the best config for each toplevel vdev. Once that's done, - * we assemble the toplevel vdevs into a full config for the pool. We make a - * pass to fix up any incorrect paths, and then add it to the main list to - * return to the user. - */ -static nvlist_t * -get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, - nvlist_t *policy) -{ - pool_entry_t *pe; - vdev_entry_t *ve; - config_entry_t *ce; - nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; - nvlist_t **spares, **l2cache; - uint_t i, nspares, nl2cache; - boolean_t config_seen; - uint64_t best_txg; - char *name, *hostname = NULL; - uint64_t guid; - uint_t children = 0; - nvlist_t **child = NULL; - uint_t holes; - uint64_t *hole_array, max_id; - uint_t c; - boolean_t isactive; - uint64_t hostid; - nvlist_t *nvl; - boolean_t valid_top_config = B_FALSE; - - if (nvlist_alloc(&ret, 0, 0) != 0) - goto nomem; - - for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { - uint64_t id, max_txg = 0; - - if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) - goto nomem; - config_seen = B_FALSE; - - /* - * Iterate over all toplevel vdevs. Grab the pool configuration - * from the first one we find, and then go through the rest and - * add them as necessary to the 'vdevs' member of the config. - */ - for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { - - /* - * Determine the best configuration for this vdev by - * selecting the config with the latest transaction - * group. - */ - best_txg = 0; - for (ce = ve->ve_configs; ce != NULL; - ce = ce->ce_next) { - - if (ce->ce_txg > best_txg) { - tmp = ce->ce_config; - best_txg = ce->ce_txg; - } - } - - /* - * We rely on the fact that the max txg for the - * pool will contain the most up-to-date information - * about the valid top-levels in the vdev namespace. 
- */ - if (best_txg > max_txg) { - (void) nvlist_remove(config, - ZPOOL_CONFIG_VDEV_CHILDREN, - DATA_TYPE_UINT64); - (void) nvlist_remove(config, - ZPOOL_CONFIG_HOLE_ARRAY, - DATA_TYPE_UINT64_ARRAY); - - max_txg = best_txg; - hole_array = NULL; - holes = 0; - max_id = 0; - valid_top_config = B_FALSE; - - if (nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { - verify(nvlist_add_uint64(config, - ZPOOL_CONFIG_VDEV_CHILDREN, - max_id) == 0); - valid_top_config = B_TRUE; - } - - if (nvlist_lookup_uint64_array(tmp, - ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, - &holes) == 0) { - verify(nvlist_add_uint64_array(config, - ZPOOL_CONFIG_HOLE_ARRAY, - hole_array, holes) == 0); - } - } - - if (!config_seen) { - /* - * Copy the relevant pieces of data to the pool - * configuration: - * - * version - * pool guid - * name - * comment (if available) - * pool state - * hostid (if available) - * hostname (if available) - */ - uint64_t state, version; - char *comment = NULL; - - version = fnvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_VERSION); - fnvlist_add_uint64(config, - ZPOOL_CONFIG_VERSION, version); - guid = fnvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_GUID); - fnvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_GUID, guid); - name = fnvlist_lookup_string(tmp, - ZPOOL_CONFIG_POOL_NAME); - fnvlist_add_string(config, - ZPOOL_CONFIG_POOL_NAME, name); - - if (nvlist_lookup_string(tmp, - ZPOOL_CONFIG_COMMENT, &comment) == 0) - fnvlist_add_string(config, - ZPOOL_CONFIG_COMMENT, comment); - - state = fnvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_STATE); - fnvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_STATE, state); - - hostid = 0; - if (nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - fnvlist_add_uint64(config, - ZPOOL_CONFIG_HOSTID, hostid); - hostname = fnvlist_lookup_string(tmp, - ZPOOL_CONFIG_HOSTNAME); - fnvlist_add_string(config, - ZPOOL_CONFIG_HOSTNAME, hostname); - } - - config_seen = B_TRUE; - } - - /* - * Add this top-level vdev to the child array. - */ - verify(nvlist_lookup_nvlist(tmp, - ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); - verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, - &id) == 0); - - if (id >= children) { - nvlist_t **newchild; - - newchild = zfs_alloc(hdl, (id + 1) * - sizeof (nvlist_t *)); - if (newchild == NULL) - goto nomem; - - for (c = 0; c < children; c++) - newchild[c] = child[c]; - - free(child); - child = newchild; - children = id + 1; - } - if (nvlist_dup(nvtop, &child[id], 0) != 0) - goto nomem; - - } - - /* - * If we have information about all the top-levels then - * clean up the nvlist which we've constructed. This - * means removing any extraneous devices that are - * beyond the valid range or adding devices to the end - * of our array which appear to be missing. - */ - if (valid_top_config) { - if (max_id < children) { - for (c = max_id; c < children; c++) - nvlist_free(child[c]); - children = max_id; - } else if (max_id > children) { - nvlist_t **newchild; - - newchild = zfs_alloc(hdl, (max_id) * - sizeof (nvlist_t *)); - if (newchild == NULL) - goto nomem; - - for (c = 0; c < children; c++) - newchild[c] = child[c]; - - free(child); - child = newchild; - children = max_id; - } - } - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &guid) == 0); - - /* - * The vdev namespace may contain holes as a result of - * device removal. We must add them back into the vdev - * tree before we process any missing devices. 
- */ - if (holes > 0) { - ASSERT(valid_top_config); - - for (c = 0; c < children; c++) { - nvlist_t *holey; - - if (child[c] != NULL || - !vdev_is_hole(hole_array, holes, c)) - continue; - - if (nvlist_alloc(&holey, NV_UNIQUE_NAME, - 0) != 0) - goto nomem; - - /* - * Holes in the namespace are treated as - * "hole" top-level vdevs and have a - * special flag set on them. - */ - if (nvlist_add_string(holey, - ZPOOL_CONFIG_TYPE, - VDEV_TYPE_HOLE) != 0 || - nvlist_add_uint64(holey, - ZPOOL_CONFIG_ID, c) != 0 || - nvlist_add_uint64(holey, - ZPOOL_CONFIG_GUID, 0ULL) != 0) { - nvlist_free(holey); - goto nomem; - } - child[c] = holey; - } - } - - /* - * Look for any missing top-level vdevs. If this is the case, - * create a faked up 'missing' vdev as a placeholder. We cannot - * simply compress the child array, because the kernel performs - * certain checks to make sure the vdev IDs match their location - * in the configuration. - */ - for (c = 0; c < children; c++) { - if (child[c] == NULL) { - nvlist_t *missing; - if (nvlist_alloc(&missing, NV_UNIQUE_NAME, - 0) != 0) - goto nomem; - if (nvlist_add_string(missing, - ZPOOL_CONFIG_TYPE, - VDEV_TYPE_MISSING) != 0 || - nvlist_add_uint64(missing, - ZPOOL_CONFIG_ID, c) != 0 || - nvlist_add_uint64(missing, - ZPOOL_CONFIG_GUID, 0ULL) != 0) { - nvlist_free(missing); - goto nomem; - } - child[c] = missing; - } - } - - /* - * Put all of this pool's top-level vdevs into a root vdev. - */ - if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) - goto nomem; - if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) != 0 || - nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || - nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || - nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - child, children) != 0) { - nvlist_free(nvroot); - goto nomem; - } - - for (c = 0; c < children; c++) - nvlist_free(child[c]); - free(child); - children = 0; - child = NULL; - - /* - * Go through and fix up any paths and/or devids based on our - * known list of vdev GUID -> path mappings. - */ - if (fix_paths(nvroot, pl->names) != 0) { - nvlist_free(nvroot); - goto nomem; - } - - /* - * Add the root vdev to this pool's configuration. - */ - if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - nvroot) != 0) { - nvlist_free(nvroot); - goto nomem; - } - nvlist_free(nvroot); - - /* - * zdb uses this path to report on active pools that were - * imported or created using -R. - */ - if (active_ok) - goto add_pool; - - /* - * Determine if this pool is currently active, in which case we - * can't actually import it. - */ - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &guid) == 0); - - if (pool_active(hdl, name, guid, &isactive) != 0) - goto error; - - if (isactive) { - nvlist_free(config); - config = NULL; - continue; - } - - if (policy != NULL) { - if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, - policy) != 0) - goto nomem; - } - - if ((nvl = refresh_config(hdl, config)) == NULL) { - nvlist_free(config); - config = NULL; - continue; - } - - nvlist_free(config); - config = nvl; - - /* - * Go through and update the paths for spares, now that we have - * them. 
- */ - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - if (fix_paths(spares[i], pl->names) != 0) - goto nomem; - } - } - - /* - * Update the paths for l2cache devices. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - for (i = 0; i < nl2cache; i++) { - if (fix_paths(l2cache[i], pl->names) != 0) - goto nomem; - } - } - - /* - * Restore the original information read from the actual label. - */ - (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, - DATA_TYPE_UINT64); - (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, - DATA_TYPE_STRING); - if (hostid != 0) { - verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, - hostid) == 0); - verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, - hostname) == 0); - } - -add_pool: - /* - * Add this pool to the list of configs. - */ - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - if (nvlist_add_nvlist(ret, name, config) != 0) - goto nomem; - - nvlist_free(config); - config = NULL; - } - - return (ret); - -nomem: - (void) no_memory(hdl); -error: - nvlist_free(config); - nvlist_free(ret); - for (c = 0; c < children; c++) - nvlist_free(child[c]); - free(child); - - return (NULL); -} - -/* - * Return the offset of the given label. - */ -static uint64_t -label_offset(uint64_t size, int l) -{ - ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); - return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? - 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); -} - -/* - * Given a file descriptor, read the label information and return an nvlist - * describing the configuration, if there is one. The number of valid - * labels found will be returned in num_labels when non-NULL. 
- */ -int -zpool_read_label(int fd, nvlist_t **config, int *num_labels) -{ - struct stat64 statbuf; - int l, count = 0; - vdev_label_t *label; - nvlist_t *expected_config = NULL; - uint64_t expected_guid = 0, size; - int error; - - *config = NULL; - - if (fstat64_blk(fd, &statbuf) == -1) - return (0); - size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - - error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label)); - if (error) - return (-1); - - for (l = 0; l < VDEV_LABELS; l++) { - uint64_t state, guid, txg; - - if (pread64(fd, label, sizeof (vdev_label_t), - label_offset(size, l)) != sizeof (vdev_label_t)) - continue; - - if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, - sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) - continue; - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, - &guid) != 0 || guid == 0) { - nvlist_free(*config); - continue; - } - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(*config); - continue; - } - - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0)) { - nvlist_free(*config); - continue; - } - - if (expected_guid) { - if (expected_guid == guid) - count++; - - nvlist_free(*config); - } else { - expected_config = *config; - expected_guid = guid; - count++; - } - } - - if (num_labels != NULL) - *num_labels = count; - - free(label); - *config = expected_config; - - return (0); -} - -typedef struct rdsk_node { - char *rn_name; /* Full path to device */ - int rn_order; /* Preferred order (low to high) */ - int rn_num_labels; /* Number of valid labels */ - uint64_t rn_vdev_guid; /* Expected vdev guid when set */ - libzfs_handle_t *rn_hdl; - nvlist_t *rn_config; /* Label config */ - avl_tree_t *rn_avl; - avl_node_t rn_node; - pthread_mutex_t *rn_lock; - boolean_t rn_labelpaths; -} rdsk_node_t; - -/* - * Sorted by vdev guid and full path to allow for multiple entries with - * the same full path name. This is required because it's possible to - * have multiple block devices with labels that refer to the same - * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both - * entries need to be added to the cache. Scenarios where this can occur - * include overwritten pool labels, devices which are visible from multiple - * hosts and multipath devices. 
- */
-static int
-slice_cache_compare(const void *arg1, const void *arg2)
-{
-	const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
-	const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
-	uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
-	uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
-	int rv;
-
-	rv = AVL_CMP(guid1, guid2);
-	if (rv)
-		return (rv);
-
-	return (AVL_ISIGN(strcmp(nm1, nm2)));
-}
-
-static boolean_t
-is_watchdog_dev(char *dev)
-{
-	/* For 'watchdog' dev */
-	if (strcmp(dev, "watchdog") == 0)
-		return (B_TRUE);
-
-	/* For 'watchdog<digit><whatever>' */
-	if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
-		return (B_TRUE);
-
-	return (B_FALSE);
-}
-
-static int
-label_paths_impl(libzfs_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
-    uint64_t vdev_guid, char **path, char **devid)
-{
-	nvlist_t **child;
-	uint_t c, children;
-	uint64_t guid;
-	char *val;
-	int error;
-
-	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children) == 0) {
-		for (c = 0; c < children; c++) {
-			error = label_paths_impl(hdl, child[c],
-			    pool_guid, vdev_guid, path, devid);
-			if (error)
-				return (error);
-		}
-		return (0);
-	}
-
-	if (nvroot == NULL)
-		return (0);
-
-	error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
-	if ((error != 0) || (guid != vdev_guid))
-		return (0);
-
-	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
-	if (error == 0)
-		*path = val;
-
-	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
-	if (error == 0)
-		*devid = val;
-
-	return (0);
-}
-
-/*
- * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
- * and store these strings as config_path and devid_path respectively.
- * The returned pointers are only valid as long as label remains valid.
- */
-static int
-label_paths(libzfs_handle_t *hdl, nvlist_t *label, char **path, char **devid)
-{
-	nvlist_t *nvroot;
-	uint64_t pool_guid;
-	uint64_t vdev_guid;
-
-	*path = NULL;
-	*devid = NULL;
-
-	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
-	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
-	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid))
-		return (ENOENT);
-
-	return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
-	    devid));
-}
-
-static void
-zpool_open_func(void *arg)
-{
-	rdsk_node_t *rn = arg;
-	libzfs_handle_t *hdl = rn->rn_hdl;
-	struct stat64 statbuf;
-	nvlist_t *config;
-	char *bname, *dupname;
-	uint64_t vdev_guid = 0;
-	int error;
-	int num_labels;
-	int fd;
-
-	/*
-	 * Skip devices with well-known prefixes; there can be side effects
-	 * when opening devices which need to be avoided.
-	 *
-	 * hpet     - High Precision Event Timer
-	 * watchdog - Watchdog must be closed in a special way.
-	 */
-	dupname = zfs_strdup(hdl, rn->rn_name);
-	bname = basename(dupname);
-	error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
-	free(dupname);
-	if (error)
-		return;
-
-	/*
-	 * Ignore failed stats.  We only want regular files and block devices.
-	 */
-	if (stat64(rn->rn_name, &statbuf) != 0 ||
-	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
-		return;
-
-	/*
-	 * Preferentially open using O_DIRECT to bypass the block device
-	 * cache which may be stale for multipath devices.  An EINVAL errno
-	 * indicates O_DIRECT is unsupported so fall back to just O_RDONLY.
- */ - fd = open(rn->rn_name, O_RDONLY | O_DIRECT); - if ((fd < 0) && (errno == EINVAL)) - fd = open(rn->rn_name, O_RDONLY); - - if (fd < 0) - return; - - /* - * This file is too small to hold a zpool - */ - if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { - (void) close(fd); - return; - } - - error = zpool_read_label(fd, &config, &num_labels); - if (error != 0) { - (void) close(fd); - return; - } - - if (num_labels == 0) { - (void) close(fd); - nvlist_free(config); - return; - } - - /* - * Check that the vdev is for the expected guid. Additional entries - * are speculatively added based on the paths stored in the labels. - * Entries with valid paths but incorrect guids must be removed. - */ - error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); - if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { - (void) close(fd); - nvlist_free(config); - return; - } - - (void) close(fd); - - rn->rn_config = config; - rn->rn_num_labels = num_labels; - - /* - * Add additional entries for paths described by this label. - */ - if (rn->rn_labelpaths) { - char *path = NULL; - char *devid = NULL; - rdsk_node_t *slice; - avl_index_t where; - int error; - - if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) - return; - - /* - * Allow devlinks to stabilize so all paths are available. - */ - zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT); - - if (path != NULL) { - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - slice->rn_name = zfs_strdup(hdl, path); - slice->rn_vdev_guid = vdev_guid; - slice->rn_avl = rn->rn_avl; - slice->rn_hdl = hdl; - slice->rn_order = IMPORT_ORDER_PREFERRED_1; - slice->rn_labelpaths = B_FALSE; - pthread_mutex_lock(rn->rn_lock); - if (avl_find(rn->rn_avl, slice, &where)) { - pthread_mutex_unlock(rn->rn_lock); - free(slice->rn_name); - free(slice); - } else { - avl_insert(rn->rn_avl, slice, where); - pthread_mutex_unlock(rn->rn_lock); - zpool_open_func(slice); - } - } - - if (devid != NULL) { - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - error = asprintf(&slice->rn_name, "%s%s", - DEV_BYID_PATH, devid); - if (error == -1) { - free(slice); - return; - } - - slice->rn_vdev_guid = vdev_guid; - slice->rn_avl = rn->rn_avl; - slice->rn_hdl = hdl; - slice->rn_order = IMPORT_ORDER_PREFERRED_2; - slice->rn_labelpaths = B_FALSE; - pthread_mutex_lock(rn->rn_lock); - if (avl_find(rn->rn_avl, slice, &where)) { - pthread_mutex_unlock(rn->rn_lock); - free(slice->rn_name); - free(slice); - } else { - avl_insert(rn->rn_avl, slice, where); - pthread_mutex_unlock(rn->rn_lock); - zpool_open_func(slice); - } - } - } -} - -/* - * Given a file descriptor, clear (zero) the label information. This function - * is used in the appliance stack as part of the ZFS sysevent module and - * to implement the "zpool labelclear" command. 
- */
-int
-zpool_clear_label(int fd)
-{
-	struct stat64 statbuf;
-	int l;
-	vdev_label_t *label;
-	uint64_t size;
-
-	if (fstat64_blk(fd, &statbuf) == -1)
-		return (0);
-	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
-
-	if ((label = calloc(1, sizeof (vdev_label_t))) == NULL)
-		return (-1);
-
-	for (l = 0; l < VDEV_LABELS; l++) {
-		if (pwrite64(fd, label, sizeof (vdev_label_t),
-		    label_offset(size, l)) != sizeof (vdev_label_t)) {
-			free(label);
-			return (-1);
-		}
-	}
-
-	free(label);
-	return (0);
-}
-
-static void
-zpool_find_import_scan_add_slice(libzfs_handle_t *hdl, pthread_mutex_t *lock,
-    avl_tree_t *cache, char *path, const char *name, int order)
-{
-	avl_index_t where;
-	rdsk_node_t *slice;
-
-	slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
-	if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
-		free(slice);
-		return;
-	}
-	slice->rn_vdev_guid = 0;
-	slice->rn_lock = lock;
-	slice->rn_avl = cache;
-	slice->rn_hdl = hdl;
-	slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
-	slice->rn_labelpaths = B_FALSE;
-
-	pthread_mutex_lock(lock);
-	if (avl_find(cache, slice, &where)) {
-		free(slice->rn_name);
-		free(slice);
-	} else {
-		avl_insert(cache, slice, where);
-	}
-	pthread_mutex_unlock(lock);
-}
-
-static int
-zpool_find_import_scan_dir(libzfs_handle_t *hdl, pthread_mutex_t *lock,
-    avl_tree_t *cache, char *dir, int order)
-{
-	int error;
-	char path[MAXPATHLEN];
-	struct dirent64 *dp;
-	DIR *dirp;
-
-	if (realpath(dir, path) == NULL) {
-		error = errno;
-		if (error == ENOENT)
-			return (0);
-
-		zfs_error_aux(hdl, strerror(error));
-		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
-		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
-		return (error);
-	}
-
-	dirp = opendir(path);
-	if (dirp == NULL) {
-		error = errno;
-		zfs_error_aux(hdl, strerror(error));
-		(void) zfs_error_fmt(hdl, EZFS_BADPATH,
-		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
-		return (error);
-	}
-
-	while ((dp = readdir64(dirp)) != NULL) {
-		const char *name = dp->d_name;
-		if (name[0] == '.' &&
-		    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
-			continue;
-
-		zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
-		    order);
-	}
-
-	(void) closedir(dirp);
-	return (0);
-}
-
-static int
-zpool_find_import_scan_path(libzfs_handle_t *hdl, pthread_mutex_t *lock,
-    avl_tree_t *cache, char *dir, int order)
-{
-	int error = 0;
-	char path[MAXPATHLEN];
-	char *d, *b;
-	char *dpath, *name;
-
-	/*
-	 * Separate the directory part and last part of the
-	 * path.  We do this so that we can get the realpath of
-	 * the directory.  We don't get the realpath on the
-	 * whole path because if it's a symlink, we want the
-	 * path of the symlink not where it points to.
-	 */
-	d = zfs_strdup(hdl, dir);
-	b = zfs_strdup(hdl, dir);
-	dpath = dirname(d);
-	name = basename(b);
-
-	if (realpath(dpath, path) == NULL) {
-		error = errno;
-		if (error == ENOENT) {
-			error = 0;
-			goto out;
-		}
-
-		zfs_error_aux(hdl, strerror(error));
-		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
-		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
-		goto out;
-	}
-
-	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
-
-out:
-	free(b);
-	free(d);
-	return (error);
-}
-
-/*
- * Scan a list of directories for zfs devices.
- */
-static int
-zpool_find_import_scan(libzfs_handle_t *hdl, pthread_mutex_t *lock,
-    avl_tree_t **slice_cache, char **dir, int dirs)
-{
-	avl_tree_t *cache;
-	rdsk_node_t *slice;
-	void *cookie;
-	int i, error;
-
-	*slice_cache = NULL;
-	cache = zfs_alloc(hdl, sizeof (avl_tree_t));
-	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
-	    offsetof(rdsk_node_t, rn_node));
-
-	for (i = 0; i < dirs; i++) {
-		struct stat sbuf;
-
-		if (stat(dir[i], &sbuf) != 0) {
-			error = errno;
-			if (error == ENOENT)
-				continue;
-
-			zfs_error_aux(hdl, strerror(error));
-			(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
-			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
-			goto error;
-		}
-
-		/*
-		 * If dir[i] is a directory, we walk through it and add all
-		 * the entries to the cache.  If it's not a directory, we just
-		 * add it to the cache.
-		 */
-		if (S_ISDIR(sbuf.st_mode)) {
-			if ((error = zpool_find_import_scan_dir(hdl, lock,
-			    cache, dir[i], i)) != 0)
-				goto error;
-		} else {
-			if ((error = zpool_find_import_scan_path(hdl, lock,
-			    cache, dir[i], i)) != 0)
-				goto error;
-		}
-	}
-
-	*slice_cache = cache;
-	return (0);
-
-error:
-	cookie = NULL;
-	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
-		free(slice->rn_name);
-		free(slice);
-	}
-	free(cache);
-
-	return (error);
-}
-
-/*
- * Use libblkid to quickly enumerate all known zfs devices.
- */
-static int
-zpool_find_import_blkid(libzfs_handle_t *hdl, pthread_mutex_t *lock,
-    avl_tree_t **slice_cache)
-{
-	rdsk_node_t *slice;
-	blkid_cache cache;
-	blkid_dev_iterate iter;
-	blkid_dev dev;
-	avl_index_t where;
-	int error;
-
-	*slice_cache = NULL;
-
-	error = blkid_get_cache(&cache, NULL);
-	if (error != 0)
-		return (error);
-
-	error = blkid_probe_all_new(cache);
-	if (error != 0) {
-		blkid_put_cache(cache);
-		return (error);
-	}
-
-	iter = blkid_dev_iterate_begin(cache);
-	if (iter == NULL) {
-		blkid_put_cache(cache);
-		return (EINVAL);
-	}
-
-	error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
-	if (error != 0) {
-		blkid_dev_iterate_end(iter);
-		blkid_put_cache(cache);
-		return (error);
-	}
-
-	*slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
-	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
-	    offsetof(rdsk_node_t, rn_node));
-
-	while (blkid_dev_next(iter, &dev) == 0) {
-		slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
-		slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
-		slice->rn_vdev_guid = 0;
-		slice->rn_lock = lock;
-		slice->rn_avl = *slice_cache;
-		slice->rn_hdl = hdl;
-		slice->rn_labelpaths = B_TRUE;
-
-		error = zfs_path_order(slice->rn_name, &slice->rn_order);
-		if (error == 0)
-			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
-		else
-			slice->rn_order = IMPORT_ORDER_DEFAULT;
-
-		pthread_mutex_lock(lock);
-		if (avl_find(*slice_cache, slice, &where)) {
-			free(slice->rn_name);
-			free(slice);
-		} else {
-			avl_insert(*slice_cache, slice, where);
-		}
-		pthread_mutex_unlock(lock);
-	}
-
-	blkid_dev_iterate_end(iter);
-	blkid_put_cache(cache);
-
-	return (0);
-}
-
-char *
-zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
-	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
-	"/dev/mapper",		/* Use multipath devices before components */
-	"/dev/disk/by-partlabel", /* Single unique entry set by user */
-	"/dev/disk/by-partuuid", /* Generated partition uuid */
-	"/dev/disk/by-label",	/* Custom persistent labels */
-	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
-	"/dev/disk/by-id",	/* May be multiple entries and persistent */
-	"/dev/disk/by-path",	/* Encodes physical location and persistent */
-	"/dev"			/* UNSAFE device names will change */
-};
-
-/*
- * Given a list of directories to search, find all pools stored on disk.  This
- * includes partial pools which are not available to import.  If no args are
- * given (argc is 0), then the default directory (/dev/dsk) is searched.
- * poolname or guid (but not both) are provided by the caller when trying
- * to import a specific pool.
- */
-static nvlist_t *
-zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
-{
-	nvlist_t *ret = NULL;
-	pool_list_t pools = { 0 };
-	pool_entry_t *pe, *penext;
-	vdev_entry_t *ve, *venext;
-	config_entry_t *ce, *cenext;
-	name_entry_t *ne, *nenext;
-	pthread_mutex_t lock;
-	avl_tree_t *cache;
-	rdsk_node_t *slice;
-	void *cookie;
-	tpool_t *t;
-
-	verify(iarg->poolname == NULL || iarg->guid == 0);
-	pthread_mutex_init(&lock, NULL);
-
-	/*
-	 * Locate pool member vdevs using libblkid or by directory scanning.
-	 * On success a newly allocated AVL tree which is populated with an
-	 * entry for each discovered vdev will be returned as the cache.
-	 * It's the caller's responsibility to consume and destroy this tree.
-	 */
-	if (iarg->scan || iarg->paths != 0) {
-		int dirs = iarg->paths;
-		char **dir = iarg->path;
-
-		if (dirs == 0) {
-			dir = zpool_default_import_path;
-			dirs = DEFAULT_IMPORT_PATH_SIZE;
-		}
-
-		if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0)
-			return (NULL);
-	} else {
-		if (zpool_find_import_blkid(hdl, &lock, &cache) != 0)
-			return (NULL);
-	}
-
-	/*
-	 * Create a thread pool to parallelize the process of reading and
-	 * validating labels; a large number of threads can be used due to
-	 * minimal contention.
-	 */
-	t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
-	for (slice = avl_first(cache); slice;
-	    (slice = avl_walk(cache, slice, AVL_AFTER)))
-		(void) tpool_dispatch(t, zpool_open_func, slice);
-
-	tpool_wait(t);
-	tpool_destroy(t);
-
-	/*
-	 * Process the cache, filtering out any entries which are not
-	 * for the specified pool, then adding matching label configs.
-	 */
-	cookie = NULL;
-	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
-		if (slice->rn_config != NULL) {
-			nvlist_t *config = slice->rn_config;
-			boolean_t matched = B_TRUE;
-			boolean_t aux = B_FALSE;
-			int fd;
-
-			/*
-			 * Check if it's a spare or l2cache device.  If it is,
-			 * we need to skip the name and guid check since they
-			 * don't exist on aux device label.
-			 */
-			if (iarg->poolname != NULL || iarg->guid != 0) {
-				uint64_t state;
-				aux = nvlist_lookup_uint64(config,
-				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
-				    (state == POOL_STATE_SPARE ||
-				    state == POOL_STATE_L2CACHE);
-			}
-
-			if (iarg->poolname != NULL && !aux) {
-				char *pname;
-
-				matched = nvlist_lookup_string(config,
-				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
-				    strcmp(iarg->poolname, pname) == 0;
-			} else if (iarg->guid != 0 && !aux) {
-				uint64_t this_guid;
-
-				matched = nvlist_lookup_uint64(config,
-				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
-				    iarg->guid == this_guid;
-			}
-			if (matched) {
-				/*
-				 * Verify all remaining entries can be opened
-				 * exclusively. This will prune all underlying
-				 * multipath devices which otherwise could
-				 * result in the vdev appearing as UNAVAIL.
-				 *
-				 * Under zdb, this step isn't required and
-				 * would prevent a zdb -e of active pools with
-				 * no cachefile.
- */ - fd = open(slice->rn_name, O_RDONLY | O_EXCL); - if (fd >= 0 || iarg->can_be_active) { - if (fd >= 0) - close(fd); - add_config(hdl, &pools, - slice->rn_name, slice->rn_order, - slice->rn_num_labels, config); - } - } - nvlist_free(config); - } - free(slice->rn_name); - free(slice); - } - avl_destroy(cache); - free(cache); - pthread_mutex_destroy(&lock); - - ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); - - for (pe = pools.pools; pe != NULL; pe = penext) { - penext = pe->pe_next; - for (ve = pe->pe_vdevs; ve != NULL; ve = venext) { - venext = ve->ve_next; - for (ce = ve->ve_configs; ce != NULL; ce = cenext) { - cenext = ce->ce_next; - nvlist_free(ce->ce_config); - free(ce); - } - free(ve); - } - free(pe); - } - - for (ne = pools.names; ne != NULL; ne = nenext) { - nenext = ne->ne_next; - free(ne->ne_name); - free(ne); - } - - return (ret); -} - -nvlist_t * -zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) -{ - importargs_t iarg = { 0 }; - - iarg.paths = argc; - iarg.path = argv; - - return (zpool_find_import_impl(hdl, &iarg)); -} - -/* - * Given a cache file, return the contents as a list of importable pools. - * poolname or guid (but not both) are provided by the caller when trying - * to import a specific pool. - */ -nvlist_t * -zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile, - char *poolname, uint64_t guid) -{ - char *buf; - int fd; - struct stat64 statbuf; - nvlist_t *raw, *src, *dst; - nvlist_t *pools; - nvpair_t *elem; - char *name; - uint64_t this_guid; - boolean_t active; - - verify(poolname == NULL || guid == 0); - - if ((fd = open(cachefile, O_RDONLY)) < 0) { - zfs_error_aux(hdl, "%s", strerror(errno)); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, "failed to open cache file")); - return (NULL); - } - - if (fstat64(fd, &statbuf) != 0) { - zfs_error_aux(hdl, "%s", strerror(errno)); - (void) close(fd); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, "failed to get size of cache file")); - return (NULL); - } - - if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) { - (void) close(fd); - return (NULL); - } - - if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { - (void) close(fd); - free(buf); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, - "failed to read cache file contents")); - return (NULL); - } - - (void) close(fd); - - if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { - free(buf); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, - "invalid or corrupt cache file contents")); - return (NULL); - } - - free(buf); - - /* - * Go through and get the current state of the pools and refresh their - * state. 
- */ - if (nvlist_alloc(&pools, 0, 0) != 0) { - (void) no_memory(hdl); - nvlist_free(raw); - return (NULL); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { - src = fnvpair_value_nvlist(elem); - - name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); - if (poolname != NULL && strcmp(poolname, name) != 0) - continue; - - this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); - if (guid != 0 && guid != this_guid) - continue; - - if (pool_active(hdl, name, this_guid, &active) != 0) { - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - - if (active) - continue; - - if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, - cachefile) != 0) { - (void) no_memory(hdl); - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - - if ((dst = refresh_config(hdl, src)) == NULL) { - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - - if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { - (void) no_memory(hdl); - nvlist_free(dst); - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - nvlist_free(dst); - } - - nvlist_free(raw); - return (pools); -} - -static int -name_or_guid_exists(zpool_handle_t *zhp, void *data) -{ - importargs_t *import = data; - int found = 0; - - if (import->poolname != NULL) { - char *pool_name; - - verify(nvlist_lookup_string(zhp->zpool_config, - ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); - if (strcmp(pool_name, import->poolname) == 0) - found = 1; - } else { - uint64_t pool_guid; - - verify(nvlist_lookup_uint64(zhp->zpool_config, - ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); - if (pool_guid == import->guid) - found = 1; - } - - zpool_close(zhp); - return (found); -} - -nvlist_t * -zpool_search_import(libzfs_handle_t *hdl, importargs_t *import) -{ - verify(import->poolname == NULL || import->guid == 0); - - if (import->unique) - import->exists = zpool_iter(hdl, name_or_guid_exists, import); - - if (import->cachefile != NULL) - return (zpool_find_import_cached(hdl, import->cachefile, - import->poolname, import->guid)); - - return (zpool_find_import_impl(hdl, import)); -} + free(label); + return (0); +} static boolean_t -pool_match(nvlist_t *cfg, char *tgt) -{ - uint64_t v, guid = strtoull(tgt, NULL, 0); - char *s; - - if (guid != 0) { - if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) - return (v == guid); - } else { - if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) - return (strcmp(s, tgt) == 0); - } - return (B_FALSE); -} - -int -zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp, - importargs_t *args) -{ - nvlist_t *pools; - nvlist_t *match = NULL; - nvlist_t *config = NULL; - char *name = NULL, *sepp = NULL; - char sep = '\0'; - int count = 0; - char *targetdup = strdup(target); - - *configp = NULL; - - if ((sepp = strpbrk(targetdup, "/@")) != NULL) { - sep = *sepp; - *sepp = '\0'; - } - - pools = zpool_search_import(hdl, args); - - if (pools != NULL) { - nvpair_t *elem = NULL; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { - VERIFY0(nvpair_value_nvlist(elem, &config)); - if (pool_match(config, targetdup)) { - count++; - if (match != NULL) { - /* multiple matches found */ - continue; - } else { - match = config; - name = nvpair_name(elem); - } - } - } - } - - if (count == 0) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "no pools found")); - free(targetdup); - return (ENOENT); - } - - if (count > 1) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "%d pools found, use pool GUID\n"), count); - free(targetdup); - 
return (EINVAL); - } - - *configp = match; - free(targetdup); - - return (0); -} - -boolean_t find_guid(nvlist_t *nv, uint64_t guid) { uint64_t tmp; diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c index 73dc2c793..b1bdc4a6d 100644 --- a/lib/libzfs/libzfs_iter.c +++ b/lib/libzfs/libzfs_iter.c @@ -33,6 +33,7 @@ #include <stddef.h> #include <libintl.h> #include <libzfs.h> +#include <libzutil.h> #include <sys/mntent.h> #include "libzfs_impl.h" diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index fca1a4178..128c6efe9 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -29,10 +29,8 @@ * Copyright (c) 2017, Intel Corporation. */ -#include <ctype.h> #include <errno.h> #include <devid.h> -#include <fcntl.h> #include <libintl.h> #include <stdio.h> #include <stdlib.h> @@ -47,6 +45,7 @@ #include <sys/zfs_ioctl.h> #include <sys/vdev_disk.h> #include <dlfcn.h> +#include <libzutil.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -3697,80 +3696,6 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) } #endif /* sun */ -/* - * Remove partition suffix from a vdev path. Partition suffixes may take three - * forms: "-partX", "pX", or "X", where X is a string of digits. The second - * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The - * third case only occurs when preceded by a string matching the regular - * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. - * - * caller must free the returned string - */ -char * -zfs_strip_partition(char *path) -{ - char *tmp = strdup(path); - char *part = NULL, *d = NULL; - if (!tmp) - return (NULL); - - if ((part = strstr(tmp, "-part")) && part != tmp) { - d = part + 5; - } else if ((part = strrchr(tmp, 'p')) && - part > tmp + 1 && isdigit(*(part-1))) { - d = part + 1; - } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && - tmp[1] == 'd') { - for (d = &tmp[2]; isalpha(*d); part = ++d) { } - } else if (strncmp("xvd", tmp, 3) == 0) { - for (d = &tmp[3]; isalpha(*d); part = ++d) { } - } - if (part && d && *d != '\0') { - for (; isdigit(*d); d++) { } - if (*d == '\0') - *part = '\0'; - } - - return (tmp); -} - -/* - * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname - * - * path: /dev/sda1 - * returns: /dev/sda - * - * Returned string must be freed. - */ -char * -zfs_strip_partition_path(char *path) -{ - char *newpath = strdup(path); - char *sd_offset; - char *new_sd; - - if (!newpath) - return (NULL); - - /* Point to "sda1" part of "/dev/sda1" */ - sd_offset = strrchr(newpath, '/') + 1; - - /* Get our new name "sda" */ - new_sd = zfs_strip_partition(sd_offset); - if (!new_sd) { - free(newpath); - return (NULL); - } - - /* Paste the "sda" where "sda1" was */ - strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); - - /* Free temporary "sda" */ - free(new_sd); - - return (newpath); -} - #define PATH_BUF_LEN 64 /* @@ -4133,54 +4058,6 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) return (0); } -/* - * Process the buffer of nvlists, unpacking and storing each nvlist record - * into 'records'. 'leftover' is set to the number of bytes that weren't - * processed as there wasn't a complete record. 
- */ -int -zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, - nvlist_t ***records, uint_t *numrecords) -{ - uint64_t reclen; - nvlist_t *nv; - int i; - void *tmp; - - while (bytes_read > sizeof (reclen)) { - - /* get length of packed record (stored as little endian) */ - for (i = 0, reclen = 0; i < sizeof (reclen); i++) - reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i); - - if (bytes_read < sizeof (reclen) + reclen) - break; - - /* unpack record */ - if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0) - return (ENOMEM); - bytes_read -= sizeof (reclen) + reclen; - buf += sizeof (reclen) + reclen; - - /* add record to nvlist array */ - (*numrecords)++; - if (ISP2(*numrecords + 1)) { - tmp = realloc(*records, - *numrecords * 2 * sizeof (nvlist_t *)); - if (tmp == NULL) { - nvlist_free(nv); - (*numrecords)--; - return (ENOMEM); - } - *records = tmp; - } - (*records)[*numrecords - 1] = nv; - } - - *leftover = bytes_read; - return (0); -} - /* * Retrieve the command history of a pool. */ @@ -4669,281 +4546,3 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) return (0); } - -/* - * Allocate and return the underlying device name for a device mapper device. - * If a device mapper device maps to multiple devices, return the first device. - * - * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a - * DM device (like /dev/disk/by-vdev/A0) are also allowed. - * - * Returns device name, or NULL on error or no match. If dm_name is not a DM - * device then return NULL. - * - * NOTE: The returned name string must be *freed*. - */ -char * -dm_get_underlying_path(char *dm_name) -{ - DIR *dp = NULL; - struct dirent *ep; - char *realp; - char *tmp = NULL; - char *path = NULL; - char *dev_str; - int size; - - if (dm_name == NULL) - return (NULL); - - /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ - realp = realpath(dm_name, NULL); - if (realp == NULL) - return (NULL); - - /* - * If they preface 'dev' with a path (like "/dev") then strip it off. - * We just want the 'dm-N' part. - */ - tmp = strrchr(realp, '/'); - if (tmp != NULL) - dev_str = tmp + 1; /* +1 since we want the chr after '/' */ - else - dev_str = tmp; - - size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str); - if (size == -1 || !tmp) - goto end; - - dp = opendir(tmp); - if (dp == NULL) - goto end; - - /* Return first sd* entry in /sys/block/dm-N/slaves/ */ - while ((ep = readdir(dp))) { - if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ - size = asprintf(&path, "/dev/%s", ep->d_name); - break; - } - } - -end: - if (dp != NULL) - closedir(dp); - free(tmp); - free(realp); - return (path); -} - -/* - * Return 1 if device is a device mapper or multipath device. - * Return 0 if not. - */ -int -zfs_dev_is_dm(char *dev_name) -{ - - char *tmp; - tmp = dm_get_underlying_path(dev_name); - if (tmp == NULL) - return (0); - - free(tmp); - return (1); -} - -/* - * By "whole disk" we mean an entire physical disk (something we can - * label, toggle the write cache on, etc.) as opposed to the full - * capacity of a pseudo-device such as lofi or did. We act as if we - * are labeling the disk, which should be a pretty good test of whether - * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if - * it isn't. 
- */ -int -zfs_dev_is_whole_disk(char *dev_name) -{ - struct dk_gpt *label; - int fd; - - if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) - return (0); - - if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { - (void) close(fd); - return (0); - } - - efi_free(label); - (void) close(fd); - - return (1); -} - -/* - * Lookup the underlying device for a device name - * - * Often you'll have a symlink to a device, a partition device, - * or a multipath device, and want to look up the underlying device. - * This function returns the underlying device name. If the device - * name is already the underlying device, then just return the same - * name. If the device is a DM device with multiple underlying devices - * then return the first one. - * - * For example: - * - * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda - * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 - * returns: /dev/sda - * - * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) - * dev_name: /dev/mapper/mpatha - * returns: /dev/sda (first device) - * - * 3. /dev/sda (already the underlying device) - * dev_name: /dev/sda - * returns: /dev/sda - * - * 4. /dev/dm-3 (mapped to /dev/sda) - * dev_name: /dev/dm-3 - * returns: /dev/sda - * - * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 - * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 - * returns: /dev/sdb - * - * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 - * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a - * returns: /dev/sda - * - * Returns underlying device name, or NULL on error or no match. - * - * NOTE: The returned name string must be *freed*. - */ -char * -zfs_get_underlying_path(char *dev_name) -{ - char *name = NULL; - char *tmp; - - if (dev_name == NULL) - return (NULL); - - tmp = dm_get_underlying_path(dev_name); - - /* dev_name not a DM device, so just un-symlinkize it */ - if (tmp == NULL) - tmp = realpath(dev_name, NULL); - - if (tmp != NULL) { - name = zfs_strip_partition_path(tmp); - free(tmp); - } - - return (name); -} - -/* - * Given a dev name like "sda", return the full enclosure sysfs path to - * the disk. You can also pass in the name with "/dev" prepended - * to it (like /dev/sda). - * - * For example, disk "sda" in enclosure slot 1: - * dev: "sda" - * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" - * - * 'dev' must be a non-devicemapper device. - * - * Returned string must be freed. - */ -char * -zfs_get_enclosure_sysfs_path(char *dev_name) -{ - DIR *dp = NULL; - struct dirent *ep; - char buf[MAXPATHLEN]; - char *tmp1 = NULL; - char *tmp2 = NULL; - char *tmp3 = NULL; - char *path = NULL; - size_t size; - int tmpsize; - - if (dev_name == NULL) - return (NULL); - - /* If they preface 'dev' with a path (like "/dev") then strip it off */ - tmp1 = strrchr(dev_name, '/'); - if (tmp1 != NULL) - dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ - - tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); - if (tmpsize == -1 || tmp1 == NULL) { - tmp1 = NULL; - goto end; - } - - dp = opendir(tmp1); - if (dp == NULL) { - tmp1 = NULL; /* To make free() at the end a NOP */ - goto end; - } - - /* - * Look though all sysfs entries in /sys/block/<dev>/device for - * the enclosure symlink. 
- */ - while ((ep = readdir(dp))) { - /* Ignore everything that's not our enclosure_device link */ - if (strstr(ep->d_name, "enclosure_device") == NULL) - continue; - - if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 || - tmp2 == NULL) - break; - - size = readlink(tmp2, buf, sizeof (buf)); - - /* Did readlink fail or crop the link name? */ - if (size == -1 || size >= sizeof (buf)) { - free(tmp2); - tmp2 = NULL; /* To make free() at the end a NOP */ - break; - } - - /* - * We got a valid link. readlink() doesn't terminate strings - * so we have to do it. - */ - buf[size] = '\0'; - - /* - * Our link will look like: - * - * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" - * - * We want to grab the "enclosure/1:0:3:0/SLOT 1" part - */ - tmp3 = strstr(buf, "enclosure"); - if (tmp3 == NULL) - break; - - if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { - /* If asprintf() fails, 'path' is undefined */ - path = NULL; - break; - } - - if (path == NULL) - break; - } - -end: - free(tmp2); - free(tmp1); - - if (dp != NULL) - closedir(dp); - - return (path); -} diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index b5c91ec20..4a620a9da 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -54,6 +54,7 @@ #include <libzfs.h> #include <libzfs_core.h> +#include <libzutil.h> #include "zfs_namecheck.h" #include "zfs_prop.h" diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index 4089f0cc6..e49d79baa 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -42,6 +42,7 @@ */ #include <libzfs.h> +#include <libzutil.h> #include <string.h> #include <unistd.h> #include <sys/systeminfo.h> @@ -425,68 +426,3 @@ zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata) return (ret); } - -static void -dump_ddt_stat(const ddt_stat_t *dds, int h) -{ - char refcnt[6]; - char blocks[6], lsize[6], psize[6], dsize[6]; - char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6]; - - if (dds == NULL || dds->dds_blocks == 0) - return; - - if (h == -1) - (void) strcpy(refcnt, "Total"); - else - zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt)); - - zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks)); - zfs_nicebytes(dds->dds_lsize, lsize, sizeof (lsize)); - zfs_nicebytes(dds->dds_psize, psize, sizeof (psize)); - zfs_nicebytes(dds->dds_dsize, dsize, sizeof (dsize)); - zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks)); - zfs_nicebytes(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize)); - zfs_nicebytes(dds->dds_ref_psize, ref_psize, sizeof (ref_psize)); - zfs_nicebytes(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize)); - - (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", - refcnt, - blocks, lsize, psize, dsize, - ref_blocks, ref_lsize, ref_psize, ref_dsize); -} - -/* - * Print the DDT histogram and the column totals. 
- */ -void -zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) -{ - int h; - - (void) printf("\n"); - - (void) printf("bucket " - " allocated " - " referenced \n"); - (void) printf("______ " - "______________________________ " - "______________________________\n"); - - (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", - "refcnt", - "blocks", "LSIZE", "PSIZE", "DSIZE", - "blocks", "LSIZE", "PSIZE", "DSIZE"); - - (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", - "------", - "------", "-----", "-----", "-----", - "------", "-----", "-----", "-----"); - - for (h = 0; h < 64; h++) - dump_ddt_stat(&ddh->ddh_stat[h], h); - - dump_ddt_stat(dds_total, -1); - - (void) printf("\n"); -} diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 440ed3bc0..9f4fe3b72 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -39,7 +39,6 @@ #include <stdlib.h> #include <strings.h> #include <unistd.h> -#include <ctype.h> #include <math.h> #include <sys/stat.h> #include <sys/mnttab.h> @@ -54,6 +53,7 @@ #include "zfs_prop.h" #include "zfeature_common.h" #include <zfs_fletcher.h> +#include <libzutil.h> int libzfs_errno(libzfs_handle_t *hdl) @@ -677,135 +677,6 @@ zfs_strdup(libzfs_handle_t *hdl, const char *str) return (ret); } -/* - * Convert a number to an appropriately human-readable output. - */ -void -zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, - enum zfs_nicenum_format format) -{ - uint64_t n = num; - int index = 0; - const char *u; - const char *units[3][7] = { - [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"}, - [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"}, - [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"} - }; - - const int units_len[] = {[ZFS_NICENUM_1024] = 6, - [ZFS_NICENUM_BYTES] = 6, - [ZFS_NICENUM_TIME] = 4}; - - const int k_unit[] = { [ZFS_NICENUM_1024] = 1024, - [ZFS_NICENUM_BYTES] = 1024, - [ZFS_NICENUM_TIME] = 1000}; - - double val; - - if (format == ZFS_NICENUM_RAW) { - snprintf(buf, buflen, "%llu", (u_longlong_t)num); - return; - } else if (format == ZFS_NICENUM_RAWTIME && num > 0) { - snprintf(buf, buflen, "%llu", (u_longlong_t)num); - return; - } else if (format == ZFS_NICENUM_RAWTIME && num == 0) { - snprintf(buf, buflen, "%s", "-"); - return; - } - - while (n >= k_unit[format] && index < units_len[format]) { - n /= k_unit[format]; - index++; - } - - u = units[format][index]; - - /* Don't print zero latencies since they're invalid */ - if ((format == ZFS_NICENUM_TIME) && (num == 0)) { - (void) snprintf(buf, buflen, "-"); - } else if ((index == 0) || ((num % - (uint64_t)powl(k_unit[format], index)) == 0)) { - /* - * If this is an even multiple of the base, always display - * without any decimal precision. - */ - (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u); - - } else { - /* - * We want to choose a precision that reflects the best choice - * for fitting in 5 characters. This can get rather tricky when - * we have numbers that are very close to an order of magnitude. - * For example, when displaying 10239 (which is really 9.999K), - * we want only a single place of precision for 10.0K. We could - * develop some complex heuristics for this, but it's much - * easier just to try each combination in turn. - */ - int i; - for (i = 2; i >= 0; i--) { - val = (double)num / - (uint64_t)powl(k_unit[format], index); - - /* - * Don't print floating point values for time. Note, - * we use floor() instead of round() here, since - * round can result in undesirable results. 
For - * example, if "num" is in the range of - * 999500-999999, it will print out "1000us". This - * doesn't happen if we use floor(). - */ - if (format == ZFS_NICENUM_TIME) { - if (snprintf(buf, buflen, "%d%s", - (unsigned int) floor(val), u) <= 5) - break; - - } else { - if (snprintf(buf, buflen, "%.*f%s", i, - val, u) <= 5) - break; - } - } - } -} - -/* - * Convert a number to an appropriately human-readable output. - */ -void -zfs_nicenum(uint64_t num, char *buf, size_t buflen) -{ - zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024); -} - -/* - * Convert a time to an appropriately human-readable output. - * @num: Time in nanoseconds - */ -void -zfs_nicetime(uint64_t num, char *buf, size_t buflen) -{ - zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME); -} - -/* - * Print out a raw number with correct column spacing - */ -void -zfs_niceraw(uint64_t num, char *buf, size_t buflen) -{ - zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW); -} - -/* - * Convert a number of bytes to an appropriately human-readable output. - */ -void -zfs_nicebytes(uint64_t num, char *buf, size_t buflen) -{ - zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_BYTES); -} - void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { @@ -1232,210 +1103,6 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } -/* - * Append partition suffix to an otherwise fully qualified device path. - * This is used to generate the name the full path as its stored in - * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length - * of 'path' will be returned on error a negative value is returned. - */ -int -zfs_append_partition(char *path, size_t max_len) -{ - int len = strlen(path); - - if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || - (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { - if (len + 6 >= max_len) - return (-1); - - (void) strcat(path, "-part1"); - len += 6; - } else { - if (len + 2 >= max_len) - return (-1); - - if (isdigit(path[len-1])) { - (void) strcat(path, "p1"); - len += 2; - } else { - (void) strcat(path, "1"); - len += 1; - } - } - - return (len); -} - -/* - * Given a shorthand device name check if a file by that name exists in any - * of the 'zpool_default_import_path' or ZPOOL_IMPORT_PATH directories. If - * one is found, store its fully qualified path in the 'path' buffer passed - * by the caller and return 0, otherwise return an error. - */ -int -zfs_resolve_shortname(const char *name, char *path, size_t len) -{ - int i, error = -1; - char *dir, *env, *envdup; - - env = getenv("ZPOOL_IMPORT_PATH"); - errno = ENOENT; - - if (env) { - envdup = strdup(env); - dir = strtok(envdup, ":"); - while (dir && error) { - (void) snprintf(path, len, "%s/%s", dir, name); - error = access(path, F_OK); - dir = strtok(NULL, ":"); - } - free(envdup); - } else { - for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE && error < 0; i++) { - (void) snprintf(path, len, "%s/%s", - zpool_default_import_path[i], name); - error = access(path, F_OK); - } - } - - return (error ? ENOENT : 0); -} - -/* - * Given a shorthand device name look for a match against 'cmp_name'. This - * is done by checking all prefix expansions using either the default - * 'zpool_default_import_paths' or the ZPOOL_IMPORT_PATH environment - * variable. Proper partition suffixes will be appended if this is a - * whole disk. When a match is found 0 is returned otherwise ENOENT. 
- */ -static int -zfs_strcmp_shortname(char *name, char *cmp_name, int wholedisk) -{ - int path_len, cmp_len, i = 0, error = ENOENT; - char *dir, *env, *envdup = NULL; - char path_name[MAXPATHLEN]; - - cmp_len = strlen(cmp_name); - env = getenv("ZPOOL_IMPORT_PATH"); - - if (env) { - envdup = strdup(env); - dir = strtok(envdup, ":"); - } else { - dir = zpool_default_import_path[i]; - } - - while (dir) { - /* Trim trailing directory slashes from ZPOOL_IMPORT_PATH */ - while (dir[strlen(dir)-1] == '/') - dir[strlen(dir)-1] = '\0'; - - path_len = snprintf(path_name, MAXPATHLEN, "%s/%s", dir, name); - if (wholedisk) - path_len = zfs_append_partition(path_name, MAXPATHLEN); - - if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) { - error = 0; - break; - } - - if (env) { - dir = strtok(NULL, ":"); - } else if (++i < DEFAULT_IMPORT_PATH_SIZE) { - dir = zpool_default_import_path[i]; - } else { - dir = NULL; - } - } - - if (env) - free(envdup); - - return (error); -} - -/* - * Given either a shorthand or fully qualified path name look for a match - * against 'cmp'. The passed name will be expanded as needed for comparison - * purposes and redundant slashes stripped to ensure an accurate match. - */ -int -zfs_strcmp_pathname(char *name, char *cmp, int wholedisk) -{ - int path_len, cmp_len; - char path_name[MAXPATHLEN]; - char cmp_name[MAXPATHLEN]; - char *dir, *dup; - - /* Strip redundant slashes if one exists due to ZPOOL_IMPORT_PATH */ - memset(cmp_name, 0, MAXPATHLEN); - dup = strdup(cmp); - dir = strtok(dup, "/"); - while (dir) { - strlcat(cmp_name, "/", sizeof (cmp_name)); - strlcat(cmp_name, dir, sizeof (cmp_name)); - dir = strtok(NULL, "/"); - } - free(dup); - - if (name[0] != '/') - return (zfs_strcmp_shortname(name, cmp_name, wholedisk)); - - (void) strlcpy(path_name, name, MAXPATHLEN); - path_len = strlen(path_name); - cmp_len = strlen(cmp_name); - - if (wholedisk) { - path_len = zfs_append_partition(path_name, MAXPATHLEN); - if (path_len == -1) - return (ENOMEM); - } - - if ((path_len != cmp_len) || strcmp(path_name, cmp_name)) - return (ENOENT); - - return (0); -} - -/* - * Given a full path to a device determine if that device appears in the - * import search path. If it does return the first match and store the - * index in the passed 'order' variable, otherwise return an error. - */ -int -zfs_path_order(char *name, int *order) -{ - int i = 0, error = ENOENT; - char *dir, *env, *envdup; - - env = getenv("ZPOOL_IMPORT_PATH"); - if (env) { - envdup = strdup(env); - dir = strtok(envdup, ":"); - while (dir) { - if (strncmp(name, dir, strlen(dir)) == 0) { - *order = i; - error = 0; - break; - } - dir = strtok(NULL, ":"); - i++; - } - free(envdup); - } else { - for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) { - if (strncmp(name, zpool_default_import_path[i], - strlen(zpool_default_import_path[i])) == 0) { - *order = i; - error = 0; - break; - } - } - } - - return (error); -} - /* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). 
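
With zfs_resolve_shortname(), zfs_path_order(), and the related helpers relocated as above, device-name resolution no longer requires libzfs at all. The following stand-alone sketch is illustrative only and is not part of this patch; it assumes nothing beyond the zfs_resolve_shortname() and zfs_path_order() declarations that this commit moves into libzutil.h:

/*
 * Hypothetical libzutil consumer: expand a shorthand device name and
 * report where it falls in the pool import search order.
 */
#include <stdio.h>
#include <libzutil.h>

int
main(int argc, char **argv)
{
	char path[4096];
	int order;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s <device>\n", argv[0]);
		return (1);
	}

	/* Expand e.g. "sda1" using ZPOOL_IMPORT_PATH or the defaults. */
	if (zfs_resolve_shortname(argv[1], path, sizeof (path)) != 0) {
		(void) fprintf(stderr, "cannot resolve '%s'\n", argv[1]);
		return (1);
	}

	/* Rank the resolved path against the import search directories. */
	if (zfs_path_order(path, &order) == 0)
		(void) printf("%s (search path order %d)\n", path, order);
	else
		(void) printf("%s (not on the import search path)\n", path);

	return (0);
}

Such a tool would link against libzutil (plus its libavl, libefi, and libtpool dependencies) but not libzfs or libzpool, which is precisely what this refactoring enables.
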
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index e10f20dd9..efc44b27e 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -188,11 +188,10 @@ nodist_libzpool_la_SOURCES = \ $(LUA_C) libzpool_la_LIBADD = \ - $(top_builddir)/lib/libavl/libavl.la \ $(top_builddir)/lib/libicp/libicp.la \ $(top_builddir)/lib/libnvpair/libnvpair.la \ - $(top_builddir)/lib/libspl/libspl.la \ - $(top_builddir)/lib/libunicode/libunicode.la + $(top_builddir)/lib/libunicode/libunicode.la \ + $(top_builddir)/lib/libzutil/libzutil.la libzpool_la_LIBADD += $(ZLIB) -ldl libzpool_la_LDFLAGS = -pthread -version-info 2:0:0 diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c index 8a6f4c325..87772bcb7 100644 --- a/lib/libzpool/util.c +++ b/lib/libzpool/util.c @@ -34,112 +34,14 @@ #include <sys/spa.h> #include <sys/fs/zfs.h> #include <sys/refcount.h> +#include <sys/zfs_ioctl.h> #include <dlfcn.h> +#include <libzutil.h> /* * Routines needed by more than one client of libzpool. */ -/* The largest suffix that can fit, aka an exabyte (2^60 / 10^18) */ -#define INDEX_MAX (6) - -/* Verify INDEX_MAX fits */ -CTASSERT_GLOBAL(INDEX_MAX * 10 < sizeof (uint64_t) * 8); - -void -nicenum_scale(uint64_t n, size_t units, char *buf, size_t buflen, - uint32_t flags) -{ - uint64_t divamt = 1024; - uint64_t divisor = 1; - int index = 0; - int rc = 0; - char u; - - if (units == 0) - units = 1; - - if (n > 0) { - n *= units; - if (n < units) - goto overflow; - } - - if (flags & NN_DIVISOR_1000) - divamt = 1000; - - /* - * This tries to find the suffix S(n) such that - * S(n) <= n < S(n+1), where S(n) = 2^(n*10) | 10^(3*n) - * (i.e. 1024/1000, 1,048,576/1,000,000, etc). Stop once S(n) - * is the largest prefix supported (i.e. don't bother computing - * and checking S(n+1). Since INDEX_MAX should be the largest - * suffix that fits (currently an exabyte), S(INDEX_MAX + 1) is - * never checked as it would overflow. - */ - while (index < INDEX_MAX) { - uint64_t newdiv = divisor * divamt; - - /* CTASSERT() guarantee these never trip */ - VERIFY3U(newdiv, >=, divamt); - VERIFY3U(newdiv, >=, divisor); - - if (n < newdiv) - break; - - divisor = newdiv; - index++; - } - - u = " KMGTPE"[index]; - - if (index == 0) { - rc = snprintf(buf, buflen, "%llu", (u_longlong_t)n); - } else if (n % divisor == 0) { - /* - * If this is an even multiple of the base, always display - * without any decimal precision. - */ - rc = snprintf(buf, buflen, "%llu%c", - (u_longlong_t)(n / divisor), u); - } else { - /* - * We want to choose a precision that reflects the best choice - * for fitting in 5 characters. This can get rather tricky - * when we have numbers that are very close to an order of - * magnitude. For example, when displaying 10239 (which is - * really 9.999K), we want only a single place of precision - * for 10.0K. We could develop some complex heuristics for - * this, but it's much easier just to try each combination - * in turn. 
- */ - int i; - for (i = 2; i >= 0; i--) { - if ((rc = snprintf(buf, buflen, "%.*f%c", i, - (double)n / divisor, u)) <= 5) - break; - } - } - - if (rc + 1 > buflen || rc < 0) - goto overflow; - - return; - -overflow: - /* prefer a more verbose message if possible */ - if (buflen > 10) - (void) strlcpy(buf, "<overflow>", buflen); - else - (void) strlcpy(buf, "??", buflen); -} - -void -nicenum(uint64_t num, char *buf, size_t buflen) -{ - nicenum_scale(num, 1, buf, buflen, 0); -} - static void show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) { @@ -300,3 +202,56 @@ set_global_var(char *arg) return (0); } + +static nvlist_t * +refresh_config(void *unused, nvlist_t *tryconfig) +{ + return (spa_tryimport(tryconfig)); +} + +static int +pool_active(void *unused, const char *name, uint64_t guid, + boolean_t *isactive) +{ + zfs_cmd_t *zcp; + nvlist_t *innvl; + char *packed = NULL; + size_t size = 0; + int fd, ret; + + /* + * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active + */ + + fd = open("/dev/zfs", O_RDWR); + if (fd < 0) + return (-1); + + zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); + + innvl = fnvlist_alloc(); + fnvlist_add_boolean_value(innvl, "force", B_FALSE); + + (void) strlcpy(zcp->zc_name, name, sizeof (zcp->zc_name)); + packed = fnvlist_pack(innvl, &size); + zcp->zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zcp->zc_nvlist_src_size = size; + + ret = ioctl(fd, ZFS_IOC_POOL_SYNC, zcp); + + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zcp->zc_nvlist_dst); + nvlist_free(innvl); + umem_free(zcp, sizeof (zfs_cmd_t)); + + (void) close(fd); + + *isactive = (ret == 0); + + return (0); +} + +const pool_config_ops_t libzpool_config_ops = { + .pco_refresh_config = refresh_config, + .pco_pool_active = pool_active, +}; diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am new file mode 100644 index 000000000..720b843ab --- /dev/null +++ b/lib/libzutil/Makefile.am @@ -0,0 +1,27 @@ +include $(top_srcdir)/config/Rules.am + +# Suppress unused but set variable warnings often due to ASSERTs +AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +noinst_LTLIBRARIES = libzutil.la + +USER_C = \ + zutil_device_path.c \ + zutil_import.c \ + zutil_nicenum.c \ + zutil_pool.c + +nodist_libzutil_la_SOURCES = $(USER_C) + +libzutil_la_LIBADD = \ + $(top_builddir)/lib/libavl/libavl.la \ + $(top_builddir)/lib/libefi/libefi.la \ + $(top_builddir)/lib/libtpool/libtpool.la + +libzutil_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) + +EXTRA_DIST = $(USER_C) diff --git a/lib/libzutil/zutil_device_path.c b/lib/libzutil/zutil_device_path.c new file mode 100644 index 000000000..1dc0d4d1d --- /dev/null +++ b/lib/libzutil/zutil_device_path.c @@ -0,0 +1,625 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/efi_partition.h>
+
+#include <libzutil.h>
+#ifdef HAVE_LIBUDEV
+#include <libudev.h>
+#endif
+
+/*
+ * Append partition suffix to an otherwise fully qualified device path.
+ * This is used to generate the full path name as it is stored in
+ * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length
+ * of 'path' will be returned; on error a negative value is returned.
+ */
+int
+zfs_append_partition(char *path, size_t max_len)
+{
+	int len = strlen(path);
+
+	if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
+	    (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
+		if (len + 6 >= max_len)
+			return (-1);
+
+		(void) strcat(path, "-part1");
+		len += 6;
+	} else {
+		if (len + 2 >= max_len)
+			return (-1);
+
+		if (isdigit(path[len-1])) {
+			(void) strcat(path, "p1");
+			len += 2;
+		} else {
+			(void) strcat(path, "1");
+			len += 1;
+		}
+	}
+
+	return (len);
+}
+
+/*
+ * Given a shorthand device name check if a file by that name exists in any
+ * of the 'zpool_default_import_path' or ZPOOL_IMPORT_PATH directories. If
+ * one is found, store its fully qualified path in the 'path' buffer passed
+ * by the caller and return 0, otherwise return an error.
+ */
+int
+zfs_resolve_shortname(const char *name, char *path, size_t len)
+{
+	int i, error = -1;
+	char *dir, *env, *envdup;
+
+	env = getenv("ZPOOL_IMPORT_PATH");
+	errno = ENOENT;
+
+	if (env) {
+		envdup = strdup(env);
+		dir = strtok(envdup, ":");
+		while (dir && error) {
+			(void) snprintf(path, len, "%s/%s", dir, name);
+			error = access(path, F_OK);
+			dir = strtok(NULL, ":");
+		}
+		free(envdup);
+	} else {
+		const char * const *zpool_default_import_path;
+		size_t count;
+
+		zpool_default_import_path = zpool_default_search_paths(&count);
+
+		for (i = 0; i < count && error < 0; i++) {
+			(void) snprintf(path, len, "%s/%s",
+			    zpool_default_import_path[i], name);
+			error = access(path, F_OK);
+		}
+	}
+
+	return (error ? ENOENT : 0);
+}
+
+/*
+ * Given a shorthand device name look for a match against 'cmp_name'. This
+ * is done by checking all prefix expansions using either the default
+ * 'zpool_default_import_path' or the ZPOOL_IMPORT_PATH environment
+ * variable. Proper partition suffixes will be appended if this is a
+ * whole disk. When a match is found 0 is returned, otherwise ENOENT.
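+ *
+ * For example (illustrative), the shorthand "sda" with 'wholedisk' set
+ * expands to "/dev/sda" under the default search paths and then gains a
+ * partition suffix, so it compares equal to a cmp_name of "/dev/sda1".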
+ */ +static int +zfs_strcmp_shortname(const char *name, const char *cmp_name, int wholedisk) +{ + int path_len, cmp_len, i = 0, error = ENOENT; + char *dir, *env, *envdup = NULL; + char path_name[MAXPATHLEN]; + const char * const *zpool_default_import_path; + size_t count; + + zpool_default_import_path = zpool_default_search_paths(&count); + + cmp_len = strlen(cmp_name); + env = getenv("ZPOOL_IMPORT_PATH"); + + if (env) { + envdup = strdup(env); + dir = strtok(envdup, ":"); + } else { + dir = (char *)zpool_default_import_path[i]; + } + + while (dir) { + /* Trim trailing directory slashes from ZPOOL_IMPORT_PATH */ + if (env) { + while (dir[strlen(dir)-1] == '/') + dir[strlen(dir)-1] = '\0'; + } + + path_len = snprintf(path_name, MAXPATHLEN, "%s/%s", dir, name); + if (wholedisk) + path_len = zfs_append_partition(path_name, MAXPATHLEN); + + if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) { + error = 0; + break; + } + + if (env) { + dir = strtok(NULL, ":"); + } else if (++i < count) { + dir = (char *)zpool_default_import_path[i]; + } else { + dir = NULL; + } + } + + if (env) + free(envdup); + + return (error); +} + +/* + * Given either a shorthand or fully qualified path name look for a match + * against 'cmp'. The passed name will be expanded as needed for comparison + * purposes and redundant slashes stripped to ensure an accurate match. + */ +int +zfs_strcmp_pathname(const char *name, const char *cmp, int wholedisk) +{ + int path_len, cmp_len; + char path_name[MAXPATHLEN]; + char cmp_name[MAXPATHLEN]; + char *dir, *dup; + + /* Strip redundant slashes if one exists due to ZPOOL_IMPORT_PATH */ + memset(cmp_name, 0, MAXPATHLEN); + dup = strdup(cmp); + dir = strtok(dup, "/"); + while (dir) { + strlcat(cmp_name, "/", sizeof (cmp_name)); + strlcat(cmp_name, dir, sizeof (cmp_name)); + dir = strtok(NULL, "/"); + } + free(dup); + + if (name[0] != '/') + return (zfs_strcmp_shortname(name, cmp_name, wholedisk)); + + (void) strlcpy(path_name, name, MAXPATHLEN); + path_len = strlen(path_name); + cmp_len = strlen(cmp_name); + + if (wholedisk) { + path_len = zfs_append_partition(path_name, MAXPATHLEN); + if (path_len == -1) + return (ENOMEM); + } + + if ((path_len != cmp_len) || strcmp(path_name, cmp_name)) + return (ENOENT); + + return (0); +} + +/* + * Allocate and return the underlying device name for a device mapper device. + * If a device mapper device maps to multiple devices, return the first device. + * + * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a + * DM device (like /dev/disk/by-vdev/A0) are also allowed. + * + * Returns device name, or NULL on error or no match. If dm_name is not a DM + * device then return NULL. + * + * NOTE: The returned name string must be *freed*. + */ +static char * +dm_get_underlying_path(const char *dm_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char *realp; + char *tmp = NULL; + char *path = NULL; + char *dev_str; + int size; + + if (dm_name == NULL) + return (NULL); + + /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ + realp = realpath(dm_name, NULL); + if (realp == NULL) + return (NULL); + + /* + * If they preface 'dev' with a path (like "/dev") then strip it off. + * We just want the 'dm-N' part. 
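+	 * For example (illustrative), "/dev/mapper/mpatha" might resolve
+	 * to "/dev/dm-0", leaving "dm-0" as the dev_str used for the
+	 * slaves lookup below.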
+ */ + tmp = strrchr(realp, '/'); + if (tmp != NULL) + dev_str = tmp + 1; /* +1 since we want the chr after '/' */ + else + dev_str = tmp; + + size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str); + if (size == -1 || !tmp) + goto end; + + dp = opendir(tmp); + if (dp == NULL) + goto end; + + /* Return first sd* entry in /sys/block/dm-N/slaves/ */ + while ((ep = readdir(dp))) { + if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ + size = asprintf(&path, "/dev/%s", ep->d_name); + break; + } + } + +end: + if (dp != NULL) + closedir(dp); + free(tmp); + free(realp); + return (path); +} + +/* + * Return 1 if device is a device mapper or multipath device. + * Return 0 if not. + */ +int +zfs_dev_is_dm(const char *dev_name) +{ + + char *tmp; + tmp = dm_get_underlying_path(dev_name); + if (tmp == NULL) + return (0); + + free(tmp); + return (1); +} + +/* + * By "whole disk" we mean an entire physical disk (something we can + * label, toggle the write cache on, etc.) as opposed to the full + * capacity of a pseudo-device such as lofi or did. We act as if we + * are labeling the disk, which should be a pretty good test of whether + * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if + * it isn't. + */ +int +zfs_dev_is_whole_disk(const char *dev_name) +{ + struct dk_gpt *label; + int fd; + + if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) + return (0); + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { + (void) close(fd); + return (0); + } + + efi_free(label); + (void) close(fd); + + return (1); +} + +/* + * Lookup the underlying device for a device name + * + * Often you'll have a symlink to a device, a partition device, + * or a multipath device, and want to look up the underlying device. + * This function returns the underlying device name. If the device + * name is already the underlying device, then just return the same + * name. If the device is a DM device with multiple underlying devices + * then return the first one. + * + * For example: + * + * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda + * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 + * returns: /dev/sda + * + * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) + * dev_name: /dev/mapper/mpatha + * returns: /dev/sda (first device) + * + * 3. /dev/sda (already the underlying device) + * dev_name: /dev/sda + * returns: /dev/sda + * + * 4. /dev/dm-3 (mapped to /dev/sda) + * dev_name: /dev/dm-3 + * returns: /dev/sda + * + * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 + * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 + * returns: /dev/sdb + * + * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 + * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a + * returns: /dev/sda + * + * Returns underlying device name, or NULL on error or no match. + * + * NOTE: The returned name string must be *freed*. + */ +char * +zfs_get_underlying_path(const char *dev_name) +{ + char *name = NULL; + char *tmp; + + if (dev_name == NULL) + return (NULL); + + tmp = dm_get_underlying_path(dev_name); + + /* dev_name not a DM device, so just un-symlinkize it */ + if (tmp == NULL) + tmp = realpath(dev_name, NULL); + + if (tmp != NULL) { + name = zfs_strip_partition_path(tmp); + free(tmp); + } + + return (name); +} + +/* + * Given a dev name like "sda", return the full enclosure sysfs path to + * the disk. You can also pass in the name with "/dev" prepended + * to it (like /dev/sda). 
+ *
+ * For example, disk "sda" in enclosure slot 1:
+ *	dev: "sda"
+ *	returns: "/sys/class/enclosure/1:0:3:0/Slot 1"
+ *
+ * 'dev' must be a non-devicemapper device.
+ *
+ * Returned string must be freed.
+ */
+char *
+zfs_get_enclosure_sysfs_path(const char *dev_name)
+{
+	DIR *dp = NULL;
+	struct dirent *ep;
+	char buf[MAXPATHLEN];
+	char *tmp1 = NULL;
+	char *tmp2 = NULL;
+	char *tmp3 = NULL;
+	char *path = NULL;
+	size_t size;
+	int tmpsize;
+
+	if (dev_name == NULL)
+		return (NULL);
+
+	/* If they preface 'dev' with a path (like "/dev") then strip it off */
+	tmp1 = strrchr(dev_name, '/');
+	if (tmp1 != NULL)
+		dev_name = tmp1 + 1;	/* +1 since we want the chr after '/' */
+
+	tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name);
+	if (tmpsize == -1 || tmp1 == NULL) {
+		tmp1 = NULL;
+		goto end;
+	}
+
+	dp = opendir(tmp1);
+	if (dp == NULL) {
+		tmp1 = NULL;	/* To make free() at the end a NOP */
+		goto end;
+	}
+
+	/*
+	 * Look through all sysfs entries in /sys/block/<dev>/device for
+	 * the enclosure symlink.
+	 */
+	while ((ep = readdir(dp))) {
+		/* Ignore everything that's not our enclosure_device link */
+		if (strstr(ep->d_name, "enclosure_device") == NULL)
+			continue;
+
+		if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 ||
+		    tmp2 == NULL)
+			break;
+
+		size = readlink(tmp2, buf, sizeof (buf));
+
+		/* Did readlink fail or crop the link name? */
+		if (size == -1 || size >= sizeof (buf)) {
+			free(tmp2);
+			tmp2 = NULL;	/* To make free() at the end a NOP */
+			break;
+		}
+
+		/*
+		 * We got a valid link. readlink() doesn't terminate strings
+		 * so we have to do it.
+		 */
+		buf[size] = '\0';
+
+		/*
+		 * Our link will look like:
+		 *
+		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
+		 *
+		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
+		 */
+		tmp3 = strstr(buf, "enclosure");
+		if (tmp3 == NULL)
+			break;
+
+		if (asprintf(&path, "/sys/class/%s", tmp3) == -1) {
+			/* If asprintf() fails, 'path' is undefined */
+			path = NULL;
+			break;
+		}
+
+		if (path == NULL)
+			break;
+	}
+
+end:
+	free(tmp2);
+	free(tmp1);
+
+	if (dp != NULL)
+		closedir(dp);
+
+	return (path);
+}
+
+/*
+ * Remove partition suffix from a vdev path. Partition suffixes may take three
+ * forms: "-partX", "pX", or "X", where X is a string of digits. The second
+ * case only occurs when the suffix is preceded by a digit, i.e. "md0p0". The
+ * third case only occurs when preceded by a string matching the regular
+ * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
+ *
+ * caller must free the returned string
+ */
+char *
+zfs_strip_partition(char *path)
+{
+	char *tmp = strdup(path);
+	char *part = NULL, *d = NULL;
+	if (!tmp)
+		return (NULL);
+
+	if ((part = strstr(tmp, "-part")) && part != tmp) {
+		d = part + 5;
+	} else if ((part = strrchr(tmp, 'p')) &&
+	    part > tmp + 1 && isdigit(*(part-1))) {
+		d = part + 1;
+	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
+	    tmp[1] == 'd') {
+		for (d = &tmp[2]; isalpha(*d); part = ++d) { }
+	} else if (strncmp("xvd", tmp, 3) == 0) {
+		for (d = &tmp[3]; isalpha(*d); part = ++d) { }
+	}
+	if (part && d && *d != '\0') {
+		for (; isdigit(*d); d++) { }
+		if (*d == '\0')
+			*part = '\0';
+	}
+
+	return (tmp);
+}
+
+/*
+ * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname
+ *
+ * path: /dev/sda1
+ * returns: /dev/sda
+ *
+ * Returned string must be freed.
+ */ +char * +zfs_strip_partition_path(char *path) +{ + char *newpath = strdup(path); + char *sd_offset; + char *new_sd; + + if (!newpath) + return (NULL); + + /* Point to "sda1" part of "/dev/sda1" */ + sd_offset = strrchr(newpath, '/') + 1; + + /* Get our new name "sda" */ + new_sd = zfs_strip_partition(sd_offset); + if (!new_sd) { + free(newpath); + return (NULL); + } + + /* Paste the "sda" where "sda1" was */ + strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); + + /* Free temporary "sda" */ + free(new_sd); + + return (newpath); +} + +#ifdef HAVE_LIBUDEV +/* + * A disk is considered a multipath whole disk when: + * DEVNAME key value has "dm-" + * DM_NAME key value has "mpath" prefix + * DM_UUID key exists + * ID_PART_TABLE_TYPE key does not exist or is not gpt + */ +static boolean_t +udev_mpath_whole_disk(struct udev_device *dev) +{ + const char *devname, *type, *uuid; + + devname = udev_device_get_property_value(dev, "DEVNAME"); + type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + + if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && + ((type == NULL) || (strcmp(type, "gpt") != 0)) && + (uuid != NULL)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a disk is effectively a multipath whole disk + */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname; + boolean_t wholedisk = B_FALSE; + + if (realpath(path, nodepath) == NULL) + return (B_FALSE); + sysname = strrchr(nodepath, '/') + 1; + if (strncmp(sysname, "dm-", 3) != 0) + return (B_FALSE); + if ((udev = udev_new()) == NULL) + return (B_FALSE); + if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname)) == NULL) { + udev_device_unref(dev); + return (B_FALSE); + } + + wholedisk = udev_mpath_whole_disk(dev); + + udev_device_unref(dev); + return (wholedisk); +} +#endif diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c new file mode 100644 index 000000000..f6e56fabf --- /dev/null +++ b/lib/libzutil/zutil_import.c @@ -0,0 +1,2389 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * Pool import support functions. + * + * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since + * these commands are expected to run in the global zone, we can assume + * that the devices are all readable when called. 
+ * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include <ctype.h> +#include <devid.h> +#include <dirent.h> +#include <errno.h> +#include <libintl.h> +#include <libgen.h> +#ifdef HAVE_LIBUDEV +#include <libudev.h> +#include <sched.h> +#endif +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/dktp/fdisk.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/vdev_impl.h> + +#include <blkid/blkid.h> +#include <thread_pool.h> +#include <libzutil.h> +#include <libnvpair.h> + +#define IMPORT_ORDER_PREFERRED_1 1 +#define IMPORT_ORDER_PREFERRED_2 2 +#define IMPORT_ORDER_SCAN_OFFSET 10 +#define IMPORT_ORDER_DEFAULT 100 +#define DEFAULT_IMPORT_PATH_SIZE 9 + +#define EZFS_BADCACHE "invalid or missing cache file" +#define EZFS_BADPATH "must be an absolute path" +#define EZFS_NOMEM "out of memory" +#define EZFS_EACESS "some devices require root privileges" + +typedef struct libpc_handle { + boolean_t lpc_printerr; + boolean_t lpc_open_access_error; + boolean_t lpc_desc_active; + char lpc_desc[1024]; + const pool_config_ops_t *lpc_ops; + void *lpc_lib_handle; +} libpc_handle_t; + +/*PRINTFLIKE2*/ +static void +zfs_error_aux(libpc_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + (void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap); + hdl->lpc_desc_active = B_TRUE; + + va_end(ap); +} + +static void +zfs_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap) +{ + char action[1024]; + + (void) vsnprintf(action, sizeof (action), fmt, ap); + + if (hdl->lpc_desc_active) + hdl->lpc_desc_active = B_FALSE; + else + hdl->lpc_desc[0] = '\0'; + + if (hdl->lpc_printerr) { + if (hdl->lpc_desc[0] != '\0') + error = hdl->lpc_desc; + + (void) fprintf(stderr, "%s: %s\n", action, error); + } +} + +/*PRINTFLIKE3*/ +static int +zfs_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + zfs_verror(hdl, error, fmt, ap); + + va_end(ap); + + return (-1); +} + +static int +zfs_error(libpc_handle_t *hdl, const char *error, const char *msg) +{ + return (zfs_error_fmt(hdl, error, "%s", msg)); +} + +static int +no_memory(libpc_handle_t *hdl) +{ + zfs_error(hdl, EZFS_NOMEM, "internal error"); + exit(1); +} + +static void * +zfs_alloc(libpc_handle_t *hdl, size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + (void) no_memory(hdl); + + return (data); +} + +static char * +zfs_strdup(libpc_handle_t *hdl, const char *str) +{ + char *ret; + + if ((ret = strdup(str)) == NULL) + (void) no_memory(hdl); + + return (ret); +} + +/* + * Intermediate structures used to gather configuration information. 
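+ *
+ * The lists nest as pool_entry -> vdev_entry -> config_entry, mirroring
+ * the pool guid -> toplevel vdev guid -> label txg hierarchy described
+ * above, while name_entry keeps a flat vdev guid -> path mapping.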
+ */ +typedef struct config_entry { + uint64_t ce_txg; + nvlist_t *ce_config; + struct config_entry *ce_next; +} config_entry_t; + +typedef struct vdev_entry { + uint64_t ve_guid; + config_entry_t *ve_configs; + struct vdev_entry *ve_next; +} vdev_entry_t; + +typedef struct pool_entry { + uint64_t pe_guid; + vdev_entry_t *pe_vdevs; + struct pool_entry *pe_next; +} pool_entry_t; + +typedef struct name_entry { + char *ne_name; + uint64_t ne_guid; + uint64_t ne_order; + uint64_t ne_num_labels; + struct name_entry *ne_next; +} name_entry_t; + +typedef struct pool_list { + pool_entry_t *pools; + name_entry_t *names; +} pool_list_t; + +#define ZVOL_ROOT "/dev/zvol" +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* + * Linux persistent device strings for vdev labels + * + * based on libudev for consistency with libudev disk add/remove events + */ + +typedef struct vdev_dev_strs { + char vds_devid[128]; + char vds_devphys[128]; +} vdev_dev_strs_t; + +/* + * Obtain the persistent device id string (describes what) + * + * used by ZED vdev matching for auto-{online,expand,replace} + */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + struct udev_list_entry *entry; + const char *bus; + char devbyid[MAXPATHLEN]; + + /* The bus based by-id path is preferred */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + + if (bus == NULL) { + const char *dm_uuid; + + /* + * For multipath nodes use the persistent uuid based identifier + * + * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f + */ + dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (dm_uuid != NULL) { + (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); + return (0); + } + + /* + * For volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + /* + * NVME 'by-id' symlinks are similar to bus case + */ + struct udev_device *parent; + + parent = udev_device_get_parent_with_subsystem_devtype(dev, + "nvme", NULL); + if (parent != NULL) + bus = "nvme"; /* continue with bus symlink search */ + else + return (ENODATA); + } + + /* + * locate the bus specific by-id link + */ + (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, devbyid, strlen(devbyid)) == 0) { + name += strlen(DEV_BYID_PATH); + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * Obtain the persistent physical location string (describes where) + * + * used by ZED vdev matching for auto-{online,expand,replace} + */ +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + const char *physpath = NULL; + struct udev_list_entry *entry; + + /* + * Normal disks use ID_PATH for their physical path. + */ + physpath = udev_device_get_property_value(dev, "ID_PATH"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * Device mapper devices are virtual and don't have a physical + * path. For them we use ID_VDEV instead, which is setup via the + * /etc/vdev_id.conf file. 
ID_VDEV provides a persistent path
+	 * to a virtual device. If you don't have vdev_id.conf setup,
+	 * you cannot use multipath autoreplace with device mapper.
+	 */
+	physpath = udev_device_get_property_value(dev, "ID_VDEV");
+	if (physpath != NULL && strlen(physpath) > 0) {
+		(void) strlcpy(bufptr, physpath, buflen);
+		return (0);
+	}
+
+	/*
+	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
+	 */
+	entry = udev_device_get_devlinks_list_entry(dev);
+	while (entry != NULL) {
+		physpath = udev_list_entry_get_name(entry);
+		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+			(void) strlcpy(bufptr, physpath, buflen);
+			return (0);
+		}
+		entry = udev_list_entry_get_next(entry);
+	}
+
+	/*
+	 * For all other devices fallback to using the by-uuid name.
+	 */
+	entry = udev_device_get_devlinks_list_entry(dev);
+	while (entry != NULL) {
+		physpath = udev_list_entry_get_name(entry);
+		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
+			(void) strlcpy(bufptr, physpath, buflen);
+			return (0);
+		}
+		entry = udev_list_entry_get_next(entry);
+	}
+
+	return (ENODATA);
+}
+
+/*
+ * A disk is considered a multipath whole disk when:
+ *	DEVNAME key value has "dm-"
+ *	DM_NAME key value has "mpath" prefix
+ *	DM_UUID key exists
+ *	ID_PART_TABLE_TYPE key does not exist or is not gpt
+ */
+static boolean_t
+udev_mpath_whole_disk(struct udev_device *dev)
+{
+	const char *devname, *type, *uuid;
+
+	devname = udev_device_get_property_value(dev, "DEVNAME");
+	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
+	uuid = udev_device_get_property_value(dev, "DM_UUID");
+
+	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
+	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
+	    (uuid != NULL)) {
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+static int
+udev_device_is_ready(struct udev_device *dev)
+{
+#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
+	return (udev_device_get_is_initialized(dev));
+#else
+	/* wait for DEVLINKS property to be initialized */
+	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
+#endif
+}
+
+/*
+ * Wait up to timeout_ms for udev to set up the device node. The device is
+ * considered ready when libudev determines it has been initialized, all of
+ * the device links have been verified to exist, and it has been allowed to
+ * settle. At this point the device can be accessed reliably.
+ * Depending on the complexity of the udev rules this process could take
+ * several seconds.
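+ *
+ * Usage sketch (illustrative): wait up to three seconds for a freshly
+ * labeled disk to appear before opening it:
+ *
+ *	if (zpool_label_disk_wait(path, 3000) == 0)
+ *		fd = open(path, O_RDONLY);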
+ */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ +#ifdef HAVE_LIBUDEV + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname = NULL; + int ret = ENODEV; + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + start = gethrtime(); + settle = 0; + + do { + if (sysname == NULL) { + if (realpath(path, nodepath) != NULL) { + sysname = strrchr(nodepath, '/') + 1; + } else { + (void) usleep(sleep_ms * MILLISEC); + continue; + } + } + + dev = udev_device_new_from_subsystem_sysname(udev, + "block", sysname); + if ((dev != NULL) && udev_device_is_ready(dev)) { + struct udev_list_entry *links, *link = NULL; + + ret = 0; + links = udev_device_get_devlinks_list_entry(dev); + + udev_list_entry_foreach(link, links) { + struct stat64 statbuf; + const char *name; + + name = udev_list_entry_get_name(link); + errno = 0; + if (stat64(name, &statbuf) == 0 && errno == 0) + continue; + + settle = 0; + ret = ENODEV; + break; + } + + if (ret == 0) { + if (settle == 0) { + settle = gethrtime(); + } else if (NSEC2MSEC(gethrtime() - settle) >= + settle_ms) { + udev_device_unref(dev); + break; + } + } + } + + udev_device_unref(dev); + (void) usleep(sleep_ms * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + udev_unref(udev); + + return (ret); +#else + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +#endif /* HAVE_LIBUDEV */ +} + +/* + * Encode the persistent devices strings + * used for the vdev disk label + */ +static int +encode_device_strings(const char *path, vdev_dev_strs_t *ds, + boolean_t wholedisk) +{ +#ifdef HAVE_LIBUDEV + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname; + int ret = ENODEV; + hrtime_t start; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + /* resolve path to a runtime device node instance */ + if (realpath(path, nodepath) == NULL) + goto no_dev; + + sysname = strrchr(nodepath, '/') + 1; + + /* + * Wait up to 3 seconds for udev to set up the device node context + */ + start = gethrtime(); + do { + dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname); + if (dev == NULL) + goto no_dev; + if (udev_device_is_ready(dev)) + break; /* udev ready */ + + udev_device_unref(dev); + dev = NULL; + + if (NSEC2MSEC(gethrtime() - start) < 10) + (void) sched_yield(); /* yield/busy wait up to 10ms */ + else + (void) usleep(10 * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); + + if (dev == NULL) + goto no_dev; + + /* + * Only whole disks require extra device strings + */ + if (!wholedisk && !udev_mpath_whole_disk(dev)) + goto no_dev; + + ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); + if (ret != 0) + goto no_dev_ref; + + /* physical location string (optional) */ + if (zfs_device_get_physical(dev, ds->vds_devphys, + sizeof (ds->vds_devphys)) != 0) { + ds->vds_devphys[0] = '\0'; /* empty string --> not available */ + } + +no_dev_ref: + udev_device_unref(dev); +no_dev: + udev_unref(udev); + + 
return (ret);
+#else
+	return (ENOENT);
+#endif
+}
+
+/*
+ * Update a leaf vdev's persistent device strings (Linux only)
+ *
+ * - only applies for a dedicated leaf vdev (aka whole disk)
+ * - updated during pool create|add|attach|import
+ * - used for device matching during auto-{online,expand,replace}
+ * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
+ * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
+ *
+ * single device node example:
+ *	devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
+ *	phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
+ *
+ * multipath device node example:
+ *	devid: 'dm-uuid-mpath-35000c5006304de3f'
+ *
+ * We also store the enclosure sysfs path for turning on enclosure LEDs
+ * (if applicable):
+ *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
+ */
+void
+update_vdev_config_dev_strs(nvlist_t *nv)
+{
+	vdev_dev_strs_t vds;
+	char *env, *type, *path;
+	uint64_t wholedisk = 0;
+	char *upath, *spath;
+
+	/*
+	 * For the benefit of legacy ZFS implementations, allow
+	 * for opting out of devid strings in the vdev label.
+	 *
+	 * example use:
+	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
+	 *
+	 * explanation:
+	 * Older ZFS on Linux implementations had issues when attempting to
+	 * display pool config VDEV names if a "devid" NVP value is present
+	 * in the pool's config.
+	 *
+	 * For example, a pool that originated on illumos platform would
+	 * have a devid value in the config and "zpool status" would fail
+	 * when listing the config.
+	 *
+	 * A pool can be stripped of any "devid" values on import or
+	 * prevented from adding them on zpool create|add by setting
+	 * ZFS_VDEV_DEVID_OPT_OUT.
+	 */
+	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
+	if (env && (strtoul(env, NULL, 0) > 0 ||
+	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+		return;
+	}
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
+	    strcmp(type, VDEV_TYPE_DISK) != 0) {
+		return;
+	}
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		return;
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+
+	/*
+	 * Update device string values in config nvlist
+	 */
+	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
+		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
+		if (vds.vds_devphys[0] != '\0') {
+			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+			    vds.vds_devphys);
+		}
+
+		/* Add enclosure sysfs path (if disk is in an enclosure) */
+		upath = zfs_get_underlying_path(path);
+		spath = zfs_get_enclosure_sysfs_path(upath);
+		if (spath)
+			nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+			    spath);
+		else
+			nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+
+		free(upath);
+		free(spath);
+	} else {
+		/* clear out any stale entries */
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+	}
+}
+
+/*
+ * Go through and fix up any path and/or devid information for the given vdev
+ * configuration.
+ */
+static int
+fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	uint64_t guid;
+	name_entry_t *ne, *best;
+	char *path;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			if (fix_paths(hdl, child[c], names) != 0)
+				return (-1);
+		return (0);
+	}
+
+	/*
+	 * This is a leaf (file or disk) vdev. In either case, go through
+	 * the name list and see if we find a matching guid. If so, replace
+	 * the path and see if we can calculate a new devid.
+	 *
+	 * There may be multiple names associated with a particular guid, in
+	 * which case we have overlapping partitions or multiple paths to the
+	 * same disk. In this case we prefer to use the path name which
+	 * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
+	 * use the lowest order device which corresponds to the first match
+	 * while traversing the ZPOOL_IMPORT_PATH search path.
+	 */
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		path = NULL;
+
+	best = NULL;
+	for (ne = names; ne != NULL; ne = ne->ne_next) {
+		if (ne->ne_guid == guid) {
+			if (path == NULL) {
+				best = ne;
+				break;
+			}
+
+			if ((strlen(path) == strlen(ne->ne_name)) &&
+			    strncmp(path, ne->ne_name, strlen(path)) == 0) {
+				best = ne;
+				break;
+			}
+
+			if (best == NULL) {
+				best = ne;
+				continue;
+			}
+
+			/* Prefer paths with more vdev labels. */
+			if (ne->ne_num_labels > best->ne_num_labels) {
+				best = ne;
+				continue;
+			}
+
+			/* Prefer paths earlier in the search order. */
+			if (ne->ne_num_labels == best->ne_num_labels &&
+			    ne->ne_order < best->ne_order) {
+				best = ne;
+				continue;
+			}
+		}
+	}
+
+	if (best == NULL)
+		return (0);
+
+	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
+		return (-1);
+
+	/* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
+	update_vdev_config_dev_strs(nv);
+
+	return (0);
+}
+
+/*
+ * Add the given configuration to the list of known devices.
+ */
+static int
+add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
+    int order, int num_labels, nvlist_t *config)
+{
+	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
+	pool_entry_t *pe;
+	vdev_entry_t *ve;
+	config_entry_t *ce;
+	name_entry_t *ne;
+
+	/*
+	 * If this is a hot spare not currently in use or level 2 cache
+	 * device, add it to the list of names to translate, but don't do
+	 * anything else.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &state) == 0 &&
+	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
+		if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
+			return (-1);
+
+		if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
+			free(ne);
+			return (-1);
+		}
+		ne->ne_guid = vdev_guid;
+		ne->ne_order = order;
+		ne->ne_num_labels = num_labels;
+		ne->ne_next = pl->names;
+		pl->names = ne;
+
+		return (0);
+	}
+
+	/*
+	 * If we have a valid config but cannot read any of these fields, then
+	 * it means we have a half-initialized label. In vdev_label_init()
+	 * we write a label with txg == 0 so that we can identify the device
+	 * in case the user refers to the same disk later on. If we fail to
+	 * create the pool, we'll be left with a label in this state
+	 * which should not be considered part of a valid pool.
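+	 * Such half-initialized labels (txg == 0) are skipped below rather
+	 * than treated as errors.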
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
+	    &vdev_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+	    &top_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &txg) != 0 || txg == 0) {
+		return (0);
+	}
+
+	/*
+	 * First, see if we know about this pool. If not, then add it to the
+	 * list of known pools.
+	 */
+	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
+		if (pe->pe_guid == pool_guid)
+			break;
+	}
+
+	if (pe == NULL) {
+		if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
+			return (-1);
+		}
+		pe->pe_guid = pool_guid;
+		pe->pe_next = pl->pools;
+		pl->pools = pe;
+	}
+
+	/*
+	 * Second, see if we know about this toplevel vdev. Add it if it's
+	 * missing.
+	 */
+	for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
+		if (ve->ve_guid == top_guid)
+			break;
+	}
+
+	if (ve == NULL) {
+		if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
+			return (-1);
+		}
+		ve->ve_guid = top_guid;
+		ve->ve_next = pe->pe_vdevs;
+		pe->pe_vdevs = ve;
+	}
+
+	/*
+	 * Third, see if we have a config with a matching transaction group. If
+	 * so, then we do nothing. Otherwise, add it to the list of known
+	 * configs.
+	 */
+	for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
+		if (ce->ce_txg == txg)
+			break;
+	}
+
+	if (ce == NULL) {
+		if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
+			return (-1);
+		}
+		ce->ce_txg = txg;
+		ce->ce_config = fnvlist_dup(config);
+		ce->ce_next = ve->ve_configs;
+		ve->ve_configs = ce;
+	}
+
+	/*
+	 * At this point we've successfully added our config to the list of
+	 * known configs. The last thing to do is add the vdev guid -> path
+	 * mappings so that we can fix up the configuration as necessary before
+	 * doing the import.
+	 */
+	if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
+		return (-1);
+
+	if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
+		free(ne);
+		return (-1);
+	}
+
+	ne->ne_guid = vdev_guid;
+	ne->ne_order = order;
+	ne->ne_num_labels = num_labels;
+	ne->ne_next = pl->names;
+	pl->names = ne;
+
+	return (0);
+}
+
+static int
+pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,
+    boolean_t *isactive)
+{
+	ASSERT(hdl->lpc_ops->pco_pool_active != NULL);
+
+	int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,
+	    guid, isactive);
+
+	return (error);
+}
+
+static nvlist_t *
+refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)
+{
+	ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);
+
+	return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,
+	    tryconfig));
+}
+
+/*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+static boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+	int c;
+
+	for (c = 0; c < holes; c++) {
+
+		/* Top-level is a hole */
+		if (hole_array[c] == id)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Convert our list of pools into the definitive set of configurations. We
+ * start by picking the best config for each toplevel vdev. Once that's done,
+ * we assemble the toplevel vdevs into a full config for the pool. We make a
+ * pass to fix up any incorrect paths, and then add it to the main list to
+ * return to the user.
+ */ +static nvlist_t * +get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, + nvlist_t *policy) +{ + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; + boolean_t config_seen; + uint64_t best_txg; + char *name, *hostname = NULL; + uint64_t guid; + uint_t children = 0; + nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; + uint_t c; + boolean_t isactive; + uint64_t hostid; + nvlist_t *nvl; + boolean_t valid_top_config = B_FALSE; + + if (nvlist_alloc(&ret, 0, 0) != 0) + goto nomem; + + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + uint64_t id, max_txg = 0; + + if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + config_seen = B_FALSE; + + /* + * Iterate over all toplevel vdevs. Grab the pool configuration + * from the first one we find, and then go through the rest and + * add them as necessary to the 'vdevs' member of the config. + */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + + /* + * Determine the best configuration for this vdev by + * selecting the config with the latest transaction + * group. + */ + best_txg = 0; + for (ce = ve->ve_configs; ce != NULL; + ce = ce->ce_next) { + + if (ce->ce_txg > best_txg) { + tmp = ce->ce_config; + best_txg = ce->ce_txg; + } + } + + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. + */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + + if (!config_seen) { + /* + * Copy the relevant pieces of data to the pool + * configuration: + * + * version + * pool guid + * name + * comment (if available) + * pool state + * hostid (if available) + * hostname (if available) + */ + uint64_t state, version; + char *comment = NULL; + + version = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_VERSION, version); + guid = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid); + name = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name); + + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMMENT, &comment) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment); + + state = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state); + + hostid = 0; + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid); + hostname = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, hostname); + } + + config_seen = B_TRUE; + } + + /* + * Add this top-level 
vdev to the child array. + */ + verify(nvlist_lookup_nvlist(tmp, + ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); + verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, + &id) == 0); + + if (id >= children) { + nvlist_t **newchild; + + newchild = zfs_alloc(hdl, (id + 1) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = id + 1; + } + if (nvlist_dup(nvtop, &child[id], 0) != 0) + goto nomem; + + } + + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. + */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = zfs_alloc(hdl, (max_id) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = max_id; + } + } + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !vdev_is_hole(hole_array, holes, c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. + */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(holey); + goto nomem; + } + child[c] = holey; + } + } + + /* + * Look for any missing top-level vdevs. If this is the case, + * create a faked up 'missing' vdev as a placeholder. We cannot + * simply compress the child array, because the kernel performs + * certain checks to make sure the vdev IDs match their location + * in the configuration. + */ + for (c = 0; c < children; c++) { + if (child[c] == NULL) { + nvlist_t *missing; + if (nvlist_alloc(&missing, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + if (nvlist_add_string(missing, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(missing); + goto nomem; + } + child[c] = missing; + } + } + + /* + * Put all of this pool's top-level vdevs into a root vdev. + */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + child, children) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + children = 0; + child = NULL; + + /* + * Go through and fix up any paths and/or devids based on our + * known list of vdev GUID -> path mappings. 
+ */ + if (fix_paths(hdl, nvroot, pl->names) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + /* + * Add the root vdev to this pool's configuration. + */ + if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + nvroot) != 0) { + nvlist_free(nvroot); + goto nomem; + } + nvlist_free(nvroot); + + /* + * zdb uses this path to report on active pools that were + * imported or created using -R. + */ + if (active_ok) + goto add_pool; + + /* + * Determine if this pool is currently active, in which case we + * can't actually import it. + */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + if (pool_active(hdl, name, guid, &isactive) != 0) + goto error; + + if (isactive) { + nvlist_free(config); + config = NULL; + continue; + } + + if (policy != NULL) { + if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, + policy) != 0) + goto nomem; + } + + if ((nvl = refresh_config(hdl, config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } + + nvlist_free(config); + config = nvl; + + /* + * Go through and update the paths for spares, now that we have + * them. + */ + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + if (fix_paths(hdl, spares[i], pl->names) != 0) + goto nomem; + } + } + + /* + * Update the paths for l2cache devices. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + for (i = 0; i < nl2cache; i++) { + if (fix_paths(hdl, l2cache[i], pl->names) != 0) + goto nomem; + } + } + + /* + * Restore the original information read from the actual label. + */ + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, + DATA_TYPE_STRING); + if (hostid != 0) { + verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, + hostname) == 0); + } + +add_pool: + /* + * Add this pool to the list of configs. + */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + + if (nvlist_add_nvlist(ret, name, config) != 0) + goto nomem; + + nvlist_free(config); + config = NULL; + } + + return (ret); + +nomem: + (void) no_memory(hdl); +error: + nvlist_free(config); + nvlist_free(ret); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + + return (NULL); +} + +/* + * Return the offset of the given label. + */ +static uint64_t +label_offset(uint64_t size, int l) +{ + ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Given a file descriptor, read the label information and return an nvlist + * describing the configuration, if there is one. The number of valid + * labels found will be returned in num_labels when non-NULL. 
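+ *
+ * Usage sketch (illustrative):
+ *
+ *	nvlist_t *config;
+ *	int num_labels;
+ *
+ *	if (zpool_read_label(fd, &config, &num_labels) == 0 &&
+ *	    config != NULL) {
+ *		... inspect the label config ...
+ *		nvlist_free(config);
+ *	}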
+ */ +int +zpool_read_label(int fd, nvlist_t **config, int *num_labels) +{ + struct stat64 statbuf; + int l, count = 0; + vdev_label_t *label; + nvlist_t *expected_config = NULL; + uint64_t expected_guid = 0, size; + int error; + + *config = NULL; + + if (fstat64_blk(fd, &statbuf) == -1) + return (0); + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); + + error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label)); + if (error) + return (-1); + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, txg; + + if (pread64(fd, label, sizeof (vdev_label_t), + label_offset(size, l)) != sizeof (vdev_label_t)) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + nvlist_free(*config); + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(*config); + continue; + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(*config); + continue; + } + + if (expected_guid) { + if (expected_guid == guid) + count++; + + nvlist_free(*config); + } else { + expected_config = *config; + expected_guid = guid; + count++; + } + } + + if (num_labels != NULL) + *num_labels = count; + + free(label); + *config = expected_config; + + return (0); +} + +typedef struct rdsk_node { + char *rn_name; /* Full path to device */ + int rn_order; /* Preferred order (low to high) */ + int rn_num_labels; /* Number of valid labels */ + uint64_t rn_vdev_guid; /* Expected vdev guid when set */ + libpc_handle_t *rn_hdl; + nvlist_t *rn_config; /* Label config */ + avl_tree_t *rn_avl; + avl_node_t rn_node; + pthread_mutex_t *rn_lock; + boolean_t rn_labelpaths; +} rdsk_node_t; + +/* + * Sorted by vdev guid and full path to allow for multiple entries with + * the same full path name. This is required because it's possible to + * have multiple block devices with labels that refer to the same + * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both + * entries need to be added to the cache. Scenarios where this can occur + * include overwritten pool labels, devices which are visible from multiple + * hosts and multipath devices. 
+ */
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+	const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+	const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+	uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
+	uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
+	int rv;
+
+	rv = AVL_CMP(guid1, guid2);
+	if (rv)
+		return (rv);
+
+	return (AVL_ISIGN(strcmp(nm1, nm2)));
+}
+
+static boolean_t
+is_watchdog_dev(char *dev)
+{
+	/* For 'watchdog' dev */
+	if (strcmp(dev, "watchdog") == 0)
+		return (B_TRUE);
+
+	/* For 'watchdog<digit><whatever>' */
+	if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static int
+label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
+    uint64_t vdev_guid, char **path, char **devid)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	uint64_t guid;
+	char *val;
+	int error;
+
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			error = label_paths_impl(hdl, child[c],
+			    pool_guid, vdev_guid, path, devid);
+			if (error)
+				return (error);
+		}
+		return (0);
+	}
+
+	if (nvroot == NULL)
+		return (0);
+
+	error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
+	if ((error != 0) || (guid != vdev_guid))
+		return (0);
+
+	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
+	if (error == 0)
+		*path = val;
+
+	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
+	if (error == 0)
+		*devid = val;
+
+	return (0);
+}
+
+/*
+ * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
+ * and store these strings as config_path and devid_path respectively.
+ * The returned pointers are only valid as long as label remains valid.
+ */
+static int
+label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid)
+{
+	nvlist_t *nvroot;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+
+	*path = NULL;
+	*devid = NULL;
+
+	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
+	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid))
+		return (ENOENT);
+
+	return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
+	    devid));
+}
+
+static void
+zpool_open_func(void *arg)
+{
+	rdsk_node_t *rn = arg;
+	libpc_handle_t *hdl = rn->rn_hdl;
+	struct stat64 statbuf;
+	nvlist_t *config;
+	char *bname, *dupname;
+	uint64_t vdev_guid = 0;
+	int error;
+	int num_labels = 0;
+	int fd;
+
+	/*
+	 * Skip devices with well known prefixes; there can be side effects
+	 * when opening these devices which need to be avoided.
+	 *
+	 * hpet     - High Precision Event Timer
+	 * watchdog - Watchdog must be closed in a special way.
+	 */
+	dupname = zfs_strdup(hdl, rn->rn_name);
+	bname = basename(dupname);
+	error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
+	free(dupname);
+	if (error)
+		return;
+
+	/*
+	 * Ignore failed stats. We only want regular files and block devices.
+	 */
+	if (stat64(rn->rn_name, &statbuf) != 0 ||
+	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
+		return;
+
+	/*
+	 * Preferentially open using O_DIRECT to bypass the block device
+	 * cache which may be stale for multipath devices. An EINVAL errno
+	 * indicates O_DIRECT is unsupported so fall back to just O_RDONLY.
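+	 *
+	 * An EACCES failure is additionally recorded in the handle so the
+	 * caller can warn that some devices require root privileges.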
+	 */
+	fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
+	if ((fd < 0) && (errno == EINVAL))
+		fd = open(rn->rn_name, O_RDONLY);
+
+	if ((fd < 0) && (errno == EACCES))
+		hdl->lpc_open_access_error = B_TRUE;
+
+	if (fd < 0)
+		return;
+
+	/*
+	 * This file is too small to hold a zpool
+	 */
+	if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	}
+
+	error = zpool_read_label(fd, &config, &num_labels);
+	if (error != 0) {
+		(void) close(fd);
+		return;
+	}
+
+	if (num_labels == 0) {
+		(void) close(fd);
+		nvlist_free(config);
+		return;
+	}
+
+	/*
+	 * Check that the vdev is for the expected guid.  Additional entries
+	 * are speculatively added based on the paths stored in the labels.
+	 * Entries with valid paths but incorrect guids must be removed.
+	 */
+	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
+		(void) close(fd);
+		nvlist_free(config);
+		return;
+	}
+
+	(void) close(fd);
+
+	rn->rn_config = config;
+	rn->rn_num_labels = num_labels;
+
+	/*
+	 * Add additional entries for paths described by this label.
+	 */
+	if (rn->rn_labelpaths) {
+		char *path = NULL;
+		char *devid = NULL;
+		rdsk_node_t *slice;
+		avl_index_t where;
+		int error;
+
+		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
+			return;
+
+		/*
+		 * Allow devlinks to stabilize so all paths are available.
+		 */
+		zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);
+
+		if (path != NULL) {
+			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+			slice->rn_name = zfs_strdup(hdl, path);
+			slice->rn_vdev_guid = vdev_guid;
+			slice->rn_avl = rn->rn_avl;
+			slice->rn_hdl = hdl;
+			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
+			slice->rn_labelpaths = B_FALSE;
+			pthread_mutex_lock(rn->rn_lock);
+			if (avl_find(rn->rn_avl, slice, &where)) {
+				pthread_mutex_unlock(rn->rn_lock);
+				free(slice->rn_name);
+				free(slice);
+			} else {
+				avl_insert(rn->rn_avl, slice, where);
+				pthread_mutex_unlock(rn->rn_lock);
+				zpool_open_func(slice);
+			}
+		}
+
+		if (devid != NULL) {
+			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+			error = asprintf(&slice->rn_name, "%s%s",
+			    DEV_BYID_PATH, devid);
+			if (error == -1) {
+				free(slice);
+				return;
+			}
+
+			slice->rn_vdev_guid = vdev_guid;
+			slice->rn_avl = rn->rn_avl;
+			slice->rn_hdl = hdl;
+			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
+			slice->rn_labelpaths = B_FALSE;
+			pthread_mutex_lock(rn->rn_lock);
+			if (avl_find(rn->rn_avl, slice, &where)) {
+				pthread_mutex_unlock(rn->rn_lock);
+				free(slice->rn_name);
+				free(slice);
+			} else {
+				avl_insert(rn->rn_avl, slice, where);
+				pthread_mutex_unlock(rn->rn_lock);
+				zpool_open_func(slice);
+			}
+		}
+	}
+}
+
+static void
+zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *path, const char *name, int order)
+{
+	avl_index_t where;
+	rdsk_node_t *slice;
+
+	slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+	if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
+		free(slice);
+		return;
+	}
+	slice->rn_vdev_guid = 0;
+	slice->rn_lock = lock;
+	slice->rn_avl = cache;
+	slice->rn_hdl = hdl;
+	slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
+	slice->rn_labelpaths = B_FALSE;
+
+	pthread_mutex_lock(lock);
+	if (avl_find(cache, slice, &where)) {
+		free(slice->rn_name);
+		free(slice);
+	} else {
+		avl_insert(cache, slice, where);
+	}
+	pthread_mutex_unlock(lock);
+}
+
+static int
+zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *dir, int order)
+{
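+	/*
+	 * Add every entry in 'dir' to the cache as a candidate slice; the
+	 * labels themselves are read and validated later when the slices
+	 * are dispatched to zpool_open_func().  For example, a scan of a
+	 * hypothetical "-d /dev/disk/by-vdev" directory adds one
+	 * rdsk_node_t per directory entry.
+	 */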
+	int error;
+	char path[MAXPATHLEN];
+	struct dirent64 *dp;
+	DIR *dirp;
+
+	if (realpath(dir, path) == NULL) {
+		error = errno;
+		if (error == ENOENT)
+			return (0);
+
+		zfs_error_aux(hdl, strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
+		return (error);
+	}
+
+	dirp = opendir(path);
+	if (dirp == NULL) {
+		error = errno;
+		zfs_error_aux(hdl, strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
+		return (error);
+	}
+
+	while ((dp = readdir64(dirp)) != NULL) {
+		const char *name = dp->d_name;
+		if (name[0] == '.' &&
+		    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
+			continue;
+
+		zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
+		    order);
+	}
+
+	(void) closedir(dirp);
+	return (0);
+}
+
+static int
+zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t *cache, const char *dir, int order)
+{
+	int error = 0;
+	char path[MAXPATHLEN];
+	char *d, *b;
+	char *dpath, *name;
+
+	/*
+	 * Separate the directory part and last part of the
+	 * path.  We do this so that we can get the realpath of
+	 * the directory.  We don't get the realpath on the
+	 * whole path because if it's a symlink, we want the
+	 * path of the symlink not where it points to.
+	 */
+	d = zfs_strdup(hdl, dir);
+	b = zfs_strdup(hdl, dir);
+	dpath = dirname(d);
+	name = basename(b);
+
+	if (realpath(dpath, path) == NULL) {
+		error = errno;
+		if (error == ENOENT) {
+			error = 0;
+			goto out;
+		}
+
+		zfs_error_aux(hdl, strerror(error));
+		(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+		    TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
+		goto out;
+	}
+
+	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
+
+out:
+	free(b);
+	free(d);
+	return (error);
+}
+
+/*
+ * Scan a list of directories for zfs devices.
+ */
+static int
+zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t **slice_cache, char **dir, int dirs)
+{
+	avl_tree_t *cache;
+	rdsk_node_t *slice;
+	void *cookie;
+	int i, error;
+
+	*slice_cache = NULL;
+	cache = zfs_alloc(hdl, sizeof (avl_tree_t));
+	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
+	    offsetof(rdsk_node_t, rn_node));
+
+	for (i = 0; i < dirs; i++) {
+		struct stat sbuf;
+
+		if (stat(dir[i], &sbuf) != 0) {
+			error = errno;
+			if (error == ENOENT)
+				continue;
+
+			zfs_error_aux(hdl, strerror(error));
+			(void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
+			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
+			goto error;
+		}
+
+		/*
+		 * If dir[i] is a directory, we walk through it and add all
+		 * the entries to the cache.  If it's not a directory, we just
+		 * add it to the cache.
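+		 * For example, a hypothetical "zpool import -d
+		 * /dev/disk/by-id" walks that directory, while
+		 * "zpool import -d /dev/sda" adds the single device node.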
+		 */
+		if (S_ISDIR(sbuf.st_mode)) {
+			if ((error = zpool_find_import_scan_dir(hdl, lock,
+			    cache, dir[i], i)) != 0)
+				goto error;
+		} else {
+			if ((error = zpool_find_import_scan_path(hdl, lock,
+			    cache, dir[i], i)) != 0)
+				goto error;
+		}
+	}
+
+	*slice_cache = cache;
+	return (0);
+
+error:
+	cookie = NULL;
+	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
+		free(slice->rn_name);
+		free(slice);
+	}
+	free(cache);
+
+	return (error);
+}
+
+static char *
+zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
+	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
+	"/dev/mapper",		/* Use multipath devices before components */
+	"/dev/disk/by-partlabel", /* Single unique entry set by user */
+	"/dev/disk/by-partuuid", /* Generated partition uuid */
+	"/dev/disk/by-label",	/* Custom persistent labels */
+	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
+	"/dev/disk/by-id",	/* May be multiple entries and persistent */
+	"/dev/disk/by-path",	/* Encodes physical location and persistent */
+	"/dev"			/* UNSAFE device names will change */
+};
+
+const char * const *
+zpool_default_search_paths(size_t *count)
+{
+	*count = DEFAULT_IMPORT_PATH_SIZE;
+	return ((const char * const *)zpool_default_import_path);
+}
+
+/*
+ * Given a full path to a device determine if that device appears in the
+ * import search path.  If it does, return the first match and store the
+ * index in the passed 'order' variable; otherwise return an error.
+ */
+static int
+zfs_path_order(char *name, int *order)
+{
+	int i = 0, error = ENOENT;
+	char *dir, *env, *envdup;
+
+	env = getenv("ZPOOL_IMPORT_PATH");
+	if (env) {
+		envdup = strdup(env);
+		dir = strtok(envdup, ":");
+		while (dir) {
+			if (strncmp(name, dir, strlen(dir)) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+			dir = strtok(NULL, ":");
+			i++;
+		}
+		free(envdup);
+	} else {
+		for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
+			if (strncmp(name, zpool_default_import_path[i],
+			    strlen(zpool_default_import_path[i])) == 0) {
+				*order = i;
+				error = 0;
+				break;
+			}
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Use libblkid to quickly enumerate all known zfs devices.
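+ *
+ * This is roughly what "blkid -t TYPE=zfs_member" reports from the
+ * command line; each device found is then ranked with zfs_path_order()
+ * using the import search-path ordering established above.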
+ */
+static int
+zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
+    avl_tree_t **slice_cache)
+{
+	rdsk_node_t *slice;
+	blkid_cache cache;
+	blkid_dev_iterate iter;
+	blkid_dev dev;
+	avl_index_t where;
+	int error;
+
+	*slice_cache = NULL;
+
+	error = blkid_get_cache(&cache, NULL);
+	if (error != 0)
+		return (error);
+
+	error = blkid_probe_all_new(cache);
+	if (error != 0) {
+		blkid_put_cache(cache);
+		return (error);
+	}
+
+	iter = blkid_dev_iterate_begin(cache);
+	if (iter == NULL) {
+		blkid_put_cache(cache);
+		return (EINVAL);
+	}
+
+	error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
+	if (error != 0) {
+		blkid_dev_iterate_end(iter);
+		blkid_put_cache(cache);
+		return (error);
+	}
+
+	*slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
+	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
+	    offsetof(rdsk_node_t, rn_node));
+
+	while (blkid_dev_next(iter, &dev) == 0) {
+		slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+		slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
+		slice->rn_vdev_guid = 0;
+		slice->rn_lock = lock;
+		slice->rn_avl = *slice_cache;
+		slice->rn_hdl = hdl;
+		slice->rn_labelpaths = B_TRUE;
+
+		error = zfs_path_order(slice->rn_name, &slice->rn_order);
+		if (error == 0)
+			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
+		else
+			slice->rn_order = IMPORT_ORDER_DEFAULT;
+
+		pthread_mutex_lock(lock);
+		if (avl_find(*slice_cache, slice, &where)) {
+			free(slice->rn_name);
+			free(slice);
+		} else {
+			avl_insert(*slice_cache, slice, where);
+		}
+		pthread_mutex_unlock(lock);
+	}
+
+	blkid_dev_iterate_end(iter);
+	blkid_put_cache(cache);
+
+	return (0);
+}
+
+/*
+ * Given a list of directories to search, find all pools stored on disk.  This
+ * includes partial pools which are not available to import.  If no args are
+ * given (argc is 0), then the default directory (/dev/dsk) is searched.
+ * poolname or guid (but not both) are provided by the caller when trying
+ * to import a specific pool.
+ */
+static nvlist_t *
+zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg)
+{
+	nvlist_t *ret = NULL;
+	pool_list_t pools = { 0 };
+	pool_entry_t *pe, *penext;
+	vdev_entry_t *ve, *venext;
+	config_entry_t *ce, *cenext;
+	name_entry_t *ne, *nenext;
+	pthread_mutex_t lock;
+	avl_tree_t *cache;
+	rdsk_node_t *slice;
+	void *cookie;
+	tpool_t *t;
+
+	verify(iarg->poolname == NULL || iarg->guid == 0);
+	pthread_mutex_init(&lock, NULL);
+
+	/*
+	 * Locate pool member vdevs using libblkid or by directory scanning.
+	 * On success a newly allocated AVL tree which is populated with an
+	 * entry for each discovered vdev will be returned as the cache.
+	 * It's the caller's responsibility to consume and destroy this tree.
+	 */
+	if (iarg->scan || iarg->paths != 0) {
+		int dirs = iarg->paths;
+		char **dir = iarg->path;
+
+		if (dirs == 0) {
+			dir = zpool_default_import_path;
+			dirs = DEFAULT_IMPORT_PATH_SIZE;
+		}
+
+		if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0)
+			return (NULL);
+	} else {
+		if (zpool_find_import_blkid(hdl, &lock, &cache) != 0)
+			return (NULL);
+	}
+
+	/*
+	 * Create a thread pool to parallelize the process of reading and
+	 * validating labels; a large number of threads can be used due to
+	 * minimal contention.
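+	 * For example, on a hypothetical 16-CPU host the pool below is
+	 * created with up to 32 threads, one label read per slice.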
+	 */
+	t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
+	for (slice = avl_first(cache); slice;
+	    (slice = avl_walk(cache, slice, AVL_AFTER)))
+		(void) tpool_dispatch(t, zpool_open_func, slice);
+
+	tpool_wait(t);
+	tpool_destroy(t);
+
+	/*
+	 * Process the cache, filtering out any entries which are not
+	 * for the specified pool, then adding matching label configs.
+	 */
+	cookie = NULL;
+	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
+		if (slice->rn_config != NULL) {
+			nvlist_t *config = slice->rn_config;
+			boolean_t matched = B_TRUE;
+			boolean_t aux = B_FALSE;
+			int fd;
+
+			/*
+			 * Check if it's a spare or l2cache device.  If it is,
+			 * we need to skip the name and guid check since they
+			 * don't exist on aux device labels.
+			 */
+			if (iarg->poolname != NULL || iarg->guid != 0) {
+				uint64_t state;
+				aux = nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
+				    (state == POOL_STATE_SPARE ||
+				    state == POOL_STATE_L2CACHE);
+			}
+
+			if (iarg->poolname != NULL && !aux) {
+				char *pname;
+
+				matched = nvlist_lookup_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
+				    strcmp(iarg->poolname, pname) == 0;
+			} else if (iarg->guid != 0 && !aux) {
+				uint64_t this_guid;
+
+				matched = nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
+				    iarg->guid == this_guid;
+			}
+			if (matched) {
+				/*
+				 * Verify all remaining entries can be opened
+				 * exclusively.  This will prune all underlying
+				 * multipath devices which otherwise could
+				 * result in the vdev appearing as UNAVAIL.
+				 *
+				 * Under zdb, this step isn't required and
+				 * would prevent a zdb -e of active pools with
+				 * no cachefile.
+				 */
+				fd = open(slice->rn_name, O_RDONLY | O_EXCL);
+				if (fd >= 0 || iarg->can_be_active) {
+					if (fd >= 0)
+						close(fd);
+					add_config(hdl, &pools,
+					    slice->rn_name, slice->rn_order,
+					    slice->rn_num_labels, config);
+				}
+			}
+			nvlist_free(config);
+		}
+		free(slice->rn_name);
+		free(slice);
+	}
+	avl_destroy(cache);
+	free(cache);
+	pthread_mutex_destroy(&lock);
+
+	ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
+
+	for (pe = pools.pools; pe != NULL; pe = penext) {
+		penext = pe->pe_next;
+		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
+			venext = ve->ve_next;
+			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
+				cenext = ce->ce_next;
+				nvlist_free(ce->ce_config);
+				free(ce);
+			}
+			free(ve);
+		}
+		free(pe);
+	}
+
+	for (ne = pools.names; ne != NULL; ne = nenext) {
+		nenext = ne->ne_next;
+		free(ne->ne_name);
+		free(ne);
+	}
+
+	return (ret);
+}
+
+/*
+ * Given a cache file, return the contents as a list of importable pools.
+ * poolname or guid (but not both) are provided by the caller when trying
+ * to import a specific pool.
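+ *
+ * The cachefile (typically /etc/zfs/zpool.cache) is a packed nvlist of
+ * pool configs keyed by pool name; each candidate config is refreshed
+ * and checked against any active pools before being returned.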
+ */ +static nvlist_t * +zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, + const char *poolname, uint64_t guid) +{ + char *buf; + int fd; + struct stat64 statbuf; + nvlist_t *raw, *src, *dst; + nvlist_t *pools; + nvpair_t *elem; + char *name; + uint64_t this_guid; + boolean_t active; + + verify(poolname == NULL || guid == 0); + + if ((fd = open(cachefile, O_RDONLY)) < 0) { + zfs_error_aux(hdl, "%s", strerror(errno)); + (void) zfs_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "failed to open cache file")); + return (NULL); + } + + if (fstat64(fd, &statbuf) != 0) { + zfs_error_aux(hdl, "%s", strerror(errno)); + (void) close(fd); + (void) zfs_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "failed to get size of cache file")); + return (NULL); + } + + if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) { + (void) close(fd); + return (NULL); + } + + if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { + (void) close(fd); + free(buf); + (void) zfs_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, + "failed to read cache file contents")); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { + free(buf); + (void) zfs_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, + "invalid or corrupt cache file contents")); + return (NULL); + } + + free(buf); + + /* + * Go through and get the current state of the pools and refresh their + * state. + */ + if (nvlist_alloc(&pools, 0, 0) != 0) { + (void) no_memory(hdl); + nvlist_free(raw); + return (NULL); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { + src = fnvpair_value_nvlist(elem); + + name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); + if (poolname != NULL && strcmp(poolname, name) != 0) + continue; + + this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); + if (guid != 0 && guid != this_guid) + continue; + + if (pool_active(hdl, name, this_guid, &active) != 0) { + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + + if (active) + continue; + + if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, + cachefile) != 0) { + (void) no_memory(hdl); + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + + if ((dst = refresh_config(hdl, src)) == NULL) { + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + + if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { + (void) no_memory(hdl); + nvlist_free(dst); + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + nvlist_free(dst); + } + + nvlist_free(raw); + return (pools); +} + +nvlist_t * +zpool_search_import(void *hdl, importargs_t *import, + const pool_config_ops_t *pco) +{ + libpc_handle_t handle = { 0 }; + nvlist_t *pools = NULL; + + handle.lpc_lib_handle = hdl; + handle.lpc_ops = pco; + handle.lpc_printerr = B_TRUE; + + verify(import->poolname == NULL || import->guid == 0); + + if (import->cachefile != NULL) + pools = zpool_find_import_cached(&handle, import->cachefile, + import->poolname, import->guid); + else + pools = zpool_find_import_impl(&handle, import); + + if ((pools == NULL || nvlist_empty(pools)) && + handle.lpc_open_access_error && geteuid() != 0) { + (void) zfs_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN, + "no pools found")); + } + + return (pools); +} + +static boolean_t +pool_match(nvlist_t *cfg, char *tgt) +{ + uint64_t v, guid = strtoull(tgt, NULL, 0); + char *s; + + if (guid != 0) { + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) + return (v == guid); + } else { + if 
(nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) + return (strcmp(s, tgt) == 0); + } + return (B_FALSE); +} + +int +zpool_find_config(void *hdl, const char *target, nvlist_t **configp, + importargs_t *args, const pool_config_ops_t *pco) +{ + nvlist_t *pools; + nvlist_t *match = NULL; + nvlist_t *config = NULL; + char *name = NULL, *sepp = NULL; + char sep = '\0'; + int count = 0; + char *targetdup = strdup(target); + + *configp = NULL; + + if ((sepp = strpbrk(targetdup, "/@")) != NULL) { + sep = *sepp; + *sepp = '\0'; + } + + pools = zpool_search_import(hdl, args, pco); + + if (pools != NULL) { + nvpair_t *elem = NULL; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + VERIFY0(nvpair_value_nvlist(elem, &config)); + if (pool_match(config, targetdup)) { + count++; + if (match != NULL) { + /* multiple matches found */ + continue; + } else { + match = config; + name = nvpair_name(elem); + } + } + } + } + + if (count == 0) { + free(targetdup); + return (ENOENT); + } + + if (count > 1) { + free(targetdup); + return (EINVAL); + } + + *configp = match; + free(targetdup); + + return (0); +} diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c new file mode 100644 index 000000000..9a81011fc --- /dev/null +++ b/lib/libzutil/zutil_nicenum.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <math.h> +#include <stdio.h> +#include <libzutil.h> + +/* + * Convert a number to an appropriately human-readable output. 
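+ *
+ * A few illustrative conversions, derived from the rules implemented
+ * below:
+ *
+ *   zfs_nicenum_format(10239, buf, 6, ZFS_NICENUM_1024)  -> "10.0K"
+ *   zfs_nicenum_format(1024, buf, 6, ZFS_NICENUM_BYTES)  -> "1K"
+ *   zfs_nicenum_format(0, buf, 6, ZFS_NICENUM_TIME)      -> "-"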
+ */ +void +zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, + enum zfs_nicenum_format format) +{ + uint64_t n = num; + int index = 0; + const char *u; + const char *units[3][7] = { + [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"}, + [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"}, + [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"} + }; + + const int units_len[] = {[ZFS_NICENUM_1024] = 6, + [ZFS_NICENUM_BYTES] = 6, + [ZFS_NICENUM_TIME] = 4}; + + const int k_unit[] = { [ZFS_NICENUM_1024] = 1024, + [ZFS_NICENUM_BYTES] = 1024, + [ZFS_NICENUM_TIME] = 1000}; + + double val; + + if (format == ZFS_NICENUM_RAW) { + snprintf(buf, buflen, "%llu", (u_longlong_t)num); + return; + } else if (format == ZFS_NICENUM_RAWTIME && num > 0) { + snprintf(buf, buflen, "%llu", (u_longlong_t)num); + return; + } else if (format == ZFS_NICENUM_RAWTIME && num == 0) { + snprintf(buf, buflen, "%s", "-"); + return; + } + + while (n >= k_unit[format] && index < units_len[format]) { + n /= k_unit[format]; + index++; + } + + u = units[format][index]; + + /* Don't print zero latencies since they're invalid */ + if ((format == ZFS_NICENUM_TIME) && (num == 0)) { + (void) snprintf(buf, buflen, "-"); + } else if ((index == 0) || ((num % + (uint64_t)powl(k_unit[format], index)) == 0)) { + /* + * If this is an even multiple of the base, always display + * without any decimal precision. + */ + (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u); + + } else { + /* + * We want to choose a precision that reflects the best choice + * for fitting in 5 characters. This can get rather tricky when + * we have numbers that are very close to an order of magnitude. + * For example, when displaying 10239 (which is really 9.999K), + * we want only a single place of precision for 10.0K. We could + * develop some complex heuristics for this, but it's much + * easier just to try each combination in turn. + */ + int i; + for (i = 2; i >= 0; i--) { + val = (double)num / + (uint64_t)powl(k_unit[format], index); + + /* + * Don't print floating point values for time. Note, + * we use floor() instead of round() here, since + * round can result in undesirable results. For + * example, if "num" is in the range of + * 999500-999999, it will print out "1000us". This + * doesn't happen if we use floor(). + */ + if (format == ZFS_NICENUM_TIME) { + if (snprintf(buf, buflen, "%d%s", + (unsigned int) floor(val), u) <= 5) + break; + + } else { + if (snprintf(buf, buflen, "%.*f%s", i, + val, u) <= 5) + break; + } + } + } +} + +/* + * Convert a number to an appropriately human-readable output. + */ +void +zfs_nicenum(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024); +} + +/* + * Convert a time to an appropriately human-readable output. + * @num: Time in nanoseconds + */ +void +zfs_nicetime(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME); +} + +/* + * Print out a raw number with correct column spacing + */ +void +zfs_niceraw(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW); +} + +/* + * Convert a number of bytes to an appropriately human-readable output. 
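+ *
+ * For example, zfs_nicebytes(1536, buf, sizeof (buf)) yields "1.50K",
+ * while zfs_nicebytes(512, buf, sizeof (buf)) yields "512B".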
+ */ +void +zfs_nicebytes(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_BYTES); +} diff --git a/lib/libzutil/zutil_pool.c b/lib/libzutil/zutil_pool.c new file mode 100644 index 000000000..734650f3c --- /dev/null +++ b/lib/libzutil/zutil_pool.c @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/nvpair.h> +#include <sys/fs/zfs.h> + +#include <libzutil.h> + +static void +dump_ddt_stat(const ddt_stat_t *dds, int h) +{ + char refcnt[6]; + char blocks[6], lsize[6], psize[6], dsize[6]; + char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6]; + + if (dds == NULL || dds->dds_blocks == 0) + return; + + if (h == -1) + (void) strcpy(refcnt, "Total"); + else + zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt)); + + zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks)); + zfs_nicebytes(dds->dds_lsize, lsize, sizeof (lsize)); + zfs_nicebytes(dds->dds_psize, psize, sizeof (psize)); + zfs_nicebytes(dds->dds_dsize, dsize, sizeof (dsize)); + zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks)); + zfs_nicebytes(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize)); + zfs_nicebytes(dds->dds_ref_psize, ref_psize, sizeof (ref_psize)); + zfs_nicebytes(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize)); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + refcnt, + blocks, lsize, psize, dsize, + ref_blocks, ref_lsize, ref_psize, ref_dsize); +} + +/* + * Print the DDT histogram and the column totals. + */ +void +zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) +{ + int h; + + (void) printf("\n"); + + (void) printf("bucket " + " allocated " + " referenced \n"); + (void) printf("______ " + "______________________________ " + "______________________________\n"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "refcnt", + "blocks", "LSIZE", "PSIZE", "DSIZE", + "blocks", "LSIZE", "PSIZE", "DSIZE"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "------", + "------", "-----", "-----", "-----", + "------", "-----", "-----", "-----"); + + for (h = 0; h < 64; h++) + dump_ddt_stat(&ddh->ddh_stat[h], h); + + dump_ddt_stat(dds_total, -1); + + (void) printf("\n"); +} + +/* + * Process the buffer of nvlists, unpacking and storing each nvlist record + * into 'records'. 'leftover' is set to the number of bytes that weren't + * processed as there wasn't a complete record. 
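+ *
+ * Each record in 'buf' is framed as an 8-byte little-endian length
+ * followed by that many bytes of packed nvlist.  For example, a 48-byte
+ * record is stored as the bytes 0x30 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ * followed by the 48 packed bytes.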
+ */ +int +zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, + nvlist_t ***records, uint_t *numrecords) +{ + uint64_t reclen; + nvlist_t *nv; + int i; + void *tmp; + + while (bytes_read > sizeof (reclen)) { + + /* get length of packed record (stored as little endian) */ + for (i = 0, reclen = 0; i < sizeof (reclen); i++) + reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i); + + if (bytes_read < sizeof (reclen) + reclen) + break; + + /* unpack record */ + if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0) + return (ENOMEM); + bytes_read -= sizeof (reclen) + reclen; + buf += sizeof (reclen) + reclen; + + /* add record to nvlist array */ + (*numrecords)++; + if (ISP2(*numrecords + 1)) { + tmp = realloc(*records, + *numrecords * 2 * sizeof (nvlist_t *)); + if (tmp == NULL) { + nvlist_free(nv); + (*numrecords)--; + return (ENOMEM); + } + *records = tmp; + } + (*records)[*numrecords - 1] = nv; + } + + *leftover = bytes_read; + return (0); +} diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 87d32a115..7a7401ecd 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -46,6 +46,7 @@ #include <sys/zfs_ioctl.h> #include <sys/vdev_impl.h> #include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> #include <sys/fs/zfs.h> uint32_t zio_injection_enabled = 0; @@ -659,6 +660,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static int +zio_calculate_range(const char *pool, zinject_record_t *record) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + objset_t *os = NULL; + dnode_t *dn = NULL; + int error; + + /* + * Obtain the dnode for object using pool, objset, and object + */ + error = dsl_pool_hold(pool, FTAG, &dp); + if (error) + return (error); + + error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds); + dsl_pool_rele(dp, FTAG); + if (error) + return (error); + + error = dmu_objset_from_ds(ds, &os); + dsl_dataset_rele(ds, FTAG); + if (error) + return (error); + + error = dnode_hold(os, record->zi_object, FTAG, &dn); + if (error) + return (error); + + /* + * Translate the range into block IDs + */ + if (record->zi_start != 0 || record->zi_end != -1ULL) { + record->zi_start >>= dn->dn_datablkshift; + record->zi_end >>= dn->dn_datablkshift; + } + if (record->zi_level > 0) { + if (record->zi_level >= dn->dn_nlevels) { + dnode_rele(dn, FTAG); + return (SET_ERROR(EDOM)); + } + + if (record->zi_start != 0 || record->zi_end != 0) { + int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + for (int level = record->zi_level; level > 0; level--) { + record->zi_start >>= shift; + record->zi_end >>= shift; + } + } + } + + dnode_rele(dn, FTAG); + return (0); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -698,6 +756,15 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) return (SET_ERROR(EINVAL)); } + /* + * If the supplied range was in bytes -- calculate the actual blkid + */ + if (flags & ZINJECT_CALC_RANGE) { + error = zio_calculate_range(name, record); + if (error != 0) + return (error); + } + if (!(flags & ZINJECT_NULL)) { /* * spa_inject_ref() will add an injection reference, which will -- cgit v1.2.3