summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/sys/zfs_context.h2
-rw-r--r--lib/libzfs/libzfs_import.c352
-rw-r--r--lib/libzpool/kernel.c4
3 files changed, 294 insertions, 64 deletions
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 6ff4d4302..cc626fdaa 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -652,6 +652,8 @@ extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
extern void kernel_init(int);
extern void kernel_fini(void);
+extern void thread_init(void);
+extern void thread_fini(void);
struct spa;
extern void nicenum(uint64_t num, char *buf);
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 5dc1482d7..5da42cb5e 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -933,6 +933,242 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
return (0);
}
+typedef struct rdsk_node {
+ char *rn_name;
+ int rn_num_labels;
+ int rn_dfd;
+ libzfs_handle_t *rn_hdl;
+ nvlist_t *rn_config;
+ avl_tree_t *rn_avl;
+ avl_node_t rn_node;
+ boolean_t rn_nozpool;
+} rdsk_node_t;
+
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+ const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+ const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+ char *nm1slice, *nm2slice;
+ int rv;
+
+ /*
+ * partitions one and three (slices zero and two) are the most
+ * likely to provide results, so put those first
+ */
+ nm1slice = strstr(nm1, "part1");
+ nm2slice = strstr(nm2, "part1");
+ if (nm1slice && !nm2slice) {
+ return (-1);
+ }
+ if (!nm1slice && nm2slice) {
+ return (1);
+ }
+ nm1slice = strstr(nm1, "part3");
+ nm2slice = strstr(nm2, "part3");
+ if (nm1slice && !nm2slice) {
+ return (-1);
+ }
+ if (!nm1slice && nm2slice) {
+ return (1);
+ }
+
+ rv = strcmp(nm1, nm2);
+ if (rv == 0)
+ return (0);
+ return (rv > 0 ? 1 : -1);
+}
+
+#ifndef __linux__
+static void
+check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
+ diskaddr_t size, uint_t blksz)
+{
+ rdsk_node_t tmpnode;
+ rdsk_node_t *node;
+ char sname[MAXNAMELEN];
+
+ tmpnode.rn_name = &sname[0];
+ (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
+ diskname, partno);
+ /* too small to contain a zpool? */
+ if ((size < (SPA_MINDEVSIZE / blksz)) &&
+ (node = avl_find(r, &tmpnode, NULL)))
+ node->rn_nozpool = B_TRUE;
+}
+#endif
+
+static void
+nozpool_all_slices(avl_tree_t *r, const char *sname)
+{
+#ifndef __linux__
+ char diskname[MAXNAMELEN];
+ char *ptr;
+ int i;
+
+ (void) strncpy(diskname, sname, MAXNAMELEN);
+ if (((ptr = strrchr(diskname, 's')) == NULL) &&
+ ((ptr = strrchr(diskname, 'p')) == NULL))
+ return;
+ ptr[0] = 's';
+ ptr[1] = '\0';
+ for (i = 0; i < NDKMAP; i++)
+ check_one_slice(r, diskname, i, 0, 1);
+ ptr[0] = 'p';
+ for (i = 0; i <= FD_NUMPART; i++)
+ check_one_slice(r, diskname, i, 0, 1);
+#endif
+}
+
+static void
+check_slices(avl_tree_t *r, int fd, const char *sname)
+{
+#ifndef __linux__
+ struct extvtoc vtoc;
+ struct dk_gpt *gpt;
+ char diskname[MAXNAMELEN];
+ char *ptr;
+ int i;
+
+ (void) strncpy(diskname, sname, MAXNAMELEN);
+ if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
+ return;
+ ptr[1] = '\0';
+
+ if (read_extvtoc(fd, &vtoc) >= 0) {
+ for (i = 0; i < NDKMAP; i++)
+ check_one_slice(r, diskname, i,
+ vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+ } else if (efi_alloc_and_read(fd, &gpt) >= 0) {
+ /*
+ * on x86 we'll still have leftover links that point
+ * to slices s[9-15], so use NDKMAP instead
+ */
+ for (i = 0; i < NDKMAP; i++)
+ check_one_slice(r, diskname, i,
+ gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+ /* nodes p[1-4] are never used with EFI labels */
+ ptr[0] = 'p';
+ for (i = 1; i <= FD_NUMPART; i++)
+ check_one_slice(r, diskname, i, 0, 1);
+ efi_free(gpt);
+ }
+#endif
+}
+
+static void
+zpool_open_func(void *arg)
+{
+ rdsk_node_t *rn = arg;
+ struct stat64 statbuf;
+ nvlist_t *config;
+ int num_labels;
+ int fd;
+
+ if (rn->rn_nozpool)
+ return;
+#ifdef __linux__
+ /*
+ * Skip devices with well known prefixes there can be side effects
+ * when opening devices which need to be avoided.
+ *
+ * core - Symlink to /proc/kcore
+ * fd* - Floppy interface.
+ * fuse - Fuse control device.
+ * hpet - High Precision Event Timer
+ * lp* - Printer interface.
+ * parport* - Parallel port interface.
+ * ppp - Generic PPP driver.
+ * random - Random device
+ * rtc - Real Time Clock
+ * tty* - Generic serial interface.
+ * urandom - Random device.
+ * usbmon* - USB IO monitor.
+ * vcs* - Virtual console memory.
+ * watchdog - Watchdog must be closed in a special way.
+ */
+ if ((strncmp(rn->rn_name, "core", 4) == 0) ||
+ (strncmp(rn->rn_name, "fd", 2) == 0) ||
+ (strncmp(rn->rn_name, "fuse", 4) == 0) ||
+ (strncmp(rn->rn_name, "hpet", 4) == 0) ||
+ (strncmp(rn->rn_name, "lp", 2) == 0) ||
+ (strncmp(rn->rn_name, "parport", 7) == 0) ||
+ (strncmp(rn->rn_name, "ppp", 3) == 0) ||
+ (strncmp(rn->rn_name, "random", 6) == 0) ||
+ (strncmp(rn->rn_name, "rtc", 3) == 0) ||
+ (strncmp(rn->rn_name, "tty", 3) == 0) ||
+ (strncmp(rn->rn_name, "urandom", 7) == 0) ||
+ (strncmp(rn->rn_name, "usbmon", 6) == 0) ||
+ (strncmp(rn->rn_name, "vcs", 3) == 0) ||
+ (strncmp(rn->rn_name, "watchdog", 8) == 0))
+ return;
+
+ /*
+ * Ignore failed stats. We only want regular files and block devices.
+ */
+ if (fstatat64(rn->rn_dfd, rn->rn_name, &statbuf, 0) != 0 ||
+ (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
+ return;
+
+ if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+ /* symlink to a device that's no longer there */
+ if (errno == ENOENT)
+ nozpool_all_slices(rn->rn_avl, rn->rn_name);
+ return;
+ }
+#else
+ if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+ /* symlink to a device that's no longer there */
+ if (errno == ENOENT)
+ nozpool_all_slices(rn->rn_avl, rn->rn_name);
+ return;
+ }
+ /*
+ * Ignore failed stats. We only want regular
+ * files, character devs and block devs.
+ */
+ if (fstat64(fd, &statbuf) != 0 ||
+ (!S_ISREG(statbuf.st_mode) &&
+ !S_ISCHR(statbuf.st_mode) &&
+ !S_ISBLK(statbuf.st_mode))) {
+ (void) close(fd);
+ return;
+ }
+#endif
+ /* this file is too small to hold a zpool */
+ if (S_ISREG(statbuf.st_mode) &&
+ statbuf.st_size < SPA_MINDEVSIZE) {
+ (void) close(fd);
+ return;
+ } else if (!S_ISREG(statbuf.st_mode)) {
+ /*
+ * Try to read the disk label first so we don't have to
+ * open a bunch of minor nodes that can't have a zpool.
+ */
+ check_slices(rn->rn_avl, fd, rn->rn_name);
+ }
+
+ if ((zpool_read_label(fd, &config, &num_labels)) != 0) {
+ (void) close(fd);
+ (void) no_memory(rn->rn_hdl);
+ return;
+ }
+
+ if (num_labels == 0) {
+ (void) close(fd);
+ nvlist_free(config);
+ return;
+ }
+
+ (void) close(fd);
+
+ rn->rn_config = config;
+ rn->rn_num_labels = num_labels;
+ if (config != NULL) {
+ assert(rn->rn_nozpool == B_FALSE);
+ }
+}
+
/*
* Given a file descriptor, clear (zero) the label information. This function
* is used in the appliance stack as part of the ZFS sysevent module and
@@ -1058,20 +1294,21 @@ zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
static nvlist_t *
zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
{
- int i, num_labels, dirs = iarg->paths;
+ int i, dirs = iarg->paths;
DIR *dirp = NULL;
struct dirent64 *dp;
char path[MAXPATHLEN];
char *end, **dir = iarg->path;
size_t pathleft;
- struct stat64 statbuf;
- nvlist_t *ret = NULL, *config;
- int fd;
+ nvlist_t *ret = NULL;
pool_list_t pools = { 0 };
pool_entry_t *pe, *penext;
vdev_entry_t *ve, *venext;
config_entry_t *ce, *cenext;
name_entry_t *ne, *nenext;
+ avl_tree_t slice_cache;
+ rdsk_node_t *slice;
+ void *cookie;
verify(iarg->poolname == NULL || iarg->guid == 0);
@@ -1096,6 +1333,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
* and toplevel GUID.
*/
for (i = 0; i < dirs; i++) {
+ taskq_t *t;
char *rdsk;
int dfd;
@@ -1135,6 +1373,9 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
goto error;
}
+ avl_create(&slice_cache, slice_cache_compare,
+ sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
+
/*
* This is not MT-safe, but we have no MT consumers of libzfs
*/
@@ -1144,65 +1385,49 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
(name[1] == 0 || (name[1] == '.' && name[2] == 0)))
continue;
- /*
- * Skip checking devices with well known prefixes:
- * watchdog - A special close is required to avoid
- * triggering it and resetting the system.
- * fuse - Fuse control device.
- * ppp - Generic PPP driver.
- * tty* - Generic serial interface.
- * vcs* - Virtual console memory.
- * parport* - Parallel port interface.
- * lp* - Printer interface.
- * fd* - Floppy interface.
- * hpet - High Precision Event Timer, crashes qemu
- * when accessed from a virtual machine.
- * core - Symlink to /proc/kcore, causes a crash
- * when access from Xen dom0.
- */
- if ((strncmp(name, "watchdog", 8) == 0) ||
- (strncmp(name, "fuse", 4) == 0) ||
- (strncmp(name, "ppp", 3) == 0) ||
- (strncmp(name, "tty", 3) == 0) ||
- (strncmp(name, "vcs", 3) == 0) ||
- (strncmp(name, "parport", 7) == 0) ||
- (strncmp(name, "lp", 2) == 0) ||
- (strncmp(name, "fd", 2) == 0) ||
- (strncmp(name, "hpet", 4) == 0) ||
- (strncmp(name, "core", 4) == 0))
- continue;
-
- /*
- * Ignore failed stats. We only want regular
- * files and block devices.
- */
- if ((fstatat64(dfd, name, &statbuf, 0) != 0) ||
- (!S_ISREG(statbuf.st_mode) &&
- !S_ISBLK(statbuf.st_mode)))
- continue;
-
- if ((fd = openat64(dfd, name, O_RDONLY)) < 0)
- continue;
-
- if ((zpool_read_label(fd, &config, &num_labels))) {
- (void) close(fd);
- (void) no_memory(hdl);
- goto error;
- }
-
- (void) close(fd);
-
- if (config != NULL) {
+ slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+ slice->rn_name = zfs_strdup(hdl, name);
+ slice->rn_avl = &slice_cache;
+ slice->rn_dfd = dfd;
+ slice->rn_hdl = hdl;
+ slice->rn_nozpool = B_FALSE;
+ avl_add(&slice_cache, slice);
+ }
+ /*
+ * create a thread pool to do all of this in parallel;
+ * rn_nozpool is not protected, so this is racy in that
+ * multiple tasks could decide that the same slice can
+ * not hold a zpool, which is benign. Also choose
+ * double the number of processors; we hold a lot of
+ * locks in the kernel, so going beyond this doesn't
+ * buy us much.
+ */
+ thread_init();
+ t = taskq_create("z_import", 2 * boot_ncpus, defclsyspri,
+ 2 * boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ for (slice = avl_first(&slice_cache); slice;
+ (slice = avl_walk(&slice_cache, slice,
+ AVL_AFTER)))
+ (void) taskq_dispatch(t, zpool_open_func, slice,
+ TQ_SLEEP);
+ taskq_wait(t);
+ taskq_destroy(t);
+ thread_fini();
+
+ cookie = NULL;
+ while ((slice = avl_destroy_nodes(&slice_cache,
+ &cookie)) != NULL) {
+ if (slice->rn_config != NULL) {
+ nvlist_t *config = slice->rn_config;
boolean_t matched = B_TRUE;
- char *pname;
-
- if ((iarg->poolname != NULL) &&
- (nvlist_lookup_string(config,
- ZPOOL_CONFIG_POOL_NAME, &pname) == 0)) {
- if (strcmp(iarg->poolname, pname))
- matched = B_FALSE;
+ if (iarg->poolname != NULL) {
+ char *pname;
+ matched = nvlist_lookup_string(config,
+ ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0 &&
+ strcmp(iarg->poolname, pname) == 0;
} else if (iarg->guid != 0) {
uint64_t this_guid;
@@ -1217,12 +1442,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
continue;
}
/* use the non-raw path for the config */
- (void) strlcpy(end, name, pathleft);
+ (void) strlcpy(end, slice->rn_name, pathleft);
if (add_config(hdl, &pools, path, i+1,
- num_labels, config))
+ slice->rn_num_labels, config))
goto error;
}
+ free(slice->rn_name);
+ free(slice);
}
+ avl_destroy(&slice_cache);
(void) closedir(dirp);
dirp = NULL;
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 5d26f7ca8..6ed08bdb0 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -68,7 +68,7 @@ pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_key_t kthread_key;
int kthread_nr = 0;
-static void
+void
thread_init(void)
{
kthread_t *kt;
@@ -87,7 +87,7 @@ thread_init(void)
kthread_nr = 1;
}
-static void
+void
thread_fini(void)
{
kthread_t *kt = curthread;