diff options
-rw-r--r-- | include/sys/zfs_context.h | 2 | ||||
-rw-r--r-- | lib/libzfs/libzfs_import.c | 352 | ||||
-rw-r--r-- | lib/libzpool/kernel.c | 4 |
3 files changed, 294 insertions, 64 deletions
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 6ff4d4302..cc626fdaa 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -652,6 +652,8 @@ extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); extern void kernel_init(int); extern void kernel_fini(void); +extern void thread_init(void); +extern void thread_fini(void); struct spa; extern void nicenum(uint64_t num, char *buf); diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 5dc1482d7..5da42cb5e 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -933,6 +933,242 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) return (0); } +typedef struct rdsk_node { + char *rn_name; + int rn_num_labels; + int rn_dfd; + libzfs_handle_t *rn_hdl; + nvlist_t *rn_config; + avl_tree_t *rn_avl; + avl_node_t rn_node; + boolean_t rn_nozpool; +} rdsk_node_t; + +static int +slice_cache_compare(const void *arg1, const void *arg2) +{ + const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; + const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; + char *nm1slice, *nm2slice; + int rv; + + /* + * partitions one and three (slices zero and two) are the most + * likely to provide results, so put those first + */ + nm1slice = strstr(nm1, "part1"); + nm2slice = strstr(nm2, "part1"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + nm1slice = strstr(nm1, "part3"); + nm2slice = strstr(nm2, "part3"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + + rv = strcmp(nm1, nm2); + if (rv == 0) + return (0); + return (rv > 0 ? 1 : -1); +} + +#ifndef __linux__ +static void +check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, + diskaddr_t size, uint_t blksz) +{ + rdsk_node_t tmpnode; + rdsk_node_t *node; + char sname[MAXNAMELEN]; + + tmpnode.rn_name = &sname[0]; + (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", + diskname, partno); + /* too small to contain a zpool? */ + if ((size < (SPA_MINDEVSIZE / blksz)) && + (node = avl_find(r, &tmpnode, NULL))) + node->rn_nozpool = B_TRUE; +} +#endif + +static void +nozpool_all_slices(avl_tree_t *r, const char *sname) +{ +#ifndef __linux__ + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if (((ptr = strrchr(diskname, 's')) == NULL) && + ((ptr = strrchr(diskname, 'p')) == NULL)) + return; + ptr[0] = 's'; + ptr[1] = '\0'; + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, 0, 1); + ptr[0] = 'p'; + for (i = 0; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); +#endif +} + +static void +check_slices(avl_tree_t *r, int fd, const char *sname) +{ +#ifndef __linux__ + struct extvtoc vtoc; + struct dk_gpt *gpt; + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) + return; + ptr[1] = '\0'; + + if (read_extvtoc(fd, &vtoc) >= 0) { + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + vtoc.v_part[i].p_size, vtoc.v_sectorsz); + } else if (efi_alloc_and_read(fd, &gpt) >= 0) { + /* + * on x86 we'll still have leftover links that point + * to slices s[9-15], so use NDKMAP instead + */ + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + gpt->efi_parts[i].p_size, gpt->efi_lbasize); + /* nodes p[1-4] are never used with EFI labels */ + ptr[0] = 'p'; + for (i = 1; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); + efi_free(gpt); + } +#endif +} + +static void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + int num_labels; + int fd; + + if (rn->rn_nozpool) + return; +#ifdef __linux__ + /* + * Skip devices with well known prefixes there can be side effects + * when opening devices which need to be avoided. + * + * core - Symlink to /proc/kcore + * fd* - Floppy interface. + * fuse - Fuse control device. + * hpet - High Precision Event Timer + * lp* - Printer interface. + * parport* - Parallel port interface. + * ppp - Generic PPP driver. + * random - Random device + * rtc - Real Time Clock + * tty* - Generic serial interface. + * urandom - Random device. + * usbmon* - USB IO monitor. + * vcs* - Virtual console memory. + * watchdog - Watchdog must be closed in a special way. + */ + if ((strncmp(rn->rn_name, "core", 4) == 0) || + (strncmp(rn->rn_name, "fd", 2) == 0) || + (strncmp(rn->rn_name, "fuse", 4) == 0) || + (strncmp(rn->rn_name, "hpet", 4) == 0) || + (strncmp(rn->rn_name, "lp", 2) == 0) || + (strncmp(rn->rn_name, "parport", 7) == 0) || + (strncmp(rn->rn_name, "ppp", 3) == 0) || + (strncmp(rn->rn_name, "random", 6) == 0) || + (strncmp(rn->rn_name, "rtc", 3) == 0) || + (strncmp(rn->rn_name, "tty", 3) == 0) || + (strncmp(rn->rn_name, "urandom", 7) == 0) || + (strncmp(rn->rn_name, "usbmon", 6) == 0) || + (strncmp(rn->rn_name, "vcs", 3) == 0) || + (strncmp(rn->rn_name, "watchdog", 8) == 0)) + return; + + /* + * Ignore failed stats. We only want regular files and block devices. + */ + if (fstatat64(rn->rn_dfd, rn->rn_name, &statbuf, 0) != 0 || + (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) + return; + + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } +#else + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } + /* + * Ignore failed stats. We only want regular + * files, character devs and block devs. + */ + if (fstat64(fd, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && + !S_ISCHR(statbuf.st_mode) && + !S_ISBLK(statbuf.st_mode))) { + (void) close(fd); + return; + } +#endif + /* this file is too small to hold a zpool */ + if (S_ISREG(statbuf.st_mode) && + statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } else if (!S_ISREG(statbuf.st_mode)) { + /* + * Try to read the disk label first so we don't have to + * open a bunch of minor nodes that can't have a zpool. + */ + check_slices(rn->rn_avl, fd, rn->rn_name); + } + + if ((zpool_read_label(fd, &config, &num_labels)) != 0) { + (void) close(fd); + (void) no_memory(rn->rn_hdl); + return; + } + + if (num_labels == 0) { + (void) close(fd); + nvlist_free(config); + return; + } + + (void) close(fd); + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + if (config != NULL) { + assert(rn->rn_nozpool == B_FALSE); + } +} + /* * Given a file descriptor, clear (zero) the label information. This function * is used in the appliance stack as part of the ZFS sysevent module and @@ -1058,20 +1294,21 @@ zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { static nvlist_t * zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) { - int i, num_labels, dirs = iarg->paths; + int i, dirs = iarg->paths; DIR *dirp = NULL; struct dirent64 *dp; char path[MAXPATHLEN]; char *end, **dir = iarg->path; size_t pathleft; - struct stat64 statbuf; - nvlist_t *ret = NULL, *config; - int fd; + nvlist_t *ret = NULL; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; + avl_tree_t slice_cache; + rdsk_node_t *slice; + void *cookie; verify(iarg->poolname == NULL || iarg->guid == 0); @@ -1096,6 +1333,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) * and toplevel GUID. */ for (i = 0; i < dirs; i++) { + taskq_t *t; char *rdsk; int dfd; @@ -1135,6 +1373,9 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) goto error; } + avl_create(&slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); + /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -1144,65 +1385,49 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; - /* - * Skip checking devices with well known prefixes: - * watchdog - A special close is required to avoid - * triggering it and resetting the system. - * fuse - Fuse control device. - * ppp - Generic PPP driver. - * tty* - Generic serial interface. - * vcs* - Virtual console memory. - * parport* - Parallel port interface. - * lp* - Printer interface. - * fd* - Floppy interface. - * hpet - High Precision Event Timer, crashes qemu - * when accessed from a virtual machine. - * core - Symlink to /proc/kcore, causes a crash - * when access from Xen dom0. - */ - if ((strncmp(name, "watchdog", 8) == 0) || - (strncmp(name, "fuse", 4) == 0) || - (strncmp(name, "ppp", 3) == 0) || - (strncmp(name, "tty", 3) == 0) || - (strncmp(name, "vcs", 3) == 0) || - (strncmp(name, "parport", 7) == 0) || - (strncmp(name, "lp", 2) == 0) || - (strncmp(name, "fd", 2) == 0) || - (strncmp(name, "hpet", 4) == 0) || - (strncmp(name, "core", 4) == 0)) - continue; - - /* - * Ignore failed stats. We only want regular - * files and block devices. - */ - if ((fstatat64(dfd, name, &statbuf, 0) != 0) || - (!S_ISREG(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) - continue; - - if ((fd = openat64(dfd, name, O_RDONLY)) < 0) - continue; - - if ((zpool_read_label(fd, &config, &num_labels))) { - (void) close(fd); - (void) no_memory(hdl); - goto error; - } - - (void) close(fd); - - if (config != NULL) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_strdup(hdl, name); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } + /* + * create a thread pool to do all of this in parallel; + * rn_nozpool is not protected, so this is racy in that + * multiple tasks could decide that the same slice can + * not hold a zpool, which is benign. Also choose + * double the number of processors; we hold a lot of + * locks in the kernel, so going beyond this doesn't + * buy us much. + */ + thread_init(); + t = taskq_create("z_import", 2 * boot_ncpus, defclsyspri, + 2 * boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); + for (slice = avl_first(&slice_cache); slice; + (slice = avl_walk(&slice_cache, slice, + AVL_AFTER))) + (void) taskq_dispatch(t, zpool_open_func, slice, + TQ_SLEEP); + taskq_wait(t); + taskq_destroy(t); + thread_fini(); + + cookie = NULL; + while ((slice = avl_destroy_nodes(&slice_cache, + &cookie)) != NULL) { + if (slice->rn_config != NULL) { + nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; - char *pname; - - if ((iarg->poolname != NULL) && - (nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &pname) == 0)) { - if (strcmp(iarg->poolname, pname)) - matched = B_FALSE; + if (iarg->poolname != NULL) { + char *pname; + matched = nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && + strcmp(iarg->poolname, pname) == 0; } else if (iarg->guid != 0) { uint64_t this_guid; @@ -1217,12 +1442,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) continue; } /* use the non-raw path for the config */ - (void) strlcpy(end, name, pathleft); + (void) strlcpy(end, slice->rn_name, pathleft); if (add_config(hdl, &pools, path, i+1, - num_labels, config)) + slice->rn_num_labels, config)) goto error; } + free(slice->rn_name); + free(slice); } + avl_destroy(&slice_cache); (void) closedir(dirp); dirp = NULL; diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 5d26f7ca8..6ed08bdb0 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -68,7 +68,7 @@ pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER; pthread_key_t kthread_key; int kthread_nr = 0; -static void +void thread_init(void) { kthread_t *kt; @@ -87,7 +87,7 @@ thread_init(void) kthread_nr = 1; } -static void +void thread_fini(void) { kthread_t *kt = curthread; |