diff options
author | Brian Behlendorf <[email protected]> | 2016-01-20 15:58:29 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-01-22 09:39:46 -0800 |
commit | 519129ff4e717f3026ae70b58d2db59d937aef06 (patch) | |
tree | c4c29be71b19583ff2c76bc8363103ba0087e4c2 /lib | |
parent | 19d55079aecb5c022c1c09e0eace4f7da7381a62 (diff) |
Illumos 6815179, 6844191
6815179 zpool import with a large number of LUNs is too slow
6844191 zpool import, scanning of disks should be multi-threaded
References:
https://github.com/illumos/illumos-gate/commit/4f67d75
Porting notes:
- This change was originally never ported to Linux due to it
dependence on the thread pool interface. This patch solves
that issue by switching the code to use the existing taskq
implementation which provides the same basic functionality.
However, in order for this to work properly thread_init()
and thread_fini() must be called around to taskq consumer
to perform the needed thread initialization.
- The check_one_slice, nozpool_all_slices, and check_slices
functions have been disabled for Linux. They are difficult,
but possible, to implement for Linux due to how partitions
are get names. Since this is only an optimization this code
can be added at a latter date.
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libzfs/libzfs_import.c | 352 | ||||
-rw-r--r-- | lib/libzpool/kernel.c | 4 |
2 files changed, 292 insertions, 64 deletions
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 5dc1482d7..5da42cb5e 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -933,6 +933,242 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) return (0); } +typedef struct rdsk_node { + char *rn_name; + int rn_num_labels; + int rn_dfd; + libzfs_handle_t *rn_hdl; + nvlist_t *rn_config; + avl_tree_t *rn_avl; + avl_node_t rn_node; + boolean_t rn_nozpool; +} rdsk_node_t; + +static int +slice_cache_compare(const void *arg1, const void *arg2) +{ + const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; + const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; + char *nm1slice, *nm2slice; + int rv; + + /* + * partitions one and three (slices zero and two) are the most + * likely to provide results, so put those first + */ + nm1slice = strstr(nm1, "part1"); + nm2slice = strstr(nm2, "part1"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + nm1slice = strstr(nm1, "part3"); + nm2slice = strstr(nm2, "part3"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + + rv = strcmp(nm1, nm2); + if (rv == 0) + return (0); + return (rv > 0 ? 1 : -1); +} + +#ifndef __linux__ +static void +check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, + diskaddr_t size, uint_t blksz) +{ + rdsk_node_t tmpnode; + rdsk_node_t *node; + char sname[MAXNAMELEN]; + + tmpnode.rn_name = &sname[0]; + (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", + diskname, partno); + /* too small to contain a zpool? */ + if ((size < (SPA_MINDEVSIZE / blksz)) && + (node = avl_find(r, &tmpnode, NULL))) + node->rn_nozpool = B_TRUE; +} +#endif + +static void +nozpool_all_slices(avl_tree_t *r, const char *sname) +{ +#ifndef __linux__ + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if (((ptr = strrchr(diskname, 's')) == NULL) && + ((ptr = strrchr(diskname, 'p')) == NULL)) + return; + ptr[0] = 's'; + ptr[1] = '\0'; + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, 0, 1); + ptr[0] = 'p'; + for (i = 0; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); +#endif +} + +static void +check_slices(avl_tree_t *r, int fd, const char *sname) +{ +#ifndef __linux__ + struct extvtoc vtoc; + struct dk_gpt *gpt; + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) + return; + ptr[1] = '\0'; + + if (read_extvtoc(fd, &vtoc) >= 0) { + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + vtoc.v_part[i].p_size, vtoc.v_sectorsz); + } else if (efi_alloc_and_read(fd, &gpt) >= 0) { + /* + * on x86 we'll still have leftover links that point + * to slices s[9-15], so use NDKMAP instead + */ + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + gpt->efi_parts[i].p_size, gpt->efi_lbasize); + /* nodes p[1-4] are never used with EFI labels */ + ptr[0] = 'p'; + for (i = 1; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); + efi_free(gpt); + } +#endif +} + +static void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + int num_labels; + int fd; + + if (rn->rn_nozpool) + return; +#ifdef __linux__ + /* + * Skip devices with well known prefixes there can be side effects + * when opening devices which need to be avoided. + * + * core - Symlink to /proc/kcore + * fd* - Floppy interface. + * fuse - Fuse control device. + * hpet - High Precision Event Timer + * lp* - Printer interface. + * parport* - Parallel port interface. + * ppp - Generic PPP driver. + * random - Random device + * rtc - Real Time Clock + * tty* - Generic serial interface. + * urandom - Random device. + * usbmon* - USB IO monitor. + * vcs* - Virtual console memory. + * watchdog - Watchdog must be closed in a special way. + */ + if ((strncmp(rn->rn_name, "core", 4) == 0) || + (strncmp(rn->rn_name, "fd", 2) == 0) || + (strncmp(rn->rn_name, "fuse", 4) == 0) || + (strncmp(rn->rn_name, "hpet", 4) == 0) || + (strncmp(rn->rn_name, "lp", 2) == 0) || + (strncmp(rn->rn_name, "parport", 7) == 0) || + (strncmp(rn->rn_name, "ppp", 3) == 0) || + (strncmp(rn->rn_name, "random", 6) == 0) || + (strncmp(rn->rn_name, "rtc", 3) == 0) || + (strncmp(rn->rn_name, "tty", 3) == 0) || + (strncmp(rn->rn_name, "urandom", 7) == 0) || + (strncmp(rn->rn_name, "usbmon", 6) == 0) || + (strncmp(rn->rn_name, "vcs", 3) == 0) || + (strncmp(rn->rn_name, "watchdog", 8) == 0)) + return; + + /* + * Ignore failed stats. We only want regular files and block devices. + */ + if (fstatat64(rn->rn_dfd, rn->rn_name, &statbuf, 0) != 0 || + (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) + return; + + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } +#else + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } + /* + * Ignore failed stats. We only want regular + * files, character devs and block devs. + */ + if (fstat64(fd, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && + !S_ISCHR(statbuf.st_mode) && + !S_ISBLK(statbuf.st_mode))) { + (void) close(fd); + return; + } +#endif + /* this file is too small to hold a zpool */ + if (S_ISREG(statbuf.st_mode) && + statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } else if (!S_ISREG(statbuf.st_mode)) { + /* + * Try to read the disk label first so we don't have to + * open a bunch of minor nodes that can't have a zpool. + */ + check_slices(rn->rn_avl, fd, rn->rn_name); + } + + if ((zpool_read_label(fd, &config, &num_labels)) != 0) { + (void) close(fd); + (void) no_memory(rn->rn_hdl); + return; + } + + if (num_labels == 0) { + (void) close(fd); + nvlist_free(config); + return; + } + + (void) close(fd); + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + if (config != NULL) { + assert(rn->rn_nozpool == B_FALSE); + } +} + /* * Given a file descriptor, clear (zero) the label information. This function * is used in the appliance stack as part of the ZFS sysevent module and @@ -1058,20 +1294,21 @@ zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { static nvlist_t * zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) { - int i, num_labels, dirs = iarg->paths; + int i, dirs = iarg->paths; DIR *dirp = NULL; struct dirent64 *dp; char path[MAXPATHLEN]; char *end, **dir = iarg->path; size_t pathleft; - struct stat64 statbuf; - nvlist_t *ret = NULL, *config; - int fd; + nvlist_t *ret = NULL; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; + avl_tree_t slice_cache; + rdsk_node_t *slice; + void *cookie; verify(iarg->poolname == NULL || iarg->guid == 0); @@ -1096,6 +1333,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) * and toplevel GUID. */ for (i = 0; i < dirs; i++) { + taskq_t *t; char *rdsk; int dfd; @@ -1135,6 +1373,9 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) goto error; } + avl_create(&slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); + /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -1144,65 +1385,49 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; - /* - * Skip checking devices with well known prefixes: - * watchdog - A special close is required to avoid - * triggering it and resetting the system. - * fuse - Fuse control device. - * ppp - Generic PPP driver. - * tty* - Generic serial interface. - * vcs* - Virtual console memory. - * parport* - Parallel port interface. - * lp* - Printer interface. - * fd* - Floppy interface. - * hpet - High Precision Event Timer, crashes qemu - * when accessed from a virtual machine. - * core - Symlink to /proc/kcore, causes a crash - * when access from Xen dom0. - */ - if ((strncmp(name, "watchdog", 8) == 0) || - (strncmp(name, "fuse", 4) == 0) || - (strncmp(name, "ppp", 3) == 0) || - (strncmp(name, "tty", 3) == 0) || - (strncmp(name, "vcs", 3) == 0) || - (strncmp(name, "parport", 7) == 0) || - (strncmp(name, "lp", 2) == 0) || - (strncmp(name, "fd", 2) == 0) || - (strncmp(name, "hpet", 4) == 0) || - (strncmp(name, "core", 4) == 0)) - continue; - - /* - * Ignore failed stats. We only want regular - * files and block devices. - */ - if ((fstatat64(dfd, name, &statbuf, 0) != 0) || - (!S_ISREG(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) - continue; - - if ((fd = openat64(dfd, name, O_RDONLY)) < 0) - continue; - - if ((zpool_read_label(fd, &config, &num_labels))) { - (void) close(fd); - (void) no_memory(hdl); - goto error; - } - - (void) close(fd); - - if (config != NULL) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_strdup(hdl, name); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } + /* + * create a thread pool to do all of this in parallel; + * rn_nozpool is not protected, so this is racy in that + * multiple tasks could decide that the same slice can + * not hold a zpool, which is benign. Also choose + * double the number of processors; we hold a lot of + * locks in the kernel, so going beyond this doesn't + * buy us much. + */ + thread_init(); + t = taskq_create("z_import", 2 * boot_ncpus, defclsyspri, + 2 * boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); + for (slice = avl_first(&slice_cache); slice; + (slice = avl_walk(&slice_cache, slice, + AVL_AFTER))) + (void) taskq_dispatch(t, zpool_open_func, slice, + TQ_SLEEP); + taskq_wait(t); + taskq_destroy(t); + thread_fini(); + + cookie = NULL; + while ((slice = avl_destroy_nodes(&slice_cache, + &cookie)) != NULL) { + if (slice->rn_config != NULL) { + nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; - char *pname; - - if ((iarg->poolname != NULL) && - (nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &pname) == 0)) { - if (strcmp(iarg->poolname, pname)) - matched = B_FALSE; + if (iarg->poolname != NULL) { + char *pname; + matched = nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && + strcmp(iarg->poolname, pname) == 0; } else if (iarg->guid != 0) { uint64_t this_guid; @@ -1217,12 +1442,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) continue; } /* use the non-raw path for the config */ - (void) strlcpy(end, name, pathleft); + (void) strlcpy(end, slice->rn_name, pathleft); if (add_config(hdl, &pools, path, i+1, - num_labels, config)) + slice->rn_num_labels, config)) goto error; } + free(slice->rn_name); + free(slice); } + avl_destroy(&slice_cache); (void) closedir(dirp); dirp = NULL; diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 5d26f7ca8..6ed08bdb0 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -68,7 +68,7 @@ pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER; pthread_key_t kthread_key; int kthread_nr = 0; -static void +void thread_init(void) { kthread_t *kt; @@ -87,7 +87,7 @@ thread_init(void) kthread_nr = 1; } -static void +void thread_fini(void) { kthread_t *kt = curthread; |