diff options
author | Brian Behlendorf <[email protected]> | 2016-11-07 10:28:57 -0800 |
---|---|---|
committer | GitHub <[email protected]> | 2016-11-07 10:28:57 -0800 |
commit | 83bf769d500a231eac023c9f9f88719ad205694e (patch) | |
tree | e919e807bd61476a759c4d54d5d566d25dc3e9e6 /lib | |
parent | 34328f3cf8a4ca9441a1609878500ef1ff2eeb16 (diff) |
Fix 'zpool import' detection issues
This patch addresses multiple 'zpool import' block device
indentification problems which are most likely to occur on a
system configured to use blkid, by_vdev paths, multipath and
failover. The symptom most commonly observed is the import
uses different path names to import the pool than would
normally be expected.
* When using blkid to identify vdevs the listed devices may
be added to the cache in any order. In order to apply the
preferred search order heuristic a zfs_path_order() function
was added to calculate the order given full path names.
* Since it's possible to have multiple block devices with
different vdev guids which refer to the same ZPOOL_CONFIG_PATH
the slice cache must be indexed by guid and name. By avoiding
collisions the preferred ordering can be maintaining even
when multiple block devices claim the same ZPOOL_CONFIG_PATH.
The preferred sorting by partition was never benefitial for
a Linux system and was removed as part of this change.
* When adding entries to the blkid cache avl_find/avl_insert
are used instead of avl_add because collisions are possible
and must be handled gracefully.
* For pools using multipath devices there are, at a minimum,
three devices where a vdev label may be read. They are the
dm-* device and each underlying /dev/sd* device. Due to the
way the block cache is implemented each of these devices may
have a different cached copy of the vdev label. This can
result in "ghost pools" which appear to persist even after
a 'zpool labelclear' has been done to the dm-* device. In
order to prevent this the vdev label is read with O_DIRECT
in order to bypass any caching to get the on-disk version.
* When opening a block device verify that vdev guid read from
the disk matches the expected vdev guid. This allows for bad
labels to be filtered out.
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #5359
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libzfs/libzfs_import.c | 101 | ||||
-rw-r--r-- | lib/libzfs/libzfs_util.c | 39 |
2 files changed, 105 insertions, 35 deletions
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 3de7fd73d..e23232792 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -1313,6 +1313,7 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) vdev_label_t *label; nvlist_t *expected_config = NULL; uint64_t expected_guid = 0, size; + int error; *config = NULL; @@ -1320,7 +1321,8 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) return (0); size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - if ((label = malloc(sizeof (vdev_label_t))) == NULL) + error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label)); + if (error) return (-1); for (l = 0; l < VDEV_LABELS; l++) { @@ -1378,6 +1380,7 @@ typedef struct rdsk_node { char *rn_name; /* Full path to device */ int rn_order; /* Preferred order (low to high) */ int rn_num_labels; /* Number of valid labels */ + uint64_t rn_vdev_guid; /* Expected vdev guid when set */ libzfs_handle_t *rn_hdl; nvlist_t *rn_config; /* Label config */ avl_tree_t *rn_avl; @@ -1386,39 +1389,29 @@ typedef struct rdsk_node { boolean_t rn_labelpaths; } rdsk_node_t; +/* + * Sorted by vdev guid and full path to allow for multiple entries with + * the same full path name. This is required because it's possible to + * have multiple block devices with labels that refer to the same + * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both + * entries need to be added to the cache. Scenarios where this can occur + * include overwritten pool labels, devices which are visible from multiple + * hosts and multipath devices. + */ static int slice_cache_compare(const void *arg1, const void *arg2) { const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; - char *nm1slice, *nm2slice; + uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid; + uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; - /* - * partitions one and three (slices zero and two) are the most - * likely to provide results, so put those first - */ - nm1slice = strstr(nm1, "part1"); - nm2slice = strstr(nm2, "part1"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } - nm1slice = strstr(nm1, "part3"); - nm2slice = strstr(nm2, "part3"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } + rv = AVL_CMP(guid1, guid2); + if (rv) + return (rv); - rv = strcmp(nm1, nm2); - if (rv == 0) - return (0); - return (rv > 0 ? 1 : -1); + return (AVL_ISIGN(strcmp(nm1, nm2))); } static boolean_t @@ -1506,6 +1499,7 @@ zpool_open_func(void *arg) struct stat64 statbuf; nvlist_t *config; char *bname, *dupname; + uint64_t vdev_guid = 0; int error; int num_labels; int fd; @@ -1531,19 +1525,28 @@ zpool_open_func(void *arg) (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) return; - if ((fd = open(rn->rn_name, O_RDONLY)) < 0) + /* + * Preferentially open using O_DIRECT to bypass the block device + * cache which may be stale for multipath devices. An EINVAL errno + * indicates O_DIRECT is unsupported so fallback to just O_RDONLY. + */ + fd = open(rn->rn_name, O_RDONLY | O_DIRECT); + if ((fd < 0) && (errno == EINVAL)) + fd = open(rn->rn_name, O_RDONLY); + + if (fd < 0) return; /* * This file is too small to hold a zpool */ - if (S_ISREG(statbuf.st_mode) && - statbuf.st_size < SPA_MINDEVSIZE) { + if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { (void) close(fd); return; } - if ((zpool_read_label(fd, &config, &num_labels)) != 0) { + error = zpool_read_label(fd, &config, &num_labels); + if (error != 0) { (void) close(fd); return; } @@ -1554,6 +1557,18 @@ zpool_open_func(void *arg) return; } + /* + * Check that the vdev is for the expected guid. Additional entries + * are speculatively added based on the paths stored in the labels. + * Entries with valid paths but incorrect guids must be removed. + */ + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { + (void) close(fd); + nvlist_free(config); + return; + } + (void) close(fd); rn->rn_config = config; @@ -1580,9 +1595,10 @@ zpool_open_func(void *arg) if (path != NULL) { slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zfs_strdup(hdl, path); + slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; - slice->rn_order = 1; + slice->rn_order = IMPORT_ORDER_PREFERRED_1; slice->rn_labelpaths = B_FALSE; mutex_enter(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { @@ -1605,9 +1621,10 @@ zpool_open_func(void *arg) return; } + slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; - slice->rn_order = 2; + slice->rn_order = IMPORT_ORDER_PREFERRED_2; slice->rn_labelpaths = B_FALSE; mutex_enter(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { @@ -1709,10 +1726,11 @@ zpool_find_import_scan(libzfs_handle_t *hdl, kmutex_t *lock, free(slice); continue; } + slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = cache; slice->rn_hdl = hdl; - slice->rn_order = i+1; + slice->rn_order = i + IMPORT_ORDER_SCAN_OFFSET; slice->rn_labelpaths = B_FALSE; mutex_enter(lock); avl_add(cache, slice); @@ -1747,6 +1765,7 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock, blkid_cache cache; blkid_dev_iterate iter; blkid_dev dev; + avl_index_t where; int error; *slice_cache = NULL; @@ -1781,13 +1800,25 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock, while (blkid_dev_next(iter, &dev) == 0) { slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev)); + slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = *slice_cache; slice->rn_hdl = hdl; - slice->rn_order = 100; slice->rn_labelpaths = B_TRUE; + + error = zfs_path_order(slice->rn_name, &slice->rn_order); + if (error == 0) + slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; + else + slice->rn_order = IMPORT_ORDER_DEFAULT; + mutex_enter(lock); - avl_add(*slice_cache, slice); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } mutex_exit(lock); } diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 6f5dae6c3..95ec0b49d 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1137,6 +1137,45 @@ zfs_strcmp_pathname(char *name, char *cmp, int wholedisk) } /* + * Given a full path to a device determine if that device appears in the + * import search path. If it does return the first match and store the + * index in the passed 'order' variable, otherwise return an error. + */ +int +zfs_path_order(char *name, int *order) +{ + int i = 0, error = ENOENT; + char *dir, *env, *envdup; + + env = getenv("ZPOOL_IMPORT_PATH"); + if (env) { + envdup = strdup(env); + dir = strtok(envdup, ":"); + while (dir) { + if (strncmp(name, dir, strlen(dir)) == 0) { + *order = i; + error = 0; + break; + } + dir = strtok(NULL, ":"); + i++; + } + free(envdup); + } else { + for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) { + if (strncmp(name, zpool_default_import_path[i], + strlen(zpool_default_import_path[i])) == 0) { + *order = i; + error = 0; + break; + } + } + } + + return (error); +} + +/* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ |