aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2016-11-07 10:28:57 -0800
committerGitHub <[email protected]>2016-11-07 10:28:57 -0800
commit83bf769d500a231eac023c9f9f88719ad205694e (patch)
treee919e807bd61476a759c4d54d5d566d25dc3e9e6
parent34328f3cf8a4ca9441a1609878500ef1ff2eeb16 (diff)
Fix 'zpool import' detection issues
This patch addresses multiple 'zpool import' block device indentification problems which are most likely to occur on a system configured to use blkid, by_vdev paths, multipath and failover. The symptom most commonly observed is the import uses different path names to import the pool than would normally be expected. * When using blkid to identify vdevs the listed devices may be added to the cache in any order. In order to apply the preferred search order heuristic a zfs_path_order() function was added to calculate the order given full path names. * Since it's possible to have multiple block devices with different vdev guids which refer to the same ZPOOL_CONFIG_PATH the slice cache must be indexed by guid and name. By avoiding collisions the preferred ordering can be maintaining even when multiple block devices claim the same ZPOOL_CONFIG_PATH. The preferred sorting by partition was never benefitial for a Linux system and was removed as part of this change. * When adding entries to the blkid cache avl_find/avl_insert are used instead of avl_add because collisions are possible and must be handled gracefully. * For pools using multipath devices there are, at a minimum, three devices where a vdev label may be read. They are the dm-* device and each underlying /dev/sd* device. Due to the way the block cache is implemented each of these devices may have a different cached copy of the vdev label. This can result in "ghost pools" which appear to persist even after a 'zpool labelclear' has been done to the dm-* device. In order to prevent this the vdev label is read with O_DIRECT in order to bypass any caching to get the on-disk version. * When opening a block device verify that vdev guid read from the disk matches the expected vdev guid. This allows for bad labels to be filtered out. Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #5359
-rw-r--r--include/libzfs.h5
-rw-r--r--lib/libzfs/libzfs_import.c101
-rw-r--r--lib/libzfs/libzfs_util.c39
3 files changed, 110 insertions, 35 deletions
diff --git a/include/libzfs.h b/include/libzfs.h
index d4962dec6..5744ce87a 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -64,6 +64,10 @@ extern "C" {
*/
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */
+#define IMPORT_ORDER_PREFERRED_1 1
+#define IMPORT_ORDER_PREFERRED_2 2
+#define IMPORT_ORDER_SCAN_OFFSET 10
+#define IMPORT_ORDER_DEFAULT 100
#define DEFAULT_IMPORT_PATH_SIZE 9
extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE];
@@ -728,6 +732,7 @@ extern boolean_t zfs_bookmark_exists(const char *path);
extern int zfs_append_partition(char *path, size_t max_len);
extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen);
extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk);
+extern int zfs_path_order(char *path, int *order);
/*
* Mount support functions.
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 3de7fd73d..e23232792 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -1313,6 +1313,7 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
vdev_label_t *label;
nvlist_t *expected_config = NULL;
uint64_t expected_guid = 0, size;
+ int error;
*config = NULL;
@@ -1320,7 +1321,8 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
return (0);
size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
- if ((label = malloc(sizeof (vdev_label_t))) == NULL)
+ error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
+ if (error)
return (-1);
for (l = 0; l < VDEV_LABELS; l++) {
@@ -1378,6 +1380,7 @@ typedef struct rdsk_node {
char *rn_name; /* Full path to device */
int rn_order; /* Preferred order (low to high) */
int rn_num_labels; /* Number of valid labels */
+ uint64_t rn_vdev_guid; /* Expected vdev guid when set */
libzfs_handle_t *rn_hdl;
nvlist_t *rn_config; /* Label config */
avl_tree_t *rn_avl;
@@ -1386,39 +1389,29 @@ typedef struct rdsk_node {
boolean_t rn_labelpaths;
} rdsk_node_t;
+/*
+ * Sorted by vdev guid and full path to allow for multiple entries with
+ * the same full path name. This is required because it's possible to
+ * have multiple block devices with labels that refer to the same
+ * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
+ * entries need to be added to the cache. Scenarios where this can occur
+ * include overwritten pool labels, devices which are visible from multiple
+ * hosts and multipath devices.
+ */
static int
slice_cache_compare(const void *arg1, const void *arg2)
{
const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
- char *nm1slice, *nm2slice;
+ uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
+ uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
int rv;
- /*
- * partitions one and three (slices zero and two) are the most
- * likely to provide results, so put those first
- */
- nm1slice = strstr(nm1, "part1");
- nm2slice = strstr(nm2, "part1");
- if (nm1slice && !nm2slice) {
- return (-1);
- }
- if (!nm1slice && nm2slice) {
- return (1);
- }
- nm1slice = strstr(nm1, "part3");
- nm2slice = strstr(nm2, "part3");
- if (nm1slice && !nm2slice) {
- return (-1);
- }
- if (!nm1slice && nm2slice) {
- return (1);
- }
+ rv = AVL_CMP(guid1, guid2);
+ if (rv)
+ return (rv);
- rv = strcmp(nm1, nm2);
- if (rv == 0)
- return (0);
- return (rv > 0 ? 1 : -1);
+ return (AVL_ISIGN(strcmp(nm1, nm2)));
}
static boolean_t
@@ -1506,6 +1499,7 @@ zpool_open_func(void *arg)
struct stat64 statbuf;
nvlist_t *config;
char *bname, *dupname;
+ uint64_t vdev_guid = 0;
int error;
int num_labels;
int fd;
@@ -1531,19 +1525,28 @@ zpool_open_func(void *arg)
(!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
return;
- if ((fd = open(rn->rn_name, O_RDONLY)) < 0)
+ /*
+ * Preferentially open using O_DIRECT to bypass the block device
+ * cache which may be stale for multipath devices. An EINVAL errno
+ * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
+ */
+ fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
+ if ((fd < 0) && (errno == EINVAL))
+ fd = open(rn->rn_name, O_RDONLY);
+
+ if (fd < 0)
return;
/*
* This file is too small to hold a zpool
*/
- if (S_ISREG(statbuf.st_mode) &&
- statbuf.st_size < SPA_MINDEVSIZE) {
+ if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
(void) close(fd);
return;
}
- if ((zpool_read_label(fd, &config, &num_labels)) != 0) {
+ error = zpool_read_label(fd, &config, &num_labels);
+ if (error != 0) {
(void) close(fd);
return;
}
@@ -1554,6 +1557,18 @@ zpool_open_func(void *arg)
return;
}
+ /*
+ * Check that the vdev is for the expected guid. Additional entries
+ * are speculatively added based on the paths stored in the labels.
+ * Entries with valid paths but incorrect guids must be removed.
+ */
+ error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
+ (void) close(fd);
+ nvlist_free(config);
+ return;
+ }
+
(void) close(fd);
rn->rn_config = config;
@@ -1580,9 +1595,10 @@ zpool_open_func(void *arg)
if (path != NULL) {
slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
slice->rn_name = zfs_strdup(hdl, path);
+ slice->rn_vdev_guid = vdev_guid;
slice->rn_avl = rn->rn_avl;
slice->rn_hdl = hdl;
- slice->rn_order = 1;
+ slice->rn_order = IMPORT_ORDER_PREFERRED_1;
slice->rn_labelpaths = B_FALSE;
mutex_enter(rn->rn_lock);
if (avl_find(rn->rn_avl, slice, &where)) {
@@ -1605,9 +1621,10 @@ zpool_open_func(void *arg)
return;
}
+ slice->rn_vdev_guid = vdev_guid;
slice->rn_avl = rn->rn_avl;
slice->rn_hdl = hdl;
- slice->rn_order = 2;
+ slice->rn_order = IMPORT_ORDER_PREFERRED_2;
slice->rn_labelpaths = B_FALSE;
mutex_enter(rn->rn_lock);
if (avl_find(rn->rn_avl, slice, &where)) {
@@ -1709,10 +1726,11 @@ zpool_find_import_scan(libzfs_handle_t *hdl, kmutex_t *lock,
free(slice);
continue;
}
+ slice->rn_vdev_guid = 0;
slice->rn_lock = lock;
slice->rn_avl = cache;
slice->rn_hdl = hdl;
- slice->rn_order = i+1;
+ slice->rn_order = i + IMPORT_ORDER_SCAN_OFFSET;
slice->rn_labelpaths = B_FALSE;
mutex_enter(lock);
avl_add(cache, slice);
@@ -1747,6 +1765,7 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock,
blkid_cache cache;
blkid_dev_iterate iter;
blkid_dev dev;
+ avl_index_t where;
int error;
*slice_cache = NULL;
@@ -1781,13 +1800,25 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock,
while (blkid_dev_next(iter, &dev) == 0) {
slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
+ slice->rn_vdev_guid = 0;
slice->rn_lock = lock;
slice->rn_avl = *slice_cache;
slice->rn_hdl = hdl;
- slice->rn_order = 100;
slice->rn_labelpaths = B_TRUE;
+
+ error = zfs_path_order(slice->rn_name, &slice->rn_order);
+ if (error == 0)
+ slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
+ else
+ slice->rn_order = IMPORT_ORDER_DEFAULT;
+
mutex_enter(lock);
- avl_add(*slice_cache, slice);
+ if (avl_find(*slice_cache, slice, &where)) {
+ free(slice->rn_name);
+ free(slice);
+ } else {
+ avl_insert(*slice_cache, slice, where);
+ }
mutex_exit(lock);
}
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 6f5dae6c3..95ec0b49d 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -1137,6 +1137,45 @@ zfs_strcmp_pathname(char *name, char *cmp, int wholedisk)
}
/*
+ * Given a full path to a device determine if that device appears in the
+ * import search path. If it does return the first match and store the
+ * index in the passed 'order' variable, otherwise return an error.
+ */
+int
+zfs_path_order(char *name, int *order)
+{
+ int i = 0, error = ENOENT;
+ char *dir, *env, *envdup;
+
+ env = getenv("ZPOOL_IMPORT_PATH");
+ if (env) {
+ envdup = strdup(env);
+ dir = strtok(envdup, ":");
+ while (dir) {
+ if (strncmp(name, dir, strlen(dir)) == 0) {
+ *order = i;
+ error = 0;
+ break;
+ }
+ dir = strtok(NULL, ":");
+ i++;
+ }
+ free(envdup);
+ } else {
+ for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
+ if (strncmp(name, zpool_default_import_path[i],
+ strlen(zpool_default_import_path[i])) == 0) {
+ *order = i;
+ error = 0;
+ break;
+ }
+ }
+ }
+
+ return (error);
+}
+
+/*
* Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
* an ioctl().
*/