diff options
-rw-r--r-- | cmd/zpool/zpool_vdev.c | 38 | ||||
-rw-r--r-- | lib/libzfs/libzfs_import.c | 115 | ||||
-rw-r--r-- | lib/libzfs/libzfs_pool.c | 30 | ||||
-rw-r--r-- | module/zfs/vdev_disk.c | 29 |
4 files changed, 166 insertions, 46 deletions
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 8bbbf6615..cf87554d5 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -1198,12 +1198,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * Remove any previously existing symlink from a udev path to - * the device before labeling the disk. This makes - * zpool_label_disk_wait() truly wait for the new link to show - * up instead of returning if it finds an old link still in - * place. Otherwise there is a window between when udev - * deletes and recreates the link during which access attempts - * will fail with ENOENT. + * the device before labeling the disk. This ensures that + * only newly created links are used. Otherwise there is a + * window between when udev deletes and recreates the link + * during which access attempts will fail with ENOENT. */ strncpy(udevpath, path, MAXPATHLEN); (void) zfs_append_partition(udevpath, MAXPATHLEN); @@ -1227,6 +1225,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) * and then block until udev creates the new link. */ if (!is_exclusive || !is_spare(NULL, udevpath)) { + char *devnode = strrchr(devpath, '/') + 1; + ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { ret = lstat64(udevpath, &statbuf); @@ -1234,18 +1234,29 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) (void) unlink(udevpath); } - if (zpool_label_disk(g_zfs, zhp, - strrchr(devpath, '/') + 1) == -1) + /* + * When labeling a pool the raw device node name + * is provided as it appears under /dev/. + */ + if (zpool_label_disk(g_zfs, zhp, devnode) == -1) return (-1); + /* + * Wait for udev to signal the device is available + * by the provided path. + */ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); if (ret) { - (void) fprintf(stderr, gettext("cannot " - "resolve path '%s': %d\n"), udevpath, ret); - return (-1); + (void) fprintf(stderr, + gettext("missing link: %s was " + "partitioned but %s is missing\n"), + devnode, udevpath); + return (ret); } - (void) zero_label(udevpath); + ret = zero_label(udevpath); + if (ret) + return (ret); } /* @@ -1259,8 +1270,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * Update device id strings for whole disks (Linux only) */ - if (wholedisk) - update_vdev_config_dev_strs(nv); + update_vdev_config_dev_strs(nv); return (0); } diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 8f27ed58c..2776ed29c 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -260,6 +260,86 @@ udev_device_is_ready(struct udev_device *dev) } /* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when libudev determines it has been initialized, all of + * the device links have been verified to exist, and it has been allowed to + * settle. At this point the device the device can be accessed reliably. + * Depending on the complexity of the udev rules this process could take + * several seconds. + */ +int +zpool_label_disk_wait(char *path, int timeout_ms) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname = NULL; + int ret = ENODEV; + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + start = gethrtime(); + settle = 0; + + do { + if (sysname == NULL) { + if (realpath(path, nodepath) != NULL) { + sysname = strrchr(nodepath, '/') + 1; + } else { + (void) usleep(sleep_ms * MILLISEC); + continue; + } + } + + dev = udev_device_new_from_subsystem_sysname(udev, + "block", sysname); + if ((dev != NULL) && udev_device_is_ready(dev)) { + struct udev_list_entry *links, *link; + + ret = 0; + links = udev_device_get_devlinks_list_entry(dev); + + udev_list_entry_foreach(link, links) { + struct stat64 statbuf; + const char *name; + + name = udev_list_entry_get_name(link); + errno = 0; + if (stat64(name, &statbuf) == 0 && errno == 0) + continue; + + settle = 0; + ret = ENODEV; + break; + } + + if (ret == 0) { + if (settle == 0) { + settle = gethrtime(); + } else if (NSEC2MSEC(gethrtime() - settle) >= + settle_ms) { + udev_device_unref(dev); + break; + } + } + } + + udev_device_unref(dev); + (void) usleep(sleep_ms * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + udev_unref(udev); + + return (ret); +} + + +/* * Encode the persistent devices strings * used for the vdev disk label */ @@ -414,6 +494,41 @@ is_mpath_whole_disk(const char *path) return (B_FALSE); } +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when the provided path have been verified to exist and + * it has been allowed to settle. At this point the device the device can + * be accessed reliably. Depending on the complexity of the udev rules thisi + * process could take several seconds. + */ +int +zpool_label_disk_wait(char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} + void update_vdev_config_dev_strs(nvlist_t *nv) { diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 214c57ab4..c405abe3e 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -4122,30 +4122,7 @@ find_start_block(nvlist_t *config) return (MAXOFFSET_T); } -int -zpool_label_disk_wait(char *path, int timeout) -{ - struct stat64 statbuf; - int i; - - /* - * Wait timeout miliseconds for a newly created device to be available - * from the given path. There is a small window when a /dev/ device - * will exist and the udev link will not, so we must wait for the - * symlink. Depending on the udev rules this may take a few seconds. - */ - for (i = 0; i < timeout; i++) { - usleep(1000); - - errno = 0; - if ((stat64(path, &statbuf) == 0) && (errno == 0)) - return (0); - } - - return (ENOENT); -} - -int +static int zpool_label_disk_check(char *path) { struct dk_gpt *vtoc; @@ -4310,12 +4287,11 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) (void) close(fd); efi_free(vtoc); - /* Wait for the first expected partition to appear. */ - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); (void) zfs_append_partition(path, MAXPATHLEN); - rval = zpool_label_disk_wait(path, 3000); + /* Wait to udev to signal use the device has settled. */ + rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); if (rval) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " "detect device partitions on '%s': %d"), path, rval); diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index cdb8f78e2..9b51ecc1d 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -244,12 +244,12 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, { struct block_device *bdev = ERR_PTR(-ENXIO); vdev_disk_t *vd; - int mode, block_size; + int count = 0, mode, block_size; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -264,7 +264,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); if (vd == NULL) - return (ENOMEM); + return (SET_ERROR(ENOMEM)); /* * Devices are always opened by the path provided at configuration @@ -279,16 +279,35 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. + * + * The specified paths may be briefly removed and recreated in + * response to udev events. This should be exceptionally unlikely + * because the zpool command makes every effort to verify these paths + * have already settled prior to reaching this point. Therefore, + * a ENOENT failure at this point is highly likely to be transient + * and it is reasonable to sleep and retry before giving up. In + * practice delays have been observed to be on the order of 100ms. */ mode = spa_mode(v->vdev_spa); if (v->vdev_wholedisk && v->vdev_expanding) bdev = vdev_disk_rrpart(v->vdev_path, mode, vd); - if (IS_ERR(bdev)) + + while (IS_ERR(bdev) && count < 50) { bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), zfs_vdev_holder); + if (unlikely(PTR_ERR(bdev) == -ENOENT)) { + msleep(10); + count++; + } else if (IS_ERR(bdev)) { + break; + } + } + if (IS_ERR(bdev)) { + dprintf("failed open v->vdev_path=%s, error=%d count=%d\n", + v->vdev_path, -PTR_ERR(bdev), count); kmem_free(vd, sizeof (vdev_disk_t)); - return (-PTR_ERR(bdev)); + return (SET_ERROR(-PTR_ERR(bdev))); } v->vdev_tsd = vd; |