path: root/module/zfs/vdev_disk.c
Diffstat (limited to 'module/zfs/vdev_disk.c')
-rw-r--r--  module/zfs/vdev_disk.c | 284
1 file changed, 150 insertions(+), 134 deletions(-)
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 996bab43c..78741af7f 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -85,50 +85,64 @@ vdev_bdev_mode(int smode)
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
-/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
static uint64_t
-bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
+bdev_capacity(struct block_device *bdev)
{
- struct hd_struct *part = bdev->bd_part;
- uint64_t sectors = get_capacity(bdev->bd_disk);
- /* If there are no paritions, return the entire device capacity */
- if (part == NULL)
- return (sectors << SECTOR_BITS);
+ return (i_size_read(bdev->bd_inode));
+}
- /*
- * If there are partitions, decide if we are using a `wholedisk`
- * layout (composed of part1 and part9) or just a single partition.
- */
- if (wholedisk) {
- /* Verify the expected device layout */
- ASSERT3P(bdev, !=, bdev->bd_contains);
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity. Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate. Once the partition
+ * size has been increased the additional capacity will be visible using
+ * bdev_capacity().
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+ uint64_t psize;
+ int64_t available;
+
+ if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
/*
- * Sectors used by the EFI partition (part9) as well as
- * partion alignment.
+ * When reporting the maximum expansion capacity for a wholedisk,
+ * deduct any capacity expected to be lost due to alignment
+ * restrictions. Over-reporting this value isn't harmful; it
+ * would only result in slightly less capacity than expected
+ * post expansion.
*/
- uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
- PARTITION_END_ALIGNMENT;
-
- /* Space available to the vdev, i.e. the size of part1 */
- if (sectors <= used)
- return (0);
- uint64_t available = sectors - used;
- return (available << SECTOR_BITS);
+ available = i_size_read(bdev->bd_contains->bd_inode) -
+ ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+ PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+ if (available > 0)
+ psize = available;
+ else
+ psize = bdev_capacity(bdev);
} else {
- /* The partition capacity referenced by the block device */
- return (part->nr_sects << SECTOR_BITS);
+ psize = bdev_capacity(bdev);
}
+
+ return (psize);
}
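
The arithmetic in the wholedisk branch above is small enough to illustrate with a standalone sketch. The constant values below are placeholders assumed for illustration only (the real EFI_MIN_RESV_SIZE, NEW_START_BLOCK and PARTITION_END_ALIGNMENT come from the ZFS EFI label headers); the shape of the calculation is the point:

#include <stdint.h>
#include <stdio.h>

#define	SECTOR_BITS		9	/* 512-byte sectors */
#define	EFI_MIN_RESV_SIZE	16384	/* assumed value, in sectors */
#define	NEW_START_BLOCK		2048	/* assumed value, in sectors */
#define	PARTITION_END_ALIGNMENT	2048	/* assumed value, in sectors */

/* Capacity (bytes) usable by the data partition after the EFI reservation. */
static uint64_t
max_expand_bytes(uint64_t whole_disk_bytes, uint64_t current_part_bytes)
{
	int64_t available = (int64_t)whole_disk_bytes -
	    ((int64_t)(EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
	    PARTITION_END_ALIGNMENT) << SECTOR_BITS);

	/* Fall back to the current partition size for very small devices. */
	return (available > 0 ? (uint64_t)available : current_part_bytes);
}

int
main(void)
{
	/* e.g. a 2 TiB whole disk currently carrying a 1 TiB data partition */
	printf("%llu\n", (unsigned long long)
	    max_expand_bytes(2ULL << 40, 1ULL << 40));
	return (0);
}
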
static void
vdev_disk_error(zio_t *zio)
{
-#ifdef ZFS_DEBUG
- printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu "
+ zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu "
"flags=%x\n", zio->io_error, zio->io_type,
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
zio->io_flags);
-#endif
}
/*
@@ -200,109 +214,73 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
}
}
-/*
- * Expanding a whole disk vdev involves invoking BLKRRPART on the
- * whole disk device. This poses a problem, because BLKRRPART will
- * return EBUSY if one of the disk's partitions is open. That's why
- * we have to do it here, just before opening the data partition.
- * Unfortunately, BLKRRPART works by dropping all partitions and
- * recreating them, which means that for a short time window, all
- * /dev/sdxN device files disappear (until udev recreates them).
- * This means two things:
- * - When we open the data partition just after a BLKRRPART, we
- * can't do it using the normal device file path because of the
- * obvious race condition with udev. Instead, we use reliable
- * kernel APIs to get a handle to the new partition device from
- * the whole disk device.
- * - Because vdev_disk_open() initially needs to find the device
- * using its path, multiple vdev_disk_open() invocations in
- * short succession on the same disk with BLKRRPARTs in the
- * middle have a high probability of failure (because of the
- * race condition with udev). A typical situation where this
- * might happen is when the zpool userspace tool does a
- * TRYIMPORT immediately followed by an IMPORT. For this
- * reason, we only invoke BLKRRPART in the module when strictly
- * necessary (zpool online -e case), and rely on userspace to
- * do it when possible.
- */
-static struct block_device *
-vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
-{
-#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
- struct block_device *bdev, *result = ERR_PTR(-ENXIO);
- struct gendisk *disk;
- int error, partno;
-
- bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
- if (IS_ERR(bdev))
- return (bdev);
-
- disk = get_gendisk(bdev->bd_dev, &partno);
- vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-
- if (disk) {
- bdev = bdget(disk_devt(disk));
- if (bdev) {
- error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
- if (error == 0)
- error = ioctl_by_bdev(bdev, BLKRRPART, 0);
- vdev_bdev_close(bdev, vdev_bdev_mode(mode));
- }
-
- bdev = bdget_disk(disk, partno);
- if (bdev) {
- error = blkdev_get(bdev,
- vdev_bdev_mode(mode) | FMODE_EXCL, vd);
- if (error == 0)
- result = bdev;
- }
- put_disk(disk);
- }
-
- return (result);
-#else
- return (ERR_PTR(-EOPNOTSUPP));
-#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
-}
-
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
{
- struct block_device *bdev = ERR_PTR(-ENXIO);
+ struct block_device *bdev;
+ fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+ int count = 0, block_size;
+ int bdev_retry_count = 50;
vdev_disk_t *vd;
- int count = 0, mode, block_size;
/* Must have a pathname and it must be absolute. */
if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- vdev_dbgmsg(v, "vdev_disk_open: invalid "
- "vdev_path '%s'", v->vdev_path);
+ vdev_dbgmsg(v, "invalid vdev_path");
return (SET_ERROR(EINVAL));
}
/*
- * Reopen the device if it's not currently open. Otherwise,
- * just update the physical size of the device.
+ * Reopen the device if it is currently open. When expanding a
+ * partition, force a re-scan of the partition table while the
+ * device is closed in order to get an accurate updated block
+ * device size. Then, since udev may need to recreate the device
+ * links, increase the open retry count before reporting the
+ * device as unavailable.
*/
- if (v->vdev_tsd != NULL) {
- ASSERT(v->vdev_reopening);
- vd = v->vdev_tsd;
- goto skip_open;
- }
+ vd = v->vdev_tsd;
+ if (vd) {
+ char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+ boolean_t reread_part = B_FALSE;
- vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
- if (vd == NULL)
- return (SET_ERROR(ENOMEM));
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ bdev = vd->vd_bdev;
+ vd->vd_bdev = NULL;
+
+ if (bdev) {
+ if (v->vdev_expanding && bdev != bdev->bd_contains) {
+ bdevname(bdev->bd_contains, disk_name + 5);
+ reread_part = B_TRUE;
+ }
+
+ vdev_bdev_close(bdev, mode);
+ }
+
+ if (reread_part) {
+ bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
+ if (!IS_ERR(bdev)) {
+ int error = vdev_bdev_reread_part(bdev);
+ vdev_bdev_close(bdev, mode);
+ if (error == 0)
+ bdev_retry_count = 100;
+ }
+ }
+ } else {
+ vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+ rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ }
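
The vdev_bdev_reread_part() call in the expansion path above is a compatibility wrapper defined outside this file. Judging from the ioctl_by_bdev(bdev, BLKRRPART, 0) call being removed from vdev_disk_rrpart(), a plausible fallback definition, shown here purely as an illustration rather than the wrapper's actual source, would be:

/* Illustration only: ask the kernel to re-read the partition table. */
#define	vdev_bdev_reread_part(bdev)	ioctl_by_bdev(bdev, BLKRRPART, 0)

Newer kernels expose blkdev_reread_part() for the same purpose, so the wrapper presumably prefers that interface when available.
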
/*
* Devices are always opened by the path provided at configuration
* time. This means that if the provided path is a udev by-id path
- * then drives may be recabled without an issue. If the provided
+ * then drives may be re-cabled without an issue. If the provided
* path is a udev by-path path, then the physical location information
* will be preserved. This can be critical for more complicated
* configurations where drives are located in specific physical
- * locations to maximize the systems tolerence to component failure.
+ * locations to maximize the systems tolerance to component failure.
+ *
* Alternatively, you can provide your own udev rule to flexibly map
* the drives as you see fit. It is not advised that you use the
* /dev/[hd]d devices which may be reordered due to probing order.
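
For illustration only (the device names here are hypothetical): a by-id path such as /dev/disk/by-id/ata-EXAMPLEDISK_SERIAL123-part1 keeps following the same drive if it is re-cabled, a by-path path such as /dev/disk/by-path/pci-0000:00:1f.2-ata-1-part1 pins the vdev to a physical slot, while a bare /dev/sdb1 may point at a different drive after the next change in probing order.
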
@@ -317,15 +295,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
* and it is reasonable to sleep and retry before giving up. In
* practice delays have been observed to be on the order of 100ms.
*/
- mode = spa_mode(v->vdev_spa);
- if (v->vdev_wholedisk && v->vdev_expanding)
- bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
-
- while (IS_ERR(bdev) && count < 50) {
- bdev = vdev_bdev_open(v->vdev_path,
- vdev_bdev_mode(mode), zfs_vdev_holder);
+ bdev = ERR_PTR(-ENXIO);
+ while (IS_ERR(bdev) && count < bdev_retry_count) {
+ bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
- msleep(10);
+ schedule_timeout(MSEC_TO_TICK(10));
count++;
} else if (IS_ERR(bdev)) {
break;
@@ -333,16 +307,18 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
}
if (IS_ERR(bdev)) {
- dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
- v->vdev_path, -PTR_ERR(bdev), count);
- kmem_free(vd, sizeof (vdev_disk_t));
- return (SET_ERROR(-PTR_ERR(bdev)));
+ int error = -PTR_ERR(bdev);
+ vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
+ vd->vd_bdev = NULL;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ return (SET_ERROR(error));
+ } else {
+ vd->vd_bdev = bdev;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
}
- v->vdev_tsd = vd;
- vd->vd_bdev = bdev;
-
-skip_open:
/* Determine the physical block size */
block_size = vdev_bdev_block_size(vd->vd_bdev);
@@ -352,9 +328,11 @@ skip_open:
/* Inform the ZIO pipeline that we are non-rotational */
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
- /* Physical volume size in bytes */
- *psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
- *max_psize = *psize;
+ /* Physical volume size in bytes for the partition */
+ *psize = bdev_capacity(vd->vd_bdev);
+
+ /* Physical volume size in bytes including possible expansion space */
+ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
/* Based on the minimum sector size set the block size */
*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
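
As a concrete check of the ashift line above, here is a standalone sketch (not this module's code) using a simple re-implementation of highbit64() and assuming SPA_MINBLOCKSIZE is 512 bytes: a 512-byte block yields ashift 9 and a 4 KiB block yields ashift 12.

#include <stdint.h>
#include <stdio.h>

/* 1-based index of the highest set bit, 0 for 0 (mirrors ZFS highbit64()). */
static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t sizes[] = { 512, 4096 };

	for (int i = 0; i < 2; i++) {
		/* 512 -> ashift 9, 4096 -> ashift 12 */
		printf("block_size=%llu ashift=%d\n",
		    (unsigned long long)sizes[i],
		    highbit64(sizes[i] > 512 ? sizes[i] : 512) - 1);
	}
	return (0);
}
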
@@ -373,10 +351,12 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
- if (vd->vd_bdev != NULL)
+ if (vd->vd_bdev != NULL) {
vdev_bdev_close(vd->vd_bdev,
vdev_bdev_mode(spa_mode(v->vdev_spa)));
+ }
+ rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
v->vdev_tsd = NULL;
}
@@ -562,9 +542,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
struct blk_plug plug;
#endif
-
- ASSERT(zio != NULL);
- ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (io_offset + io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ io_offset, io_size, i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
retry:
dr = vdev_disk_dio_alloc(bio_count);
@@ -705,10 +691,34 @@ vdev_disk_io_start(zio_t *zio)
vdev_disk_t *vd = v->vdev_tsd;
int rw, flags, error;
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vd == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ rw_enter(&vd->vd_lock, RW_READER);
+
+ /*
+ * If the vdev is closed, it's likely due to a failed reopen and is
+ * in the UNAVAIL state. Nothing to be done here but return failure.
+ */
+ if (vd->vd_bdev == NULL) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
switch (zio->io_type) {
case ZIO_TYPE_IOCTL:
if (!vdev_readable(v)) {
+ rw_exit(&vd->vd_lock);
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
@@ -726,8 +736,10 @@ vdev_disk_io_start(zio_t *zio)
}
error = vdev_disk_io_flush(vd->vd_bdev, zio);
- if (error == 0)
+ if (error == 0) {
+ rw_exit(&vd->vd_lock);
return;
+ }
zio->io_error = error;
@@ -737,6 +749,7 @@ vdev_disk_io_start(zio_t *zio)
zio->io_error = SET_ERROR(ENOTSUP);
}
+ rw_exit(&vd->vd_lock);
zio_execute(zio);
return;
case ZIO_TYPE_WRITE:
@@ -762,6 +775,7 @@ vdev_disk_io_start(zio_t *zio)
break;
default:
+ rw_exit(&vd->vd_lock);
zio->io_error = SET_ERROR(ENOTSUP);
zio_interrupt(zio);
return;
@@ -770,6 +784,8 @@ vdev_disk_io_start(zio_t *zio)
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = __vdev_disk_physio(vd->vd_bdev, zio,
zio->io_size, zio->io_offset, rw, flags);
+ rw_exit(&vd->vd_lock);
+
if (error) {
zio->io_error = error;
zio_interrupt(zio);
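
The new vd_lock follows a simple reader/writer protocol: the open and reopen paths hold it as a writer while vd_bdev is torn down or replaced, and vdev_disk_io_start() holds it as a reader and re-checks vd_bdev before issuing I/O. Below is a minimal userspace sketch of that pattern, using POSIX rwlocks in place of the SPL rw_enter()/rw_exit() primitives and hypothetical disk_reopen()/disk_io() helpers:

#include <errno.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for vdev_disk_t: the lock protects the handle pointer. */
struct disk_state {
	pthread_rwlock_t lock;
	void *handle;			/* plays the role of vd_bdev */
};

/* Open/reopen path: exclusive access while the handle is replaced. */
void
disk_reopen(struct disk_state *ds, void *new_handle)
{
	pthread_rwlock_wrlock(&ds->lock);
	/* In the real code the old bdev is closed and reopened here. */
	ds->handle = new_handle;
	pthread_rwlock_unlock(&ds->lock);
}

/* I/O path: shared access, re-checking the handle under the lock. */
int
disk_io(struct disk_state *ds)
{
	int error = 0;

	pthread_rwlock_rdlock(&ds->lock);
	if (ds->handle == NULL) {
		/* Closed vdev: REMOVED, FAULTED, or a failed reopen. */
		error = ENXIO;
	} else {
		/* Submit the I/O against ds->handle here. */
	}
	pthread_rwlock_unlock(&ds->lock);
	return (error);
}

int
main(void)
{
	struct disk_state ds = { .handle = NULL };
	int fake_bdev;

	pthread_rwlock_init(&ds.lock, NULL);
	printf("closed: %d\n", disk_io(&ds));	/* ENXIO */
	disk_reopen(&ds, &fake_bdev);
	printf("open: %d\n", disk_io(&ds));	/* 0 */
	pthread_rwlock_destroy(&ds.lock);
	return (0);
}
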