Diffstat (limited to 'module/zfs/vdev_disk.c')
-rw-r--r-- | module/zfs/vdev_disk.c | 284 |
1 file changed, 150 insertions, 134 deletions
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 996bab43c..78741af7f 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -85,50 +85,64 @@ vdev_bdev_mode(int smode)
 }
 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
 
-/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
 static uint64_t
-bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
+bdev_capacity(struct block_device *bdev)
 {
-	struct hd_struct *part = bdev->bd_part;
-	uint64_t sectors = get_capacity(bdev->bd_disk);
-	/* If there are no paritions, return the entire device capacity */
-	if (part == NULL)
-		return (sectors << SECTOR_BITS);
+	return (i_size_read(bdev->bd_inode));
+}
 
-	/*
-	 * If there are partitions, decide if we are using a `wholedisk`
-	 * layout (composed of part1 and part9) or just a single partition.
-	 */
-	if (wholedisk) {
-		/* Verify the expected device layout */
-		ASSERT3P(bdev, !=, bdev->bd_contains);
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity.  Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate.  Once the partition
+ * size has been increased the additional capacity will be visible using
+ * bdev_capacity().
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+	uint64_t psize;
+	int64_t available;
+
+	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
 		/*
-		 * Sectors used by the EFI partition (part9) as well as
-		 * partion alignment.
+		 * When reporting maximum expansion capacity for a wholedisk
+		 * deduct any capacity which is expected to be lost due to
+		 * alignment restrictions.  Over reporting this value isn't
+		 * harmful and would only result in slightly less capacity
+		 * than expected post expansion.
 		 */
-		uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
-		    PARTITION_END_ALIGNMENT;
-
-		/* Space available to the vdev, i.e. the size of part1 */
-		if (sectors <= used)
-			return (0);
-		uint64_t available = sectors - used;
-		return (available << SECTOR_BITS);
+		available = i_size_read(bdev->bd_contains->bd_inode) -
+		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+		if (available > 0)
+			psize = available;
+		else
+			psize = bdev_capacity(bdev);
 	} else {
-		/* The partition capacity referenced by the block device */
-		return (part->nr_sects << SECTOR_BITS);
+		psize = bdev_capacity(bdev);
 	}
+
+	return (psize);
 }
 
 static void
 vdev_disk_error(zio_t *zio)
 {
-#ifdef ZFS_DEBUG
-	printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu "
+	zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu "
 	    "flags=%x\n", zio->io_error, zio->io_type,
 	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
 	    zio->io_flags);
-#endif
 }
 
 /*
@@ -200,109 +214,73 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
 	}
 }
 
-/*
- * Expanding a whole disk vdev involves invoking BLKRRPART on the
- * whole disk device.  This poses a problem, because BLKRRPART will
- * return EBUSY if one of the disk's partitions is open.  That's why
- * we have to do it here, just before opening the data partition.
- * Unfortunately, BLKRRPART works by dropping all partitions and
- * recreating them, which means that for a short time window, all
- * /dev/sdxN device files disappear (until udev recreates them).
- * This means two things:
- *  - When we open the data partition just after a BLKRRPART, we
- *    can't do it using the normal device file path because of the
- *    obvious race condition with udev.  Instead, we use reliable
- *    kernel APIs to get a handle to the new partition device from
- *    the whole disk device.
- *  - Because vdev_disk_open() initially needs to find the device
- *    using its path, multiple vdev_disk_open() invocations in
- *    short succession on the same disk with BLKRRPARTs in the
- *    middle have a high probability of failure (because of the
- *    race condition with udev).  A typical situation where this
- *    might happen is when the zpool userspace tool does a
- *    TRYIMPORT immediately followed by an IMPORT.  For this
- *    reason, we only invoke BLKRRPART in the module when strictly
- *    necessary (zpool online -e case), and rely on userspace to
- *    do it when possible.
- */
-static struct block_device *
-vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
-{
-#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
-	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
-	struct gendisk *disk;
-	int error, partno;
-
-	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
-	if (IS_ERR(bdev))
-		return (bdev);
-
-	disk = get_gendisk(bdev->bd_dev, &partno);
-	vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-
-	if (disk) {
-		bdev = bdget(disk_devt(disk));
-		if (bdev) {
-			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
-			if (error == 0)
-				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
-			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-		}
-
-		bdev = bdget_disk(disk, partno);
-		if (bdev) {
-			error = blkdev_get(bdev,
-			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
-			if (error == 0)
-				result = bdev;
-		}
-		put_disk(disk);
-	}
-
-	return (result);
-#else
-	return (ERR_PTR(-EOPNOTSUPP));
-#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
-}
-
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
 {
-	struct block_device *bdev = ERR_PTR(-ENXIO);
+	struct block_device *bdev;
+	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+	int count = 0, block_size;
+	int bdev_retry_count = 50;
 	vdev_disk_t *vd;
-	int count = 0, mode, block_size;
 
 	/* Must have a pathname and it must be absolute. */
 	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
 		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		vdev_dbgmsg(v, "vdev_disk_open: invalid "
-		    "vdev_path '%s'", v->vdev_path);
+		vdev_dbgmsg(v, "invalid vdev_path");
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
-	 * Reopen the device if it's not currently open.  Otherwise,
-	 * just update the physical size of the device.
+	 * Reopen the device if it is currently open.  When expanding a
+	 * partition force re-scanning the partition table while closed
+	 * in order to get an accurate updated block device size.  Then
+	 * since udev may need to recreate the device links increase the
+	 * open retry count before reporting the device as unavailable.
 	 */
-	if (v->vdev_tsd != NULL) {
-		ASSERT(v->vdev_reopening);
-		vd = v->vdev_tsd;
-		goto skip_open;
-	}
+	vd = v->vdev_tsd;
+	if (vd) {
+		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+		boolean_t reread_part = B_FALSE;
 
-	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-	if (vd == NULL)
-		return (SET_ERROR(ENOMEM));
+		rw_enter(&vd->vd_lock, RW_WRITER);
+		bdev = vd->vd_bdev;
+		vd->vd_bdev = NULL;
+
+		if (bdev) {
+			if (v->vdev_expanding && bdev != bdev->bd_contains) {
+				bdevname(bdev->bd_contains, disk_name + 5);
+				reread_part = B_TRUE;
+			}
+
+			vdev_bdev_close(bdev, mode);
+		}
+
+		if (reread_part) {
+			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
+			if (!IS_ERR(bdev)) {
+				int error = vdev_bdev_reread_part(bdev);
+				vdev_bdev_close(bdev, mode);
+				if (error == 0)
+					bdev_retry_count = 100;
+			}
+		}
+	} else {
+		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+		rw_enter(&vd->vd_lock, RW_WRITER);
+	}
 
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
-	 * then drives may be recabled without an issue.  If the provided
+	 * then drives may be re-cabled without an issue.  If the provided
 	 * path is a udev by-path path, then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
-	 * locations to maximize the systems tolerence to component failure.
+	 * locations to maximize the systems tolerance to component failure.
+	 *
 	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
 	 * /dev/[hd]d devices which may be reordered due to probing order.
@@ -317,15 +295,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	 * and it is reasonable to sleep and retry before giving up.  In
 	 * practice delays have been observed to be on the order of 100ms.
 	 */
-	mode = spa_mode(v->vdev_spa);
-	if (v->vdev_wholedisk && v->vdev_expanding)
-		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
-
-	while (IS_ERR(bdev) && count < 50) {
-		bdev = vdev_bdev_open(v->vdev_path,
-		    vdev_bdev_mode(mode), zfs_vdev_holder);
+	bdev = ERR_PTR(-ENXIO);
+	while (IS_ERR(bdev) && count < bdev_retry_count) {
+		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
 		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
-			msleep(10);
+			schedule_timeout(MSEC_TO_TICK(10));
 			count++;
 		} else if (IS_ERR(bdev)) {
 			break;
@@ -333,16 +307,18 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	}
 
 	if (IS_ERR(bdev)) {
-		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
-		    v->vdev_path, -PTR_ERR(bdev), count);
-		kmem_free(vd, sizeof (vdev_disk_t));
-		return (SET_ERROR(-PTR_ERR(bdev)));
+		int error = -PTR_ERR(bdev);
+		vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
+		vd->vd_bdev = NULL;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+		return (SET_ERROR(error));
+	} else {
+		vd->vd_bdev = bdev;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
 	}
 
-	v->vdev_tsd = vd;
-	vd->vd_bdev = bdev;
-
-skip_open:
 	/* Determine the physical block size */
 	block_size = vdev_bdev_block_size(vd->vd_bdev);
 
@@ -352,9 +328,11 @@ skip_open:
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
 
-	/* Physical volume size in bytes */
-	*psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
-	*max_psize = *psize;
+	/* Physical volume size in bytes for the partition */
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* Physical volume size in bytes including possible expansion space */
+	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
 
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
@@ -373,10 +351,12 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 
-	if (vd->vd_bdev != NULL)
+	if (vd->vd_bdev != NULL) {
 		vdev_bdev_close(vd->vd_bdev,
 		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
+	}
 
+	rw_destroy(&vd->vd_lock);
 	kmem_free(vd, sizeof (vdev_disk_t));
 	v->vdev_tsd = NULL;
 }
@@ -562,9 +542,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
 	struct blk_plug plug;
 #endif
-
-	ASSERT(zio != NULL);
-	ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
+	/*
+	 * Accessing outside the block device is never allowed.
+	 */
+	if (io_offset + io_size > bdev->bd_inode->i_size) {
+		vdev_dbgmsg(zio->io_vd,
+		    "Illegal access %llu size %llu, device size %llu",
+		    io_offset, io_size, i_size_read(bdev->bd_inode));
+		return (SET_ERROR(EIO));
+	}
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
@@ -705,10 +691,34 @@ vdev_disk_io_start(zio_t *zio)
 	vdev_disk_t *vd = v->vdev_tsd;
 	int rw, flags, error;
 
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (vd == NULL) {
+		zio->io_error = ENXIO;
+		zio_interrupt(zio);
+		return;
+	}
+
+	rw_enter(&vd->vd_lock, RW_READER);
+
+	/*
+	 * If the vdev is closed, it's likely due to a failed reopen and is
+	 * in the UNAVAIL state.  Nothing to be done here but return failure.
+	 */
+	if (vd->vd_bdev == NULL) {
+		rw_exit(&vd->vd_lock);
+		zio->io_error = ENXIO;
+		zio_interrupt(zio);
+		return;
+	}
+
 	switch (zio->io_type) {
 	case ZIO_TYPE_IOCTL:
 
 		if (!vdev_readable(v)) {
+			rw_exit(&vd->vd_lock);
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
@@ -726,8 +736,10 @@ vdev_disk_io_start(zio_t *zio)
 			}
 
 			error = vdev_disk_io_flush(vd->vd_bdev, zio);
-			if (error == 0)
+			if (error == 0) {
+				rw_exit(&vd->vd_lock);
 				return;
+			}
 
 			zio->io_error = error;
 
@@ -737,6 +749,7 @@ vdev_disk_io_start(zio_t *zio)
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
+		rw_exit(&vd->vd_lock);
 		zio_execute(zio);
 		return;
 	case ZIO_TYPE_WRITE:
@@ -762,6 +775,7 @@ vdev_disk_io_start(zio_t *zio)
 		break;
 
 	default:
+		rw_exit(&vd->vd_lock);
 		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_interrupt(zio);
 		return;
@@ -770,6 +784,8 @@ vdev_disk_io_start(zio_t *zio)
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size,
 	    zio->io_offset, rw, flags);
+	rw_exit(&vd->vd_lock);
+
 	if (error) {
 		zio->io_error = error;
 		zio_interrupt(zio);
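The new bdev_max_capacity() only estimates how much the vdev could grow; efi_use_whole_disk() later verifies and rewrites the partition table, after which bdev_capacity() reports the larger size. Below is a minimal userspace sketch of that estimate. The constants and function names are illustrative placeholders, not the kernel code; in the real source EFI_MIN_RESV_SIZE, NEW_START_BLOCK, and PARTITION_END_ALIGNMENT are sector counts supplied by the EFI label support.

#include <stdint.h>
#include <stdio.h>

#define SECTOR_BITS		9	/* 512-byte sectors */
#define EFI_RESERVED_SECTORS	16384	/* placeholder for EFI_MIN_RESV_SIZE */
#define START_SECTORS		2048	/* placeholder for NEW_START_BLOCK */
#define ALIGN_SECTORS		2048	/* placeholder for PARTITION_END_ALIGNMENT */

/*
 * partition_size: current size of the data partition (bytes)
 * whole_disk_size: size of the containing block device (bytes)
 */
static uint64_t
max_capacity(uint64_t partition_size, uint64_t whole_disk_size, int wholedisk)
{
	int64_t available;

	if (!wholedisk)
		return (partition_size);

	/* Deduct space expected to be lost to the EFI label and alignment. */
	available = whole_disk_size -
	    ((uint64_t)(EFI_RESERVED_SECTORS + START_SECTORS + ALIGN_SECTORS)
	    << SECTOR_BITS);

	return (available > 0 ? (uint64_t)available : partition_size);
}

int
main(void)
{
	/* A 100 GiB disk whose data partition is currently 50 GiB. */
	uint64_t part = 50ULL << 30, disk = 100ULL << 30;

	printf("expandable to %llu bytes\n",
	    (unsigned long long)max_capacity(part, disk, 1));
	return (0);
}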
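The rw_enter()/rw_exit() calls added throughout vdev_disk_open() and vdev_disk_io_start() follow a simple discipline: the open/reopen path holds vd_lock as a writer while vd_bdev is torn down and republished, and every I/O holds it as a reader for the duration of the request, failing with ENXIO when vd_bdev is NULL. A rough userspace sketch of that pattern follows, using pthread rwlocks as a stand-in for the kernel rw_enter()/rw_exit() primitives; the struct and function names are made up for illustration.

#include <pthread.h>
#include <stddef.h>

struct fake_vdev_disk {
	pthread_rwlock_t vd_lock;	/* protects vd_bdev */
	void *vd_bdev;			/* NULL while closed or reopening */
};

static int
fake_io_start(struct fake_vdev_disk *vd)
{
	pthread_rwlock_rdlock(&vd->vd_lock);
	if (vd->vd_bdev == NULL) {
		/* Closed (e.g. mid-reopen): fail the I/O rather than crash. */
		pthread_rwlock_unlock(&vd->vd_lock);
		return (-1);
	}
	/* ... dispatch the request against vd->vd_bdev here ... */
	pthread_rwlock_unlock(&vd->vd_lock);
	return (0);
}

static void
fake_reopen(struct fake_vdev_disk *vd, void *new_bdev)
{
	pthread_rwlock_wrlock(&vd->vd_lock);
	vd->vd_bdev = NULL;	/* close the old handle, reread partitions */
	/* ... */
	vd->vd_bdev = new_bdev;	/* publish the new handle */
	pthread_rwlock_unlock(&vd->vd_lock);
}

int
main(void)
{
	struct fake_vdev_disk vd;
	void *handle = &vd;		/* stand-in for a block device handle */

	pthread_rwlock_init(&vd.vd_lock, NULL);
	vd.vd_bdev = NULL;

	fake_reopen(&vd, handle);	/* writer: publish an open handle */
	(void) fake_io_start(&vd);	/* reader: safe to dispatch I/O */

	pthread_rwlock_destroy(&vd.vd_lock);
	return (0);
}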