aboutsummaryrefslogtreecommitdiffstats
path: root/module/os
diff options
context:
space:
mode:
authorMatthew Ahrens <[email protected]>2021-01-28 09:28:20 -0800
committerGitHub <[email protected]>2021-01-28 09:28:20 -0800
commitf8c0d7e1f6a2c7efebea004afbab436d8d20b1e7 (patch)
tree01a602c89e807fdbb4e816b5dfe60b7fcbe40352 /module/os
parent0ae184a6baaf71e155e9b19af81b75474622ff58 (diff)
fix abd_nr_pages_off for gang abd
`__vdev_disk_physio()` uses `abd_nr_pages_off()` to allocate a bio with a sufficient number of iovec's to process this zio (i.e. `nr_iovecs`/`bi_max_vecs`). If there are not enough iovec's in the bio, then additional bio's will be allocated. However, this is a sub-optimal code path. In particular, it requires several abd calls (to `abd_nr_pages_off()` and `abd_bio_map_off()`) which will have to walk the constituents of the ABD (the pages or the gang children) because they are looking for offsets > 0. For gang ABD's, `abd_nr_pages_off()` returns the number of iovec's needed for the first constituent, rather than the sum of all constituents (within the requested range). This always under-estimates the required number of iovec's, which causes us to always need several bio's. The end result is that `__vdev_disk_physio()` is usually O(n^2) for gang ABD's (and occasionally O(n^3), when more than 16 bio's are needed). This commit fixes `abd_nr_pages_off()`'s handling of gang ABD's, to correctly determine how many iovec's are needed, by adding up the number of iovec's for each of the gang children in the requested range. Reviewed-by: Mark Maybee <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Brian Atkinson <[email protected]> Signed-off-by: Matthew Ahrens <[email protected]> Closes #11536
Diffstat (limited to 'module/os')
-rw-r--r--module/os/linux/zfs/abd_os.c24
-rw-r--r--module/os/linux/zfs/vdev_disk.c49
2 files changed, 40 insertions, 33 deletions
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 352d14de1..d82e5f4dc 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -925,17 +925,28 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
unsigned long pos;
- while (abd_is_gang(abd))
- abd = abd_gang_get_offset(abd, &off);
+ if (abd_is_gang(abd)) {
+ unsigned long count = 0;
+
+ for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+ cabd != NULL && size != 0;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT3U(off, <, cabd->abd_size);
+ int mysize = MIN(size, cabd->abd_size - off);
+ count += abd_nr_pages_off(cabd, mysize, off);
+ size -= mysize;
+ off = 0;
+ }
+ return (count);
+ }
- ASSERT(!abd_is_gang(abd));
if (abd_is_linear(abd))
pos = (unsigned long)abd_to_buf(abd) + off;
else
pos = ABD_SCATTER(abd).abd_offset + off;
- return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
- (pos >> PAGE_SHIFT);
+ return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT));
}
static unsigned int
@@ -1010,7 +1021,6 @@ unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
unsigned int io_size, size_t off)
{
- int i;
struct abd_iter aiter;
ASSERT3U(io_size, <=, abd->abd_size - off);
@@ -1024,7 +1034,7 @@ abd_bio_map_off(struct bio *bio, abd_t *abd,
abd_iter_init(&aiter, abd);
abd_iter_advance(&aiter, off);
- for (i = 0; i < bio->bi_max_vecs; i++) {
+ for (int i = 0; i < bio->bi_max_vecs; i++) {
struct page *pg;
size_t len, sgoff, pgoff;
struct scatterlist *sg;
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 4bd27d1b5..b373f2c2e 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -350,19 +350,14 @@ vdev_disk_close(vdev_t *v)
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
- dio_request_t *dr;
- int i;
-
- dr = kmem_zalloc(sizeof (dio_request_t) +
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
sizeof (struct bio *) * bio_count, KM_SLEEP);
- if (dr) {
- atomic_set(&dr->dr_ref, 0);
- dr->dr_bio_count = bio_count;
- dr->dr_error = 0;
+ atomic_set(&dr->dr_ref, 0);
+ dr->dr_bio_count = bio_count;
+ dr->dr_error = 0;
- for (i = 0; i < dr->dr_bio_count; i++)
- dr->dr_bio[i] = NULL;
- }
+ for (int i = 0; i < dr->dr_bio_count; i++)
+ dr->dr_bio[i] = NULL;
return (dr);
}
@@ -536,8 +531,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
dio_request_t *dr;
uint64_t abd_offset;
uint64_t bio_offset;
- int bio_size, bio_count = 16;
- int i = 0, error = 0;
+ int bio_size;
+ int bio_count = 16;
+ int error = 0;
struct blk_plug plug;
/*
@@ -552,8 +548,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
retry:
dr = vdev_disk_dio_alloc(bio_count);
- if (dr == NULL)
- return (SET_ERROR(ENOMEM));
if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bio_set_flags_failfast(bdev, &flags);
@@ -561,26 +555,28 @@ retry:
dr->dr_zio = zio;
/*
- * When the IO size exceeds the maximum bio size for the request
- * queue we are forced to break the IO in multiple bio's and wait
- * for them all to complete. Ideally, all pool users will set
- * their volume block size to match the maximum request size and
- * the common case will be one bio per vdev IO request.
+ * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
+ * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
+ * can cover at least 128KB and at most 1MB. When the required number
+ * of iovec's exceeds this, we are forced to break the IO in multiple
+ * bio's and wait for them all to complete. This is likely if the
+ * recordsize property is increased beyond 1MB. The default
+ * bio_count=16 should typically accommodate the maximum-size zio of
+ * 16MB.
*/
abd_offset = 0;
bio_offset = io_offset;
- bio_size = io_size;
- for (i = 0; i <= dr->dr_bio_count; i++) {
+ bio_size = io_size;
+ for (int i = 0; i <= dr->dr_bio_count; i++) {
/* Finished constructing bio's for given buffer */
if (bio_size <= 0)
break;
/*
- * By default only 'bio_count' bio's per dio are allowed.
- * However, if we find ourselves in a situation where more
- * are needed we allocate a larger dio and warn the user.
+ * If additional bio's are required, we have to retry, but
+ * this should be rare - see the comment above.
*/
if (dr->dr_bio_count == i) {
vdev_disk_dio_free(dr);
@@ -622,9 +618,10 @@ retry:
blk_start_plug(&plug);
/* Submit all bio's associated with this dio */
- for (i = 0; i < dr->dr_bio_count; i++)
+ for (int i = 0; i < dr->dr_bio_count; i++) {
if (dr->dr_bio[i])
vdev_submit_bio(dr->dr_bio[i]);
+ }
if (dr->dr_bio_count > 1)
blk_finish_plug(&plug);