diff options
author | Matthew Ahrens <[email protected]> | 2021-01-28 09:28:20 -0800 |
---|---|---|
committer | GitHub <[email protected]> | 2021-01-28 09:28:20 -0800 |
commit | f8c0d7e1f6a2c7efebea004afbab436d8d20b1e7 (patch) | |
tree | 01a602c89e807fdbb4e816b5dfe60b7fcbe40352 /module/os | |
parent | 0ae184a6baaf71e155e9b19af81b75474622ff58 (diff) |
fix abd_nr_pages_off for gang abd
`__vdev_disk_physio()` uses `abd_nr_pages_off()` to allocate a bio with
a sufficient number of iovec's to process this zio (i.e.
`nr_iovecs`/`bi_max_vecs`). If there are not enough iovec's in the bio,
then additional bio's will be allocated. However, this is a sub-optimal
code path. In particular, it requires several abd calls (to
`abd_nr_pages_off()` and `abd_bio_map_off()`) which will have to walk
the constituents of the ABD (the pages or the gang children) because
they are looking for offsets > 0.
For gang ABD's, `abd_nr_pages_off()` returns the number of iovec's
needed for the first constituent, rather than the sum of all
constituents (within the requested range). This always under-estimates
the required number of iovec's, which causes us to always need several
bio's. The end result is that `__vdev_disk_physio()` is usually O(n^2)
for gang ABD's (and occasionally O(n^3), when more than 16 bio's are
needed).
This commit fixes `abd_nr_pages_off()`'s handling of gang ABD's, to
correctly determine how many iovec's are needed, by adding up the number
of iovec's for each of the gang children in the requested range.
Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Brian Atkinson <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #11536
Diffstat (limited to 'module/os')
-rw-r--r-- | module/os/linux/zfs/abd_os.c | 24 | ||||
-rw-r--r-- | module/os/linux/zfs/vdev_disk.c | 49 |
2 files changed, 40 insertions, 33 deletions
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 352d14de1..d82e5f4dc 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -925,17 +925,28 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) { unsigned long pos; - while (abd_is_gang(abd)) - abd = abd_gang_get_offset(abd, &off); + if (abd_is_gang(abd)) { + unsigned long count = 0; + + for (abd_t *cabd = abd_gang_get_offset(abd, &off); + cabd != NULL && size != 0; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT3U(off, <, cabd->abd_size); + int mysize = MIN(size, cabd->abd_size - off); + count += abd_nr_pages_off(cabd, mysize, off); + size -= mysize; + off = 0; + } + return (count); + } - ASSERT(!abd_is_gang(abd)); if (abd_is_linear(abd)) pos = (unsigned long)abd_to_buf(abd) + off; else pos = ABD_SCATTER(abd).abd_offset + off; - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); + return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT)); } static unsigned int @@ -1010,7 +1021,6 @@ unsigned int abd_bio_map_off(struct bio *bio, abd_t *abd, unsigned int io_size, size_t off) { - int i; struct abd_iter aiter; ASSERT3U(io_size, <=, abd->abd_size - off); @@ -1024,7 +1034,7 @@ abd_bio_map_off(struct bio *bio, abd_t *abd, abd_iter_init(&aiter, abd); abd_iter_advance(&aiter, off); - for (i = 0; i < bio->bi_max_vecs; i++) { + for (int i = 0; i < bio->bi_max_vecs; i++) { struct page *pg; size_t len, sgoff, pgoff; struct scatterlist *sg; diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 4bd27d1b5..b373f2c2e 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -350,19 +350,14 @@ vdev_disk_close(vdev_t *v) static dio_request_t * vdev_disk_dio_alloc(int bio_count) { - dio_request_t *dr; - int i; - - dr = kmem_zalloc(sizeof (dio_request_t) + + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); - if (dr) { - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; - for (i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - } + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; return (dr); } @@ -536,8 +531,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; - int bio_size, bio_count = 16; - int i = 0, error = 0; + int bio_size; + int bio_count = 16; + int error = 0; struct blk_plug plug; /* @@ -552,8 +548,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, retry: dr = vdev_disk_dio_alloc(bio_count); - if (dr == NULL) - return (SET_ERROR(ENOMEM)); if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bio_set_flags_failfast(bdev, &flags); @@ -561,26 +555,28 @@ retry: dr->dr_zio = zio; /* - * When the IO size exceeds the maximum bio size for the request - * queue we are forced to break the IO in multiple bio's and wait - * for them all to complete. Ideally, all pool users will set - * their volume block size to match the maximum request size and - * the common case will be one bio per vdev IO request. + * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which + * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio + * can cover at least 128KB and at most 1MB. When the required number + * of iovec's exceeds this, we are forced to break the IO in multiple + * bio's and wait for them all to complete. This is likely if the + * recordsize property is increased beyond 1MB. The default + * bio_count=16 should typically accommodate the maximum-size zio of + * 16MB. */ abd_offset = 0; bio_offset = io_offset; - bio_size = io_size; - for (i = 0; i <= dr->dr_bio_count; i++) { + bio_size = io_size; + for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* - * By default only 'bio_count' bio's per dio are allowed. - * However, if we find ourselves in a situation where more - * are needed we allocate a larger dio and warn the user. + * If additional bio's are required, we have to retry, but + * this should be rare - see the comment above. */ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); @@ -622,9 +618,10 @@ retry: blk_start_plug(&plug); /* Submit all bio's associated with this dio */ - for (i = 0; i < dr->dr_bio_count; i++) + for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); + } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); |