aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2012-08-19 17:17:02 -0700
committerBrian Behlendorf <[email protected]>2012-08-27 12:01:37 -0700
commit86dd0fd9222b6103c6533036c47b908ece944460 (patch)
tree4dc2e9bcfcca485ccbc6dc11e255748b8023faad /module
parent44f21da41c441bfceec7b825991b6e68321d78a2 (diff)
Pre-allocate vdev I/O buffers
The vdev queue layer may require a small number of buffers when attempting to create aggregate I/O requests. Rather than attempting to allocate them from the global zio buffers, which is slow under memory pressure, it makes sense to pre-allocate them because... 1) These buffers are short lived. They are only required for the life of a single I/O at which point they can be used by the next I/O. 2) The maximum number of concurrent buffers needed by a vdev is small. It's roughly limited by the zfs_vdev_max_pending tunable which defaults to 10. By keeping a small list of these buffer per-vdev we can ensure one is always available when we need it. This significantly reduces contention on the vq->vq_lock, because we no longer need to perform a slow allocation under this lock. This is particularly important when memory is already low on the system. It would probably be wise to extend the use of these buffers beyond aggregate I/O and in to the raidz implementation. The inability to quickly allocate buffer for the parity stripes could result in similiar problems. Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module')
-rw-r--r--module/zfs/vdev_queue.c36
-rw-r--r--module/zfs/zio.c22
2 files changed, 56 insertions, 2 deletions
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index aacc55c49..7ba638952 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -106,6 +106,7 @@ void
vdev_queue_init(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
+ int i;
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -120,18 +121,36 @@ vdev_queue_init(vdev_t *vd)
avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ /*
+ * A list of buffers which can be used for aggregate I/O, this
+ * avoids the need to allocate them on demand when memory is low.
+ */
+ list_create(&vq->vq_io_list, sizeof (vdev_io_t),
+ offsetof(vdev_io_t, vi_node));
+
+ for (i = 0; i < zfs_vdev_max_pending; i++)
+ list_insert_tail(&vq->vq_io_list, zio_vdev_alloc());
}
void
vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
+ vdev_io_t *vi;
avl_destroy(&vq->vq_deadline_tree);
avl_destroy(&vq->vq_read_tree);
avl_destroy(&vq->vq_write_tree);
avl_destroy(&vq->vq_pending_tree);
+ while ((vi = list_head(&vq->vq_io_list)) != NULL) {
+ list_remove(&vq->vq_io_list, vi);
+ zio_vdev_free(vi);
+ }
+
+ list_destroy(&vq->vq_io_list);
+
mutex_destroy(&vq->vq_lock);
}
@@ -152,6 +171,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
static void
vdev_queue_agg_io_done(zio_t *aio)
{
+ vdev_queue_t *vq = &aio->io_vd->vdev_queue;
+ vdev_io_t *vi = aio->io_data;
zio_t *pio;
while ((pio = zio_walk_parents(aio)) != NULL)
@@ -159,7 +180,9 @@ vdev_queue_agg_io_done(zio_t *aio)
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
- zio_buf_free(aio->io_data, aio->io_size);
+ mutex_enter(&vq->vq_lock);
+ list_insert_tail(&vq->vq_io_list, vi);
+ mutex_exit(&vq->vq_lock);
}
/*
@@ -176,6 +199,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
zio_t *fio, *lio, *aio, *dio, *nio, *mio;
avl_tree_t *t;
+ vdev_io_t *vi;
int flags;
uint64_t maxspan = zfs_vdev_aggregation_limit;
uint64_t maxgap;
@@ -194,6 +218,12 @@ again:
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
+ vi = list_head(&vq->vq_io_list);
+ if (vi == NULL) {
+ vi = zio_vdev_alloc();
+ list_insert_head(&vq->vq_io_list, vi);
+ }
+
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/*
* We can aggregate I/Os that are sufficiently adjacent and of
@@ -283,9 +313,10 @@ again:
if (fio != lio) {
uint64_t size = IO_SPAN(fio, lio);
ASSERT(size <= zfs_vdev_aggregation_limit);
+ ASSERT(vi != NULL);
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
- zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
+ vi, size, fio->io_type, ZIO_PRIORITY_AGG,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
@@ -313,6 +344,7 @@ again:
} while (dio != lio);
avl_add(&vq->vq_pending_tree, aio);
+ list_remove(&vq->vq_io_list, vi);
return (aio);
}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 206ed9a93..fe2bdc867 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -73,6 +73,7 @@ char *zio_type_name[ZIO_TYPES] = {
*/
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_vdev_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
int zio_bulk_flags = 0;
@@ -141,6 +142,8 @@ zio_init(void)
zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
+ zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t),
+ PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM);
/*
* For small buffers, we want a cache for each multiple of
@@ -230,6 +233,7 @@ zio_fini(void)
zio_data_buf_cache[c] = NULL;
}
+ kmem_cache_destroy(zio_vdev_cache);
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache);
@@ -295,6 +299,24 @@ zio_data_buf_free(void *buf, size_t size)
}
/*
+ * Dedicated I/O buffers to ensure that memory fragmentation never prevents
+ * or significantly delays the issuing of a zio. These buffers are used
+ * to aggregate I/O and could be used for raidz stripes.
+ */
+void *
+zio_vdev_alloc(void)
+{
+ return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE));
+}
+
+void
+zio_vdev_free(void *buf)
+{
+ kmem_cache_free(zio_vdev_cache, buf);
+
+}
+
+/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================