diff options
author:    Alan Somers <[email protected]>    2021-01-11 17:00:19 -0700
committer: Brian Behlendorf <[email protected]>    2021-01-26 19:35:59 -0800
commit:    a0e01997ec013fa24a570a36689235ee6c21d9e1 (patch)
tree:      3a4f7e1ad7e950e725b161d432283d22aff26eca
parent:    dfb44c500e0b05b2e3ff058e6e5cbf2431d4f80b (diff)
Parallelize vdev_load
metaslab_init is the slowest part of importing a mature pool, and it
must be repeated hundreds of times for each top-level vdev. But its
speed is dominated by a few serialized disk accesses. That can lead to
import times of > 1 hour for pools with many top-level vdevs on spinny
disks.
Speed up the import by using a taskqueue to parallelize vdev_load across
all top-level vdevs.
This also requires adding mutex protection to
metaslab_class_t.mc_histogram. The mc_histogram fields were
unprotected when that code was first written in "Illumos 4976-4984 -
metaslab improvements" (OpenZFS
f3a7f6610f2df0217ba3b99099019417a954b673). The lock wasn't added until
3dfb57a35e8cbaa7c424611235d669f3c575ada1, though it's unclear exactly
which fields it's supposed to protect. In any case, it wasn't until
vdev_load was parallelized that any code attempted concurrent access to
those fields.
Sponsored by: Axcient
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Alan Somers <[email protected]>
Closes #11470
-rw-r--r--   include/sys/vdev_impl.h |  1
-rw-r--r--   module/zfs/metaslab.c   |  6
-rw-r--r--   module/zfs/vdev.c       | 42
3 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 1239451bf..c509d390f 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -269,6 +269,7 @@ struct vdev {
 	boolean_t	vdev_expanding;	/* expand the vdev? */
 	boolean_t	vdev_reopening;	/* reopen in progress? */
 	boolean_t	vdev_nonrot;	/* true if solid state */
+	int		vdev_load_error; /* error on last load */
 	int		vdev_open_error; /* error on last open */
 	kthread_t	*vdev_open_thread; /* thread opening children */
 	uint64_t	vdev_crtxg;	/* txg when top-level was added */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index fdc6922b0..bc4f007b6 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -522,6 +522,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
+	mutex_enter(&mc->mc_lock);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
@@ -546,6 +547,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 	}
 
+	mutex_exit(&mc->mc_lock);
 	kmem_free(mc_hist,
 	    sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
@@ -1067,6 +1069,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 		return;
 
 	mutex_enter(&mg->mg_lock);
+	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
@@ -1075,6 +1078,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 		mc->mc_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 	}
+	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
@@ -1089,6 +1093,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		return;
 
 	mutex_enter(&mg->mg_lock);
+	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
@@ -1102,6 +1107,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		mc->mc_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 	}
+	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index f305da6f5..018e48c38 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1725,6 +1725,14 @@ vdev_probe(vdev_t *vd, zio_t *zio)
 }
 
 static void
+vdev_load_child(void *arg)
+{
+	vdev_t *vd = arg;
+
+	vd->vdev_load_error = vdev_load(vd);
+}
+
+static void
 vdev_open_child(void *arg)
 {
 	vdev_t *vd = arg;
@@ -3350,18 +3358,46 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 int
 vdev_load(vdev_t *vd)
 {
+	int children = vd->vdev_children;
 	int error = 0;
+	taskq_t *tq = NULL;
+
+	/*
+	 * It's only worthwhile to use the taskq for the root vdev, because the
+	 * slow part is metaslab_init, and that only happens for top-level
+	 * vdevs.
+	 */
+	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
+		tq = taskq_create("vdev_load", children, minclsyspri,
+		    children, children, TASKQ_PREPOPULATE);
+	}
 
 	/*
 	 * Recursively load all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
-		error = vdev_load(vd->vdev_child[c]);
-		if (error != 0) {
-			return (error);
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (tq == NULL || vdev_uses_zvols(cvd)) {
+			cvd->vdev_load_error = vdev_load(cvd);
+		} else {
+			VERIFY(taskq_dispatch(tq, vdev_load_child,
+			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
+	if (tq != NULL) {
+		taskq_wait(tq);
+		taskq_destroy(tq);
+	}
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		int error = vd->vdev_child[c]->vdev_load_error;
+
+		if (error != 0)
+			return (error);
+	}
+
 	vdev_set_deflate_ratio(vd);
 
 	/*