summaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
authorArne Jansen <[email protected]>2015-05-06 09:07:55 -0700
committerBrian Behlendorf <[email protected]>2015-06-09 13:48:02 -0700
commit9c43027b3f18769f2ace16eaa222ac8b301501f4 (patch)
tree90a3577807c203ca9022234405b30e7a7fdcad2d /module/zfs
parentd050c627b5fdfaf72eac90bc07e03fcb2d8a123f (diff)
Illumos 5269 - zpool import slow
5269 zpool import slow Reviewed by: Matthew Ahrens <[email protected]> Reviewed by: George Wilson <[email protected]> Reviewed by: Dan McDonald <[email protected]> Approved by: Dan McDonald <[email protected]> References: https://www.illumos.org/issues/5269 https://github.com/illumos/illumos-gate/commit/12380e1e Ported-by: DHE <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #3396
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/dmu_objset.c221
-rw-r--r--module/zfs/dsl_pool.c10
-rw-r--r--module/zfs/spa.c11
-rw-r--r--module/zfs/vdev.c21
-rw-r--r--module/zfs/zil.c23
5 files changed, 228 insertions, 58 deletions
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 888c251ce..6ef6adbd9 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -24,6 +24,7 @@
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -48,6 +49,7 @@
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
@@ -55,6 +57,16 @@
*/
krwlock_t os_lock;
+/*
+ * Tunable to overwrite the maximum number of threads for the parallization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
void
dmu_objset_init(void)
{
@@ -504,6 +516,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
return (err);
}
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ int err;
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, tag);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EINVAL));
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EROFS));
+ }
+ return (err);
+}
+
/*
* dsl_pool must not be held when this is called.
* Upon successful return, there will be a longhold on the dataset,
@@ -525,21 +556,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
dsl_pool_rele(dp, FTAG);
return (err);
}
-
- err = dmu_objset_from_ds(ds, osp);
+ err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
dsl_pool_rele(dp, FTAG);
- if (err != 0) {
- dsl_dataset_disown(ds, tag);
- } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
- dsl_dataset_disown(ds, tag);
- return (SET_ERROR(EINVAL));
- } else if (!readonly && ds->ds_is_snapshot) {
- dsl_dataset_disown(ds, tag);
- return (SET_ERROR(EROFS));
- }
+
return (err);
}
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
void
dmu_objset_rele(objset_t *os, void *tag)
{
@@ -1618,30 +1654,41 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
return (0);
}
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
- int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+typedef struct dmu_objset_find_ctx {
+ taskq_t *dc_tq;
+ dsl_pool_t *dc_dp;
+ uint64_t dc_ddobj;
+ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+ void *dc_arg;
+ int dc_flags;
+ kmutex_t *dc_error_lock;
+ int *dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
+ dsl_pool_t *dp = dcp->dc_dp;
+ dmu_objset_find_ctx_t *child_dcp;
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
- int err;
+ int err = 0;
- ASSERT(dsl_pool_config_held(dp));
+ /* don't process if there already was an error */
+ if (*dcp->dc_error != 0)
+ goto out;
- err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
if (err != 0)
- return (err);
+ goto out;
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
- return (0);
+ goto out;
}
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
@@ -1650,7 +1697,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
/*
* Iterate over all children.
*/
- if (flags & DS_FIND_CHILDREN) {
+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
@@ -1659,24 +1706,22 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
- err = dmu_objset_find_dp(dp, attr->za_first_integer,
- func, arg, flags);
- if (err != 0)
- break;
+ child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+ *child_dcp = *dcp;
+ child_dcp->dc_ddobj = attr->za_first_integer;
+ if (dcp->dc_tq != NULL)
+ (void) taskq_dispatch(dcp->dc_tq,
+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+ else
+ dmu_objset_find_dp_impl(child_dcp);
}
zap_cursor_fini(&zc);
-
- if (err != 0) {
- dsl_dir_rele(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- return (err);
- }
}
/*
* Iterate over all snapshots.
*/
- if (flags & DS_FIND_SNAPSHOTS) {
+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
@@ -1697,7 +1742,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
- err = func(dp, ds, arg);
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
@@ -1710,17 +1755,115 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
kmem_free(attr, sizeof (zap_attribute_t));
if (err != 0)
- return (err);
+ goto out;
/*
* Apply to self.
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err != 0)
- return (err);
- err = func(dp, ds, arg);
+ goto out;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
- return (err);
+
+out:
+ if (err != 0) {
+ mutex_enter(dcp->dc_error_lock);
+ /* only keep first error */
+ if (*dcp->dc_error == 0)
+ *dcp->dc_error = err;
+ mutex_exit(dcp->dc_error_lock);
+ }
+
+ kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+ dmu_objset_find_ctx_t *dcp = arg;
+ dsl_pool_t *dp = dcp->dc_dp;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ dmu_objset_find_dp_impl(dcp);
+
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+ int error = 0;
+ taskq_t *tq = NULL;
+ int ntasks;
+ dmu_objset_find_ctx_t *dcp;
+ kmutex_t err_lock;
+
+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+ dcp->dc_tq = NULL;
+ dcp->dc_dp = dp;
+ dcp->dc_ddobj = ddobj;
+ dcp->dc_func = func;
+ dcp->dc_arg = arg;
+ dcp->dc_flags = flags;
+ dcp->dc_error_lock = &err_lock;
+ dcp->dc_error = &error;
+
+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+ /*
+ * In case a write lock is held we can't make use of
+ * parallelism, as down the stack of the worker threads
+ * the lock is asserted via dsl_pool_config_held.
+ * In case of a read lock this is solved by getting a read
+ * lock in each worker thread, which isn't possible in case
+ * of a writer lock. So we fall back to the synchronous path
+ * here.
+ * In the future it might be possible to get some magic into
+ * dsl_pool_config_held in a way that it returns true for
+ * the worker threads so that a single lock held from this
+ * thread suffices. For now, stay single threaded.
+ */
+ dmu_objset_find_dp_impl(dcp);
+
+ return (error);
+ }
+
+ ntasks = dmu_find_threads;
+ if (ntasks == 0)
+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+ tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+ INT_MAX, 0);
+ if (tq == NULL) {
+ kmem_free(dcp, sizeof (*dcp));
+ return (SET_ERROR(ENOMEM));
+ }
+ dcp->dc_tq = tq;
+
+ /* dcp will be freed by task */
+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+ /*
+ * PORTING: this code relies on the property of taskq_wait to wait
+ * until no more tasks are queued and no more tasks are active. As
+ * we always queue new tasks from within other tasks, task_wait
+ * reliably waits for the full recursion to finish, even though we
+ * enqueue new tasks after taskq_wait has been called.
+ * On platforms other than illumos, taskq_wait may not have this
+ * property.
+ */
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ mutex_destroy(&err_lock);
+
+ return (error);
}
/*
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 13961918e..7d84d8bf4 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -755,7 +755,7 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
ASSERT(dp->dp_origin_snap != NULL);
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
- tx, DS_FIND_CHILDREN));
+ tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
/* ARGSUSED */
@@ -810,7 +810,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
void
@@ -1055,6 +1055,12 @@ dsl_pool_config_held(dsl_pool_t *dp)
return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+ return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dsl_pool_config_enter);
EXPORT_SYMBOL(dsl_pool_config_exit);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 2a9ef9ce5..c37feb733 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1773,6 +1773,7 @@ static boolean_t
spa_check_logs(spa_t *spa)
{
boolean_t rv = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
switch (spa->spa_log_state) {
default:
@@ -1780,8 +1781,8 @@ spa_check_logs(spa_t *spa)
case SPA_LOG_MISSING:
/* need to recheck in case slog has been restored */
case SPA_LOG_UNKNOWN:
- rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
- NULL, DS_FIND_CHILDREN) != 0);
+ rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
if (rv)
spa_set_log_state(spa, SPA_LOG_MISSING);
break;
@@ -2763,6 +2764,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
int c;
ASSERT(state != SPA_LOAD_TRYIMPORT);
@@ -2776,9 +2778,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
*/
spa->spa_claiming = B_TRUE;
- tx = dmu_tx_create_assigned(spa_get_dsl(spa),
- spa_first_txg(spa));
- (void) dmu_objset_find(spa_name(spa),
+ tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+ (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index fe6631925..d53537103 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -179,6 +179,27 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
return (NULL);
}
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+ int n = 0;
+ int c;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+ return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+ return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 3ae171e59..ff4d2cec0 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -660,7 +660,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
}
int
-zil_claim(const char *osname, void *txarg)
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -669,15 +669,16 @@ zil_claim(const char *osname, void *txarg)
objset_t *os;
int error;
- error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
+ error = dmu_objset_own_obj(dp, ds->ds_object,
+ DMU_OST_ANY, B_FALSE, FTAG, &os);
if (error != 0) {
/*
* EBUSY indicates that the objset is inconsistent, in which
* case it can not have a ZIL.
*/
if (error != EBUSY) {
- cmn_err(CE_WARN, "can't open objset for %s, error %u",
- osname, error);
+ cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+ (unsigned long long)ds->ds_object, error);
}
return (0);
@@ -725,8 +726,9 @@ zil_claim(const char *osname, void *txarg)
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
+/* ARGSUSED */
int
-zil_check_log_chain(const char *osname, void *tx)
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
zilog_t *zilog;
objset_t *os;
@@ -735,9 +737,10 @@ zil_check_log_chain(const char *osname, void *tx)
ASSERT(tx == NULL);
- error = dmu_objset_hold(osname, FTAG, &os);
+ error = dmu_objset_from_ds(ds, &os);
if (error != 0) {
- cmn_err(CE_WARN, "can't open objset for %s", osname);
+ cmn_err(CE_WARN, "can't open objset %llu, error %d",
+ (unsigned long long)ds->ds_object, error);
return (0);
}
@@ -760,10 +763,8 @@ zil_check_log_chain(const char *osname, void *tx)
valid = vdev_log_state_valid(vd);
spa_config_exit(os->os_spa, SCL_STATE, FTAG);
- if (!valid) {
- dmu_objset_rele(os, FTAG);
+ if (!valid)
return (0);
- }
}
/*
@@ -776,8 +777,6 @@ zil_check_log_chain(const char *osname, void *tx)
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
- dmu_objset_rele(os, FTAG);
-
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}