diff options
author | Matthew Ahrens <[email protected]> | 2020-03-12 10:55:02 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2020-03-12 10:55:02 -0700 |
commit | 0fdd6106bbd1b31322be87fee94c0c579ca459e6 (patch) | |
tree | 4568d0acc057501685d7eddbeadeb7f6ccfb7d42 /module/zfs | |
parent | fa130e010c2ff9b33aba11d2699b667e454b3ccb (diff) |
dmu_objset_from_ds must be called with dp_config_rwlock held
The normal lock order is that the dp_config_rwlock must be held before
the ds_opening_lock. For example, dmu_objset_hold() does this.
However, dmu_objset_open_impl() is called with the ds_opening_lock held,
and if the dp_config_rwlock is not already held, it will attempt to
acquire it. This may lead to deadlock, since the lock order is
reversed.
Looking at all the callers of dmu_objset_open_impl() (which is
principally the callers of dmu_objset_from_ds()), almost all callers
already have the dp_config_rwlock. However, there are a few places in
the send and receive code paths that do not. For example:
dsl_crypto_populate_key_nvlist, send_cb, dmu_recv_stream,
receive_write_byref, redact_traverse_thread.
This commit resolves the problem by requiring all callers ot
dmu_objset_from_ds() to hold the dp_config_rwlock. In most cases, the
code has been restructured such that we call dmu_objset_from_ds()
earlier on in the send and receive processes, when we already have the
dp_config_rwlock, and save the objset_t until we need it in the middle
of the send or receive (similar to what we already do with the
dsl_dataset_t). Thus we do not need to acquire the dp_config_rwlock in
many new places.
I also cleaned up code in dmu_redact_snap() and send_traverse_thread().
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Paul Zuchowski <[email protected]>
Signed-off-by: Matthew Ahrens <[email protected]>
Closes #9662
Closes #10115
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/dmu_objset.c | 27 | ||||
-rw-r--r-- | module/zfs/dmu_recv.c | 41 | ||||
-rw-r--r-- | module/zfs/dmu_redact.c | 94 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 41 | ||||
-rw-r--r-- | module/zfs/dsl_crypt.c | 5 |
5 files changed, 94 insertions, 114 deletions
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index a91ecb640..5b8069bd6 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -415,6 +415,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(!BP_IS_REDACTED(bp)); /* + * We need the pool config lock to get properties. + */ + ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool)); + + /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. The $ORIGIN dataset * will not exist on pools older than SPA_VERSION_ORIGIN. @@ -503,20 +508,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. */ if (ds != NULL) { - boolean_t needlock = B_FALSE; - os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0); - /* - * Note: it's valid to open the objset if the dataset is - * long-held, in which case the pool_config lock will not - * be held. - */ - if (!dsl_pool_config_held(dmu_objset_pool(os))) { - needlock = B_TRUE; - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - } - err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -579,8 +572,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, smallblk_changed_cb, os); } } - if (needlock) - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); @@ -650,11 +641,11 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) int err = 0; /* - * We shouldn't be doing anything with dsl_dataset_t's unless the - * pool_config lock is held, or the dataset is long-held. + * We need the pool_config lock to manipulate the dsl_dataset_t. + * Even if the dataset is long-held, we need the pool_config lock + * to open the objset, as it needs to get properties. */ - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || - dsl_dataset_long_held(ds)); + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 97a3c7cee..0fa0e015b 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -123,7 +123,7 @@ struct receive_writer_arg { typedef struct guid_map_entry { uint64_t guid; boolean_t raw; - dsl_dataset_t *gme_ds; + objset_t *gme_os; avl_node_t avlnode; } guid_map_entry_t; @@ -914,6 +914,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) rrw_exit(&newds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = newds; + drba->drba_cookie->drc_os = os; spa_history_log_internal_ds(newds, "receive", tx, " "); } @@ -1095,6 +1096,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; + VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); drba->drba_cookie->drc_should_save = B_TRUE; spa_history_log_internal_ds(ds, "resume receive", tx, " "); @@ -1227,11 +1229,12 @@ free_guid_map_onexit(void *arg) ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT; if (gmep->raw) { - gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE; + gmep->gme_os->os_raw_receive = B_FALSE; dsflags &= ~DS_HOLD_FLAG_DECRYPT; } - dsl_dataset_disown(gmep->gme_ds, dsflags, gmep); + dsl_dataset_disown(gmep->gme_os->os_dsl_dataset, + dsflags, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); @@ -1797,8 +1800,7 @@ receive_write_byref(struct receive_writer_arg *rwa, &where)) == NULL) { return (SET_ERROR(EINVAL)); } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) - return (SET_ERROR(EINVAL)); + ref_os = gmep->gme_os; } else { ref_os = rwa->os; } @@ -2661,10 +2663,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); - /* - * Open the objset we are modifying. - */ - VERIFY0(dmu_objset_from_ds(drc->drc_ds, &drc->drc_os)); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); ASSERT0(drc->drc_os->os_encrypted && (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); @@ -3013,6 +3011,12 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); + /* + * The objset was evicted by dsl_dataset_clone_swap_sync_impl, + * so drc_os is no longer valid. + */ + drc->drc_os = NULL; + dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); @@ -3127,26 +3131,25 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj, err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds); if (err == 0) { + err = dmu_objset_from_ds(snapds, &os); + if (err != 0) { + dsl_dataset_disown(snapds, dsflags, FTAG); + dsl_pool_rele(dp, FTAG); + kmem_free(gmep, sizeof (*gmep)); + return (err); + } /* * If this is a deduplicated raw send stream, we need * to make sure that we can still read raw blocks from * earlier datasets in the stream, so we set the * os_raw_receive flag now. */ - if (raw) { - err = dmu_objset_from_ds(snapds, &os); - if (err != 0) { - dsl_dataset_disown(snapds, dsflags, FTAG); - dsl_pool_rele(dp, FTAG); - kmem_free(gmep, sizeof (*gmep)); - return (err); - } + if (raw) os->os_raw_receive = B_TRUE; - } gmep->raw = raw; gmep->guid = dsl_dataset_phys(snapds)->ds_guid; - gmep->gme_ds = snapds; + gmep->gme_os = os; avl_add(guid_map, gmep); } else { kmem_free(gmep, sizeof (*gmep)); diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 2c4e6117c..df10d8d6f 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -80,6 +80,7 @@ struct redact_record { struct redact_thread_arg { bqueue_t q; + objset_t *os; /* Objset to traverse */ dsl_dataset_t *ds; /* Dataset to traverse */ struct redact_record *current_record; int error_code; @@ -355,11 +356,9 @@ redact_traverse_thread(void *arg) struct redact_thread_arg *rt_arg = arg; int err; struct redact_record *data; - objset_t *os; - VERIFY0(dmu_objset_from_ds(rt_arg->ds, &os)); #ifdef _KERNEL - if (os->os_phys->os_type == DMU_OST_ZFS) - rt_arg->deleted_objs = zfs_get_deleteq(os); + if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS) + rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os); else rt_arg->deleted_objs = objlist_create(); #else @@ -1004,9 +1003,8 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, int err = 0; dsl_pool_t *dp = NULL; dsl_dataset_t *ds = NULL; - objset_t *os; int numsnaps = 0; - dsl_dataset_t **redactsnaparr = NULL; + objset_t *os; struct redact_thread_arg *args = NULL; redaction_list_t *new_rl = NULL; @@ -1026,39 +1024,45 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, err = EALREADY; goto out; } - nvpair_t *pair; - if (fnvlist_num_pairs(redactnvl) > 0 && err == 0) { - redactsnaparr = kmem_zalloc(fnvlist_num_pairs(redactnvl) * - sizeof (dsl_dataset_t *), KM_SLEEP); - } - for (pair = nvlist_next_nvpair(redactnvl, NULL); err == 0 && - pair != NULL; pair = nvlist_next_nvpair(redactnvl, pair)) { + numsnaps = fnvlist_num_pairs(redactnvl); + if (numsnaps > 0) + args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + + nvpair_t *pair = NULL; + for (int i = 0; i < numsnaps; i++) { + pair = nvlist_next_nvpair(redactnvl, pair); const char *name = nvpair_name(pair); + struct redact_thread_arg *rta = &args[i]; err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT, - FTAG, redactsnaparr + numsnaps); + FTAG, &rta->ds); + if (err != 0) + break; + /* + * We want to do the long hold before we can get any other + * errors, because the cleanup code will release the long + * hold if rta->ds is filled in. + */ + dsl_dataset_long_hold(rta->ds, FTAG); + + err = dmu_objset_from_ds(rta->ds, &rta->os); if (err != 0) break; - dsl_dataset_long_hold(redactsnaparr[numsnaps], FTAG); - if (!dsl_dataset_is_before(redactsnaparr[numsnaps], ds, 0)) { + if (!dsl_dataset_is_before(rta->ds, ds, 0)) { err = EINVAL; - numsnaps++; break; } - if (dsl_dataset_feature_is_active(redactsnaparr[numsnaps], + if (dsl_dataset_feature_is_active(rta->ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; - numsnaps++; break; } - numsnaps++; } + VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); if (err != 0) goto out; - ASSERT3U(fnvlist_num_pairs(redactnvl), ==, numsnaps); - boolean_t resuming = B_FALSE; char newredactbook[ZFS_MAX_DATASET_NAME_LEN]; zfs_bookmark_phys_t bookmark; @@ -1091,15 +1095,14 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, goto out; } for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, new_rl->rl_phys->rlp_num_snaps, - dsl_dataset_phys(redactsnaparr[i])->ds_guid)) { + dsl_dataset_phys(rta->ds)->ds_guid)) { err = ESRCH; goto out; } } - if (numsnaps > 0) - args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && new_rl->rl_phys->rlp_last_object == UINT64_MAX) { err = EEXIST; @@ -1112,10 +1115,11 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, if (numsnaps > 0) { guids = kmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); - args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); } - for (int i = 0; i < numsnaps; i++) - guids[i] = dsl_dataset_phys(redactsnaparr[i])->ds_guid; + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + guids[i] = dsl_dataset_phys(rta->ds)->ds_guid; + } dsl_pool_rele(dp, FTAG); dp = NULL; @@ -1128,18 +1132,18 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } for (int i = 0; i < numsnaps; i++) { - args[i].ds = redactsnaparr[i]; - (void) bqueue_init(&args[i].q, zfs_redact_queue_ff, + struct redact_thread_arg *rta = &args[i]; + (void) bqueue_init(&rta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); if (resuming) { - args[i].resume.zb_blkid = + rta->resume.zb_blkid = new_rl->rl_phys->rlp_last_blkid; - args[i].resume.zb_object = + rta->resume.zb_object = new_rl->rl_phys->rlp_last_object; } - args[i].txg = dsl_dataset_phys(ds)->ds_creation_txg; - (void) thread_create(NULL, 0, redact_traverse_thread, &args[i], + rta->txg = dsl_dataset_phys(ds)->ds_creation_txg; + (void) thread_create(NULL, 0, redact_traverse_thread, rta, 0, curproc, TS_RUN, minclsyspri); } struct redact_merge_thread_arg rmta = { { {0} } }; @@ -1152,23 +1156,25 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, TS_RUN, minclsyspri); err = perform_redaction(os, new_rl, &rmta); out: - if (args != NULL) { - kmem_free(args, numsnaps * sizeof (*args)); - } if (new_rl != NULL) { dsl_redaction_list_long_rele(new_rl, FTAG); dsl_redaction_list_rele(new_rl, FTAG); } for (int i = 0; i < numsnaps; i++) { - dsl_dataset_long_rele(redactsnaparr[i], FTAG); - dsl_dataset_rele_flags(redactsnaparr[i], DS_HOLD_FLAG_DECRYPT, - FTAG); + struct redact_thread_arg *rta = &args[i]; + /* + * rta->ds may be NULL if we got an error while filling + * it in. + */ + if (rta->ds != NULL) { + dsl_dataset_long_rele(rta->ds, FTAG); + dsl_dataset_rele_flags(rta->ds, + DS_HOLD_FLAG_DECRYPT, FTAG); + } } - if (redactsnaparr != NULL) { - kmem_free(redactsnaparr, fnvlist_num_pairs(redactnvl) * - sizeof (dsl_dataset_t *)); - } + if (args != NULL) + kmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 2c7dca23e..9069f7e7d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -115,8 +115,7 @@ overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) struct send_thread_arg { bqueue_t q; - dsl_dataset_t *ds; /* Dataset to traverse */ - redaction_list_t *redaction_list; + objset_t *os; /* Objset to traverse */ uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; @@ -1092,19 +1091,16 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); - ASSERT3P(sta->ds, !=, NULL); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ - objset_t *os; - VERIFY0(dmu_objset_from_ds(sta->ds, &os)); - if (os->os_encrypted && + if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb); zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", sta->ds->ds_object); + "object set %llu", dmu_objset_id(sta->os)); return (SET_ERROR(EIO)); } @@ -1211,10 +1207,7 @@ redact_list_cb(redact_block_phys_t *rb, void *arg) /* * This function kicks off the traverse_dataset. It also handles setting the * error code of the thread in case something goes wrong, and pushes the End of - * Stream record when the traverse_dataset call has finished. If there is no - * dataset to traverse, then we traverse the redaction list provided and enqueue - * records for that. If neither is provided, the thread immediately pushes an - * End of Stream marker. + * Stream record when the traverse_dataset call has finished. */ static void send_traverse_thread(void *arg) @@ -1224,20 +1217,9 @@ send_traverse_thread(void *arg) struct send_range *data; fstrans_cookie_t cookie = spl_fstrans_mark(); - if (st_arg->ds != NULL) { - ASSERT3P(st_arg->redaction_list, ==, NULL); - err = traverse_dataset_resume(st_arg->ds, - st_arg->fromtxg, &st_arg->resume, - st_arg->flags, send_cb, st_arg); - } else if (st_arg->redaction_list != NULL) { - struct redact_list_cb_arg rlcba = {0}; - rlcba.cancel = &st_arg->cancel; - rlcba.num_blocks_visited = st_arg->num_blocks_visited; - rlcba.q = &st_arg->q; - rlcba.mark_redact = B_FALSE; - err = dsl_redaction_list_traverse(st_arg->redaction_list, - &st_arg->resume, redact_list_cb, &rlcba); - } + err = traverse_dataset_resume(st_arg->os->os_dsl_dataset, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); if (err != EINTR) st_arg->error_code = err; @@ -2029,7 +2011,7 @@ create_begin_record(struct dmu_send_params *dspp, objset_t *os, } static void -setup_to_thread(struct send_thread_arg *to_arg, dsl_dataset_t *to_ds, +setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os, dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok) { VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff, @@ -2037,12 +2019,11 @@ setup_to_thread(struct send_thread_arg *to_arg, dsl_dataset_t *to_ds, offsetof(struct send_range, ln))); to_arg->error_code = 0; to_arg->cancel = B_FALSE; - to_arg->ds = to_ds; + to_arg->os = to_os; to_arg->fromtxg = fromtxg; to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA; if (rawok) to_arg->flags |= TRAVERSE_NO_DECRYPT; - to_arg->redaction_list = NULL; to_arg->num_blocks_visited = &dssp->dss_blocks; (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0, curproc, TS_RUN, minclsyspri); @@ -2489,7 +2470,7 @@ dmu_send_impl(struct dmu_send_params *dspp) nvlist_t *keynvl = NULL; ASSERT(os->os_encrypted); - err = dsl_crypto_populate_key_nvlist(to_ds, ivset_guid, + err = dsl_crypto_populate_key_nvlist(os, ivset_guid, &keynvl); if (err != 0) { fnvlist_free(nvl); @@ -2513,7 +2494,7 @@ dmu_send_impl(struct dmu_send_params *dspp) goto out; } - setup_to_thread(to_arg, to_ds, dssp, fromtxg, dspp->rawok); + setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok); setup_from_thread(from_arg, from_rl, dssp); setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp); setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os); diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 279de84b6..d69ccb122 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -2381,11 +2381,11 @@ dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj, } int -dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid, +dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid, nvlist_t **nvl_out) { int ret; - objset_t *os; + dsl_dataset_t *ds = os->os_dsl_dataset; dnode_t *mdn; uint64_t rddobj; nvlist_t *nvl = NULL; @@ -2403,7 +2403,6 @@ dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid, ASSERT(dckobj != 0); - VERIFY0(dmu_objset_from_ds(ds, &os)); mdn = DMU_META_DNODE(os); nvl = fnvlist_alloc(); |