diff options
| author | Matthew Ahrens <[email protected]> | 2013-09-04 07:00:57 -0500 |
|---|---|---|
| committer | Brian Behlendorf <[email protected]> | 2013-09-04 16:01:24 -0700 |
| commit | 13fe019870c8779bf2f5b3ff731b512cf89133ef (patch) | |
| tree | 67a9c6989bcb7c2ca6d0455c14713bcbf1899da6 /module/zfs | |
| parent | 6f1ffb06655008c9b519108ed29fbf03acd6e5de (diff) | |
Illumos #3464
3464 zfs synctask code needs restructuring
Reviewed by: Dan Kimmel <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
References:
https://www.illumos.org/issues/3464
illumos/illumos-gate@3b2aab18808792cbd248a12f1edf139b89833c13
Ported-by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #1495
Diffstat (limited to 'module/zfs')
37 files changed, 5316 insertions, 5324 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 81b1680e4..e71228454 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -93,3 +93,5 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_super.o $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_xattr.o $(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o $(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o +$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_destroy.o +$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_userhold.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index ce4a0239c..1298c5b91 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1643,12 +1643,12 @@ arc_buf_free(arc_buf_t *buf, void *tag) } } -int +boolean_t arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = NULL; - int no_callback = (buf->b_efunc == NULL); + boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { ASSERT(hdr->b_datacnt == 1); @@ -1854,7 +1854,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, ARCSTAT_INCR(arcstat_mutex_miss, missed); /* - * We have just evicted some date into the ghost state, make + * We have just evicted some data into the ghost state, make * sure we also adjust the ghost state size if necessary. 
*/ if (arc_no_grow && @@ -2772,7 +2772,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg) == 1); + VERIFY(arc_buf_remove_ref(buf, arg)); } /* a generic arc_done_func_t */ @@ -2781,7 +2781,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg) == 1); + VERIFY(arc_buf_remove_ref(buf, arg)); *bufp = NULL; } else { *bufp = buf; diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c index d196351dc..c3927e74a 100644 --- a/module/zfs/bplist.c +++ b/module/zfs/bplist.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/bplist.h> @@ -52,6 +53,12 @@ bplist_append(bplist_t *bpl, const blkptr_t *bp) mutex_exit(&bpl->bpl_lock); } +/* + * To aid debugging, we keep the most recently removed entry. This way if + * we are in the callback, we can easily locate the entry. 
+ */ +static bplist_entry_t *bplist_iterate_last_removed; + void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) { @@ -59,6 +66,7 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) mutex_enter(&bpl->bpl_lock); while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 1920da440..4ba9f8002 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -366,6 +366,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) { bpobj_t subbpo; uint64_t used, comp, uncomp, subsubobjs; + ASSERTV(dmu_object_info_t doi); ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havecomp); @@ -392,6 +393,9 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); } + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + mutex_enter(&bpo->bpo_lock); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index faa6cc345..d655d6621 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -64,7 +64,7 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); static void dbuf_destroy(dmu_buf_impl_t *db); -static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); /* @@ -546,7 +546,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); 
db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -875,10 +875,12 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) continue; /* found a level 0 buffer in the range */ - if (dbuf_undirty(db, tx)) + mutex_enter(&db->db_mtx); + if (dbuf_undirty(db, tx)) { + /* mutex has been dropped and dbuf destroyed */ continue; + } - mutex_enter(&db->db_mtx); if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { @@ -1005,7 +1007,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db) == 1); + VERIFY(arc_buf_remove_ref(obuf, db)); db->db.db_size = size; if (db->db_level == 0) { @@ -1306,7 +1308,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr); } -static int +/* + * Return TRUE if this evicted the dbuf. + */ +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; @@ -1315,18 +1320,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); - mutex_enter(&db->db_mtx); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; - if (dr == NULL || dr->dr_txg < txg) { - mutex_exit(&db->db_mtx); - return (0); - } + if (dr == NULL || dr->dr_txg < txg) + return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); @@ -1334,24 +1338,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dn = DB_DNODE(db); /* - * If this buffer is currently held, we cannot undirty - * it, since one of the current holders may be in the - * middle of an update. Note that users of dbuf_undirty() - * should not place a hold on the dbuf before the call. - * Also note: we can get here with a spill block, so - * test for that similar to how dbuf_dirty does. 
+ * Note: This code will probably work even if there are concurrent + * holders, but it is untested in that scenerio, as the ZPL and + * ztest have additional locking (the range locks) that prevents + * that type of concurrent access. */ - if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - /* Make sure we don't toss this buffer at sync phase */ - if (db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - } - DB_DNODE_EXIT(db); - return (0); - } + ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1380,21 +1372,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } DB_DNODE_EXIT(db); - if (db->db_level == 0) { - if (db->db_state != DB_NOFILL) { - dbuf_unoverride(dr); + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); - } - } else { ASSERT(db->db_buf != NULL); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1406,13 +1390,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); - return (1); + return (B_TRUE); } - mutex_exit(&db->db_mtx); - return (0); + return (B_FALSE); } #pragma weak dmu_buf_will_dirty = dbuf_will_dirty @@ -1511,7 +1494,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); 
bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; } @@ -1529,10 +1512,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + VERIFY(arc_buf_remove_ref(db->db_buf, db)); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + VERIFY(arc_buf_remove_ref(db->db_buf, db)); } db->db_buf = NULL; } @@ -2168,10 +2151,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * This dbuf has anonymous data associated with it. */ dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); + VERIFY(!arc_buf_remove_ref(db->db_buf, db)); /* * A dbuf will be eligible for eviction if either the @@ -2669,7 +2652,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); + db)); else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 0a9033356..cbf4790b1 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1382,7 +1382,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); + VERIFY(arc_buf_remove_ref(buf, FTAG)); } /* diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index dc237780c..2d1aaa4c4 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/dmu.h> @@ -155,51 +156,49 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } int -dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) +dmu_diff(const char *tosnap_name, const char *fromsnap_name, + struct vnode *vp, offset_t *offp) { struct diffarg da; - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; - dsl_dataset_t *findds; - dsl_dataset_t *relds; - int err = 0; - - /* make certain we are looking at snapshots */ - if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) + dsl_dataset_t *fromsnap; + dsl_dataset_t *tosnap; + dsl_pool_t *dp; + int error; + uint64_t fromtxg; + + if (strchr(tosnap_name, '@') == NULL || + strchr(fromsnap_name, '@') == NULL) return (EINVAL); - /* fromsnap must be earlier and from the same lineage as tosnap */ - if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) - return (EXDEV); - - relds = NULL; - findds = ds; - - while (fromds->ds_dir != findds->ds_dir) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (!dsl_dir_is_clone(findds->ds_dir)) { - if (relds) - dsl_dataset_rele(relds, FTAG); - return (EXDEV); - } - - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, - findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); - rw_exit(&dp->dp_config_rwlock); + error = dsl_pool_hold(tosnap_name, FTAG, &dp); + if (error != 0) + return (error); - if (relds) - dsl_dataset_rele(relds, FTAG); + error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } - if (err) - return (EXDEV); + error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } - relds = findds; + if (!dsl_dataset_is_before(tosnap, fromsnap)) { + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (EXDEV); 
} - if (relds) - dsl_dataset_rele(relds, FTAG); + fromtxg = fromsnap->ds_phys->ds_creation_txg; + dsl_dataset_rele(fromsnap, FTAG); + + dsl_dataset_long_hold(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); da.da_vp = vp; da.da_offp = offp; @@ -207,15 +206,18 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; da.da_err = 0; - err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, + error = traverse_dataset(tosnap, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); - if (err) { - da.da_err = err; + if (error != 0) { + da.da_err = error; } else { /* we set the da.da_err we return as side-effect */ (void) write_record(&da); } + dsl_dataset_long_rele(tosnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + return (da.da_err); } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 0f07a4cc9..97a224b91 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -45,6 +45,7 @@ #include <sys/zfs_ioctl.h> #include <sys/sa.h> #include <sys/zfs_onexit.h> +#include <sys/dsl_destroy.h> /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -283,7 +284,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); - if (err) { + if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) @@ -323,34 +324,49 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. 
*/ if (ds) { - err = dsl_prop_register(ds, "primarycache", + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "secondarycache", + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); + } if (!dsl_dataset_is_snapshot(ds)) { - if (err == 0) - err = dsl_prop_register(ds, "checksum", + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "compression", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "copies", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "dedup", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "logbias", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "sync", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); + } } - if (err) { + if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf) == 1); + &os->os_phys_buf)); kmem_free(os, sizeof (objset_t)); return (err); } @@ -428,44 +444,66 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) return (err); } -/* called from zpl */ +/* + * Holds the pool while the objset is held. Therefore only one objset + * can be held at a time. 
+ */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { + dsl_pool_t *dp; dsl_dataset_t *ds; int err; - err = dsl_dataset_hold(name, tag, &ds); - if (err) + err = dsl_pool_hold(name, tag, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, tag); return (err); + } err = dmu_objset_from_ds(ds, osp); - if (err) + if (err != 0) { dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + } return (err); } -/* called from zpl */ +/* + * dsl_pool must not be held when this is called. + * Upon successful return, there will be a longhold on the dataset, + * and the dsl_pool will not be held. + */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { + dsl_pool_t *dp; dsl_dataset_t *ds; int err; - err = dsl_dataset_own(name, B_FALSE, tag, &ds); - if (err) + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_own(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); return (err); + } err = dmu_objset_from_ds(ds, osp); - if (err) { + dsl_pool_rele(dp, FTAG); + if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dmu_objset_disown(*osp, tag); + dsl_dataset_disown(ds, tag); return (EINVAL); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { - dmu_objset_disown(*osp, tag); + dsl_dataset_disown(ds, tag); return (EROFS); } return (err); @@ -474,7 +512,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, void dmu_objset_rele(objset_t *os, void *tag) { + dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); + dsl_pool_rele(dp, tag); } void @@ -483,7 +523,7 @@ dmu_objset_disown(objset_t *os, void *tag) dsl_dataset_disown(os->os_dsl_dataset, tag); } -int +void dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn; @@ -518,9 +558,7 @@ dmu_objset_evict_dbufs(objset_t *os) mutex_enter(&os->os_lock); dn = 
next_dn; } - dn = list_head(&os->os_dnodes); mutex_exit(&os->os_lock); - return (dn != DMU_META_DNODE(os)); } void @@ -535,33 +573,37 @@ dmu_objset_evict(objset_t *os) if (ds) { if (!dsl_dataset_is_snapshot(ds)) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "dedup", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "logbias", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "sync", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os)); } - VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); - /* - * We should need only a single pass over the dnode list, since - * nothing can be added to the list at this point. 
- */ - (void) dmu_objset_evict_dbufs(os); + dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { @@ -572,7 +614,7 @@ dmu_objset_evict(objset_t *os) ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in @@ -604,10 +646,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); + if (ds != NULL) - VERIFY(0 == dmu_objset_from_ds(ds, &os)); + VERIFY0(dmu_objset_from_ds(ds, &os)); else - VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); + VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); @@ -655,361 +698,181 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, return (os); } -struct oscarg { - void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - void *userarg; - dsl_dataset_t *clone_origin; - const char *lastname; - dmu_objset_type_t type; - uint64_t flags; - cred_t *cr; -}; +typedef struct dmu_objset_create_arg { + const char *doca_name; + cred_t *doca_cred; + void (*doca_userfunc)(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + void *doca_userarg; + dmu_objset_type_t doca_type; + uint64_t doca_flags; +} dmu_objset_create_arg_t; /*ARGSUSED*/ static int -dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_check(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct oscarg *oa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - int err; - uint64_t ddobj; - - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - oa->lastname, sizeof (uint64_t), 1, &ddobj); - if (err != ENOENT) - return (err ? 
err : EEXIST); + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + int error; - if (oa->clone_origin != NULL) { - /* You can't clone across pools. */ - if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); + if (strchr(doca->doca_name, '@') != NULL) + return (EINVAL); - /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(oa->clone_origin)) - return (EINVAL); + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (EEXIST); } + dsl_dir_rele(pdd, FTAG); return (0); } static void -dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - spa_t *spa = dd->dd_pool->dp_spa; - struct oscarg *oa = arg2; - uint64_t obj; + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; dsl_dataset_t *ds; + uint64_t obj; blkptr_t *bp; + objset_t *os; - ASSERT(dmu_tx_is_syncing(tx)); + VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - obj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_origin, oa->flags, oa->cr, tx); + obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, + doca->doca_cred, tx); - VERIFY3U(0, ==, dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_t *os = - dmu_objset_create_impl(spa, ds, bp, oa->type, tx); + os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, + ds, bp, doca->doca_type, tx); - if (oa->userfunc) - oa->userfunc(os, oa->userarg, oa->cr, tx); + if (doca->doca_userfunc != NULL) { + doca->doca_userfunc(os, doca->doca_userarg, + doca->doca_cred, tx); } - if (oa->clone_origin == NULL) { - spa_history_log_internal_ds(ds, "create", tx, ""); - } else { - 
char namebuf[MAXNAMELEN]; - dsl_dataset_name(oa->clone_origin, namebuf); - spa_history_log_internal_ds(ds, "clone", tx, - "origin=%s (%llu)", namebuf, oa->clone_origin->ds_object); - } + spa_history_log_internal_ds(ds, "create", tx, ""); dsl_dataset_rele(ds, FTAG); + dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { - dsl_dir_t *pdd; - const char *tail; - int err = 0; - struct oscarg oa = { 0 }; - - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); - if (tail == NULL) { - dsl_dir_close(pdd, FTAG); - return (EEXIST); - } + dmu_objset_create_arg_t doca; - oa.userfunc = func; - oa.userarg = arg; - oa.lastname = tail; - oa.type = type; - oa.flags = flags; - oa.cr = CRED(); + doca.doca_name = name; + doca.doca_cred = CRED(); + doca.doca_flags = flags; + doca.doca_userfunc = func; + doca.doca_userarg = arg; + doca.doca_type = type; - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); + return (dsl_sync_task(name, + dmu_objset_create_check, dmu_objset_create_sync, &doca, 5)); } -int -dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) +typedef struct dmu_objset_clone_arg { + const char *doca_clone; + const char *doca_origin; + cred_t *doca_cred; +} dmu_objset_clone_arg_t; + +/*ARGSUSED*/ +static int +dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { + dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; - int err = 0; - struct oscarg oa = { 0 }; + int error; + dsl_dataset_t *origin; + dsl_pool_t *dp = dmu_tx_pool(tx); - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); + if (strchr(doca->doca_clone, '@') != NULL) + return (EINVAL); + + error = dsl_dir_hold(dp, doca->doca_clone, FTAG, 
&pdd, &tail); + if (error != 0) + return (error); if (tail == NULL) { - dsl_dir_close(pdd, FTAG); + dsl_dir_rele(pdd, FTAG); return (EEXIST); } - - oa.lastname = tail; - oa.clone_origin = clone_origin; - oa.flags = flags; - oa.cr = CRED(); - - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); -} - -int -dmu_objset_destroy(const char *name, boolean_t defer) -{ - dsl_dataset_t *ds; - int error; - - error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); - if (error == 0) { - error = dsl_dataset_destroy(ds, FTAG, defer); - /* dsl_dataset_destroy() closes the ds. */ + /* You can't clone across pools. */ + if (pdd->dd_pool != dp) { + dsl_dir_rele(pdd, FTAG); + return (EXDEV); } + dsl_dir_rele(pdd, FTAG); - return (error); -} - -typedef struct snapallarg { - dsl_sync_task_group_t *saa_dstg; - boolean_t saa_needsuspend; - nvlist_t *saa_props; - - /* the following are used only if 'temporary' is set: */ - boolean_t saa_temporary; - const char *saa_htag; - struct dsl_ds_holdarg *saa_ha; - dsl_dataset_t *saa_newds; -} snapallarg_t; - -typedef struct snaponearg { - const char *soa_longname; /* long snap name */ - const char *soa_snapname; /* short snap name */ - snapallarg_t *soa_saa; -} snaponearg_t; - -static int -snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - snaponearg_t *soa = arg2; - snapallarg_t *saa = soa->soa_saa; - int error; - - /* The props have already been checked by zfs_check_userprops(). */ - - error = dsl_dataset_snapshot_check(os->os_dsl_dataset, - soa->soa_snapname, tx); - if (error) + error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); + if (error != 0) return (error); - if (saa->saa_temporary) { - /* - * Ideally we would just call - * dsl_dataset_user_hold_check() and - * dsl_dataset_destroy_check() here. 
However the - * dataset we want to hold and destroy is the snapshot - * that we just confirmed we can create, but it won't - * exist until after these checks are run. Do any - * checks we can here and if more checks are added to - * those routines in the future, similar checks may be - * necessary here. - */ - if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); - /* - * Not checking number of tags because the tag will be - * unique, as it will be the only tag. - */ - if (strlen(saa->saa_htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - return (E2BIG); - - saa->saa_ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), - KM_PUSHPAGE); - saa->saa_ha->temphold = B_TRUE; - saa->saa_ha->htag = saa->saa_htag; + /* You can't clone across pools. */ + if (origin->ds_dir->dd_pool != dp) { + dsl_dataset_rele(origin, FTAG); + return (EXDEV); } - return (error); -} -static void -snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - dsl_dataset_t *ds = os->os_dsl_dataset; - snaponearg_t *soa = arg2; - snapallarg_t *saa = soa->soa_saa; - - dsl_dataset_snapshot_sync(ds, soa->soa_snapname, tx); - - if (saa->saa_props != NULL) { - dsl_props_arg_t pa; - pa.pa_props = saa->saa_props; - pa.pa_source = ZPROP_SRC_LOCAL; - dsl_props_set_sync(ds->ds_prev, &pa, tx); + /* You can only clone snapshots, not the head datasets. 
*/ + if (!dsl_dataset_is_snapshot(origin)) { + dsl_dataset_rele(origin, FTAG); + return (EINVAL); } + dsl_dataset_rele(origin, FTAG); - if (saa->saa_temporary) { - struct dsl_ds_destroyarg da; - - dsl_dataset_user_hold_sync(ds->ds_prev, saa->saa_ha, tx); - kmem_free(saa->saa_ha, sizeof (struct dsl_ds_holdarg)); - saa->saa_ha = NULL; - saa->saa_newds = ds->ds_prev; - - da.ds = ds->ds_prev; - da.defer = B_TRUE; - dsl_dataset_destroy_sync(&da, FTAG, tx); - } + return (0); } -static int -snapshot_one_impl(const char *snapname, void *arg) +static void +dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { - char *fsname; - snapallarg_t *saa = arg; - snaponearg_t *soa; - objset_t *os; - int err; - - fsname = kmem_zalloc(MAXPATHLEN, KM_PUSHPAGE); - (void) strlcpy(fsname, snapname, MAXPATHLEN); - strchr(fsname, '@')[0] = '\0'; - - err = dmu_objset_hold(fsname, saa, &os); - kmem_free(fsname, MAXPATHLEN); - if (err != 0) - return (err); - - /* - * If the objset is in an inconsistent state (eg, in the process - * of being destroyed), don't snapshot it. 
- */ - if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_rele(os, saa); - return (EBUSY); - } - - if (saa->saa_needsuspend) { - err = zil_suspend(dmu_objset_zil(os)); - if (err) { - dmu_objset_rele(os, saa); - return (err); - } - } + dmu_objset_clone_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *origin, *ds; + uint64_t obj; + char namebuf[MAXNAMELEN]; - soa = kmem_zalloc(sizeof (*soa), KM_PUSHPAGE); - soa->soa_saa = saa; - soa->soa_longname = snapname; - soa->soa_snapname = strchr(snapname, '@') + 1; + VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); + VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync, - os, soa, 3); + obj = dsl_dataset_create_sync(pdd, tail, origin, 0, + doca->doca_cred, tx); - return (0); + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + dsl_dataset_name(origin, namebuf); + spa_history_log_internal_ds(ds, "clone", tx, + "origin=%s (%llu)", namebuf, origin->ds_object); + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); } -/* - * The snapshots must all be in the same pool. 
- */ int -dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) +dmu_objset_clone(const char *clone, const char *origin) { - dsl_sync_task_t *dst; - snapallarg_t saa = { 0 }; - spa_t *spa; - int rv = 0; - int err; - nvpair_t *pair; - - pair = nvlist_next_nvpair(snaps, NULL); - if (pair == NULL) - return (0); - - err = spa_open(nvpair_name(pair), &spa, FTAG); - if (err) - return (err); - saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - saa.saa_props = props; - saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - err = snapshot_one_impl(nvpair_name(pair), &saa); - if (err != 0) { - if (errors != NULL) { - fnvlist_add_int32(errors, - nvpair_name(pair), err); - } - rv = err; - } - } + dmu_objset_clone_arg_t doca; - /* - * If any call to snapshot_one_impl() failed, don't execute the - * sync task. The error handling code below will clean up the - * snaponearg_t from any successful calls to - * snapshot_one_impl(). 
- */ - if (rv == 0) - err = dsl_sync_task_group_wait(saa.saa_dstg); - if (err != 0) - rv = err; - - for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst; - dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; - snaponearg_t *soa = dst->dst_arg2; - if (dst->dst_err != 0) { - if (errors != NULL) { - fnvlist_add_int32(errors, - soa->soa_longname, dst->dst_err); - } - rv = dst->dst_err; - } - - if (saa.saa_needsuspend) - zil_resume(dmu_objset_zil(os)); - dmu_objset_rele(os, &saa); - kmem_free(soa, sizeof (*soa)); - } + doca.doca_clone = clone; + doca.doca_origin = origin; + doca.doca_cred = CRED(); - dsl_sync_task_group_destroy(saa.saa_dstg); - spa_close(spa, FTAG); - return (rv); + return (dsl_sync_task(clone, + dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5)); } int @@ -1020,59 +883,12 @@ dmu_objset_snapshot_one(const char *fsname, const char *snapname) nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); - err = dmu_objset_snapshot(snaps, NULL, NULL); - fnvlist_free(snaps); strfree(longsnap); + err = dsl_dataset_snapshot(snaps, NULL, NULL); + fnvlist_free(snaps); return (err); } -int -dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd) -{ - dsl_sync_task_t *dst; - snapallarg_t saa = { 0 }; - spa_t *spa; - minor_t minor; - int err; - - err = spa_open(snapname, &spa, FTAG); - if (err) - return (err); - saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - saa.saa_htag = tag; - saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - saa.saa_temporary = B_TRUE; - - if (cleanup_fd < 0) { - spa_close(spa, FTAG); - return (EINVAL); - } - if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { - spa_close(spa, FTAG); - return (err); - } - - err = snapshot_one_impl(snapname, &saa); - - if (err == 0) - err = dsl_sync_task_group_wait(saa.saa_dstg); - - for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst; - dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) { - objset_t 
*os = dst->dst_arg1; - dsl_register_onexit_hold_cleanup(saa.saa_newds, tag, minor); - if (saa.saa_needsuspend) - zil_resume(dmu_objset_zil(os)); - dmu_objset_rele(os, &saa); - } - - zfs_onexit_fd_rele(cleanup_fd); - dsl_sync_task_group_destroy(saa.saa_dstg); - spa_close(spa, FTAG); - return (err); -} - - static void dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { @@ -1110,9 +926,9 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - ASSERT(bp == os->os_rootbp); - ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); - ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT3P(bp, ==, os->os_rootbp); + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); + ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects @@ -1220,7 +1036,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while ((dr = list_head(list))) { - ASSERT(dr->dr_dbuf->db_level == 0); + ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); @@ -1514,12 +1330,12 @@ dmu_objset_userspace_upgrade(objset_t *os) return (EINTR); objerr = dmu_bonus_hold(os, obj, FTAG, &db); - if (objerr) + if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); - if (objerr) { + if (objerr != 0) { dmu_tx_abort(tx); continue; } @@ -1602,6 +1418,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, zap_cursor_t cursor; zap_attribute_t attr; + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + if (ds->ds_phys->ds_snapnames_zapobj == 0) return (ENOENT); @@ -1674,42 +1492,122 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (0); } -struct findarg { - int (*func)(const char *, void *); - void *arg; -}; - -/* ARGSUSED */ -static int -findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - struct findarg *fa = arg; - 
return (fa->func(dsname, fa->arg)); -} - /* - * Find all objsets under name, and for each, call 'func(child_name, arg)'. - * Perhaps change all callers to use dmu_objset_find_spa()? + * Find objsets under and including ddobj, call func(ds) on each. */ int -dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags) +dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, + int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { - struct findarg fa; - fa.func = func; - fa.arg = arg; - return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); + dsl_dir_t *dd; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + uint64_t thisobj; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); + if (err != 0) + return (err); + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_rele(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* + * Iterate over all children. + */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + err = dmu_objset_find_dp(dp, attr->za_first_integer, + func, arg, flags); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + + if (err != 0) { + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + return (err); + } + } + + /* + * Iterate over all snapshots. 
+ */ + if (flags & DS_FIND_SNAPSHOTS) { + dsl_dataset_t *ds; + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + err = dsl_dataset_hold_obj(dp, + attr->za_first_integer, FTAG, &ds); + if (err != 0) + break; + err = func(dp, ds, arg); + dsl_dataset_rele(ds, FTAG); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + + if (err != 0) + return (err); + + /* + * Apply to self. + */ + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (err != 0) + return (err); + err = func(dp, ds, arg); + dsl_dataset_rele(ds, FTAG); + return (err); } /* - * Find all objsets under name, call func on each + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * The dp_config_rwlock must not be held when this is called, and it + * will not be held when the callback is called. + * Therefore this function should only be used when the pool is not changing + * (e.g. in syncing context), or the callback can deal with the possible races. 
*/ -int -dmu_objset_find_spa(spa_t *spa, const char *name, - int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +static int +dmu_objset_find_impl(spa_t *spa, const char *name, + int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; - dsl_pool_t *dp; + dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; @@ -1717,21 +1615,23 @@ dmu_objset_find_spa(spa_t *spa, const char *name, uint64_t thisobj; int err; - if (name == NULL) - name = spa_name(spa); - err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); - if (err) + dsl_pool_config_enter(dp, FTAG); + + err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); return (err); + } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dd->dd_phys->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); - dp = dd->dd_pool; /* * Iterate over all children. 
@@ -1741,19 +1641,24 @@ dmu_objset_find_spa(spa_t *spa, const char *name, dd->dd_phys->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); - err = dmu_objset_find_spa(spa, child, func, arg, flags); + dsl_pool_config_exit(dp, FTAG); + err = dmu_objset_find_impl(spa, child, + func, arg, flags); + dsl_pool_config_enter(dp, FTAG); strfree(child); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); - if (err) { - dsl_dir_close(dd, FTAG); + if (err != 0) { + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } @@ -1763,11 +1668,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name, * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { - if (!dsl_pool_sync_context(dp)) - rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - if (!dsl_pool_sync_context(dp)) - rw_exit(&dp->dp_config_rwlock); if (err == 0) { uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; @@ -1776,64 +1677,50 @@ dmu_objset_find_spa(spa_t *spa, const char *name, for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == + ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); - err = func(spa, attr->za_first_integer, - child, arg); + dsl_pool_config_exit(dp, FTAG); + err = func(child, arg); + dsl_pool_config_enter(dp, FTAG); strfree(child); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); } } - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); 
kmem_free(attr, sizeof (zap_attribute_t)); + dsl_pool_config_exit(dp, FTAG); - if (err) + if (err != 0) return (err); - /* - * Apply to self if appropriate. - */ - err = func(spa, thisobj, name, arg); - return (err); + /* Apply to self. */ + return (func(name, arg)); } -/* ARGSUSED */ +/* + * See comment above dmu_objset_find_impl(). + */ int -dmu_objset_prefetch(const char *name, void *arg) +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) { - dsl_dataset_t *ds; - - if (dsl_dataset_hold(name, FTAG, &ds)) - return (0); - - if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { - mutex_enter(&ds->ds_opening_lock); - if (ds->ds_objset == NULL) { - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, - ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - (void) arc_read(NULL, dsl_dataset_get_spa(ds), - &ds->ds_phys->ds_bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - mutex_exit(&ds->ds_opening_lock); - } + spa_t *spa; + int error; - dsl_dataset_rele(ds, FTAG); - return (0); + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + error = dmu_objset_find_impl(spa, name, func, arg, flags); + spa_close(spa, FTAG); + return (error); } void @@ -1850,6 +1737,22 @@ dmu_objset_get_user(objset_t *os) return (os->os_user_ptr); } +/* + * Determine name of filesystem, given name of snapshot. 
+ * buf must be at least MAXNAMELEN bytes + */ +int +dmu_fsname(const char *snapname, char *buf) +{ + char *atp = strchr(snapname, '@'); + if (atp == NULL) + return (EINVAL); + if (atp - snapname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strlcpy(buf, snapname, atp - snapname + 1); + return (0); +} + #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_pool); @@ -1863,16 +1766,12 @@ EXPORT_SYMBOL(dmu_objset_disown); EXPORT_SYMBOL(dmu_objset_from_ds); EXPORT_SYMBOL(dmu_objset_create); EXPORT_SYMBOL(dmu_objset_clone); -EXPORT_SYMBOL(dmu_objset_destroy); -EXPORT_SYMBOL(dmu_objset_snapshot); EXPORT_SYMBOL(dmu_objset_stats); EXPORT_SYMBOL(dmu_objset_fast_stat); EXPORT_SYMBOL(dmu_objset_spa); EXPORT_SYMBOL(dmu_objset_space); EXPORT_SYMBOL(dmu_objset_fsid_guid); EXPORT_SYMBOL(dmu_objset_find); -EXPORT_SYMBOL(dmu_objset_find_spa); -EXPORT_SYMBOL(dmu_objset_prefetch); EXPORT_SYMBOL(dmu_objset_byteswap); EXPORT_SYMBOL(dmu_objset_evict_dbufs); EXPORT_SYMBOL(dmu_objset_snap_cmtime); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 6552e1d9d..2945be89b 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -48,11 +48,14 @@ #include <sys/avl.h> #include <sys/ddt.h> #include <sys/zfs_onexit.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; static char *dmu_recv_tag = "dmu_recv_tag"; +static const char *recv_clone_name = "%recv"; typedef struct dump_bytes_io { dmu_sendarg_t *dbi_dsp; @@ -319,7 +322,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) return (EINTR); - if (dsp->dsa_err) + if (dsp->dsa_err != 0) return (EINTR); return (0); } @@ -369,7 +372,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint64_t dnobj = (zb->zb_blkid << 
(DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(dsp, dnobj, blk+i); - if (err) + if (err != 0) break; } (void) arc_buf_remove_ref(abuf, &abuf); @@ -417,65 +420,33 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } /* - * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. - * For example, they could both be snapshots of the same filesystem, and - * 'earlier' is before 'later'. Or 'earlier' could be the origin of - * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's - * filesystem. Or 'earlier' could be the origin's origin. + * Releases dp, ds, and fromds, using the specified tag. */ -static boolean_t -is_before(dsl_dataset_t *later, dsl_dataset_t *earlier) -{ - dsl_pool_t *dp = later->ds_dir->dd_pool; - int error; - boolean_t ret; - dsl_dataset_t *origin; - - if (earlier->ds_phys->ds_creation_txg >= - later->ds_phys->ds_creation_txg) - return (B_FALSE); - - if (later->ds_dir == earlier->ds_dir) - return (B_TRUE); - if (!dsl_dir_is_clone(later->ds_dir)) - return (B_FALSE); - - rw_enter(&dp->dp_config_rwlock, RW_READER); - if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) { - rw_exit(&dp->dp_config_rwlock); - return (B_TRUE); - } - error = dsl_dataset_hold_obj(dp, - later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); - rw_exit(&dp->dp_config_rwlock); - if (error != 0) - return (B_FALSE); - ret = is_before(origin, earlier); - dsl_dataset_rele(origin, FTAG); - return (ret); -} - -int -dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, - offset_t *off) +static int +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, + dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off) { - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; + objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; - /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) - return (EINVAL); - - /* - * fromsnap must be an earlier snapshot from the same fs as tosnap, - * or the origin's fs. - */ - if (fromds != NULL && !is_before(ds, fromds)) + if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) { + dsl_dataset_rele(fromds, tag); + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); return (EXDEV); + } + + err = dmu_objset_from_ds(ds, &os); + if (err != 0) { + if (fromds != NULL) + dsl_dataset_rele(fromds, tag); + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + return (err); + } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; @@ -484,13 +455,17 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, DMU_SUBSTREAM); #ifdef _KERNEL - if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; - if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); + if (fromds != NULL) + dsl_dataset_rele(fromds, tag); + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); return (EINVAL); } - if (version == ZPL_VERSION_SA) { + if (version >= ZPL_VERSION_SA) { DMU_SET_FEATUREFLAGS( drr->drr_u.drr_begin.drr_versioninfo, DMU_BACKUP_FEATURE_SA_SPILL); @@ -500,19 +475,22 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; + drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (fromds != NULL && ds->ds_dir != fromds->ds_dir) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (fromds) + if (fromds != NULL) drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (fromds) + if (fromds != NULL) { fromtxg = fromds->ds_phys->ds_creation_txg; + dsl_dataset_rele(fromds, tag); + fromds = NULL; + } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); @@ -520,7 +498,7 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, dsp->dsa_vp = vp; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; - dsp->dsa_os = tosnap; + dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); @@ -535,6 +513,9 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, goto out; } + dsl_dataset_long_hold(ds, FTAG); + dsl_pool_rele(dp, tag); + err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, dsp); @@ -542,8 +523,8 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) err = EINTR; - if (err) { - if (err == EINTR && dsp->dsa_err) + if (err != 0) { + if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } @@ -566,27 +547,96 @@ out: kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); + dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_rele(ds, tag); + return (err); } int -dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) +dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + int outfd, vnode_t *vp, offset_t *off) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dsl_dataset_t *fromds = NULL; + int err; + + err = dsl_pool_hold(pool, FTAG, &dp); + if (err != 0) + return (err); + + err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (fromsnap != 0) { + err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, 
&fromds); + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + } + + return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); +} + +int +dmu_send(const char *tosnap, const char *fromsnap, + int outfd, vnode_t *vp, offset_t *off) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dsl_dataset_t *fromds = NULL; + int err; + + if (strchr(tosnap, '@') == NULL) + return (EINVAL); + if (fromsnap != NULL && strchr(fromsnap, '@') == NULL) + return (EINVAL); + + err = dsl_pool_hold(tosnap, FTAG, &dp); + if (err != 0) + return (err); + + err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (fromsnap != NULL) { + err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + } + return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); +} + +int +dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) { - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; - dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; uint64_t size, recordsize; + ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); + + ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) + if (!dsl_dataset_is_snapshot(ds)) return (EINVAL); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. */ - if (fromds != NULL && !is_before(ds, fromds)) + if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) return (EXDEV); /* Get uncompressed size estimate of changed data. 
*/ @@ -596,7 +646,7 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) uint64_t used, comp; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &size); - if (err) + if (err != 0) return (err); } @@ -615,11 +665,8 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block, which we observe in practice. */ - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_prop_get_ds(ds, "recordsize", - sizeof (recordsize), 1, &recordsize, NULL); - rw_exit(&dp->dp_config_rwlock); - if (err) + err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); + if (err != 0) return (err); size -= size / recordsize * sizeof (blkptr_t); @@ -631,93 +678,40 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) return (0); } -struct recvbeginsyncarg { - const char *tofs; - const char *tosnap; - dsl_dataset_t *origin; - uint64_t fromguid; - dmu_objset_type_t type; - void *tag; - boolean_t force; - uint64_t dsflags; - char clonelastname[MAXNAMELEN]; - dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ - cred_t *cr; -}; - -/* ARGSUSED */ -static int -recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val; - int err; - - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - - if (err != ENOENT) - return (err ? 
err : EEXIST); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); - if (!dsl_dataset_is_snapshot(rbsa->origin)) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); - } - - return (0); -} - -static void -recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; - - /* Create and open new dataset. */ - dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, rbsa->cr, tx); - VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, - B_TRUE, dmu_recv_tag, &rbsa->ds)); - - if (rbsa->origin == NULL) { - (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, - rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); - } - - spa_history_log_internal_ds(rbsa->ds, "receive new", tx, ""); -} +typedef struct dmu_recv_begin_arg { + const char *drba_origin; + dmu_recv_cookie_t *drba_cookie; + cred_t *drba_cred; +} dmu_recv_begin_arg_t; -/* ARGSUSED */ static int -recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, + uint64_t fromguid) { - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; uint64_t val; + int error; + dsl_pool_t *dp = ds->ds_dir->dd_pool; /* must not have any changes since most recent snapshot */ - if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + if (!drba->drba_cookie->drc_force && + dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); + /* temporary clone name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? 
EBUSY : error); + /* new snapshot name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); + error = zap_lookup(dp->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? EEXIST : error); - if (rbsa->fromguid) { + if (fromguid != 0) { /* if incremental, most recent snapshot must match fromguid */ if (ds->ds_prev == NULL) return (ENODEV); @@ -726,20 +720,20 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) * most recent snapshot must match fromguid, or there are no * changes since the fromguid one */ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { + if (ds->ds_prev->ds_phys->ds_guid != fromguid) { uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; while (obj != 0) { dsl_dataset_t *snap; - err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - obj, FTAG, &snap); - if (err) + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) return (ENODEV); if (snap->ds_phys->ds_creation_txg < birth) { dsl_dataset_rele(snap, FTAG); return (ENODEV); } - if (snap->ds_phys->ds_guid == rbsa->fromguid) { + if (snap->ds_phys->ds_guid == fromguid) { dsl_dataset_rele(snap, FTAG); break; /* it's ok */ } @@ -755,58 +749,153 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) return (ENODEV); } - /* temporary clone name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, - rbsa->clonelastname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - return (0); + +} + +static int +dmu_recv_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 
+ uint64_t fromguid = drrb->drr_fromguid; + int flags = drrb->drr_flags; + int error; + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) + return (EINVAL); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) { + return (ENOTSUP); + } + + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + error = recv_begin_check_existing_impl(drba, ds, fromguid); + dsl_dataset_rele(ds, FTAG); + } else if (error == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char buf[MAXNAMELEN]; + + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. 
+ */ + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) + return (ENOENT); + + /* Open the parent of tofs */ + ASSERT3U(strlen(tofs), <, MAXNAMELEN); + (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); + if (error != 0) + return (error); + + if (drba->drba_origin != NULL) { + dsl_dataset_t *origin; + error = dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (!dsl_dataset_is_snapshot(origin)) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + if (origin->ds_phys->ds_guid != fromguid) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (ENODEV); + } + dsl_dataset_rele(origin, FTAG); + } + dsl_dataset_rele(ds, FTAG); + error = 0; + } + return (error); } -/* ARGSUSED */ static void -recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ohds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *cds; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds, *newds; uint64_t dsobj; + int error; + uint64_t crflags; + + crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
+ DS_FLAG_CI_DATASET : 0; - /* create and open the temporary clone */ - dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, - ohds->ds_prev, flags, rbsa->cr, tx); - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* create temporary clone */ + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, + ds->ds_prev, crflags, drba->drba_cred, tx); + dsl_dataset_rele(ds, FTAG); + } else { + dsl_dir_t *dd; + const char *tail; + dsl_dataset_t *origin = NULL; + + VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); + + if (drba->drba_origin != NULL) { + VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin)); + } + + /* Create new dataset. */ + dsobj = dsl_dataset_create_sync(dd, + strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, tx); + if (origin != NULL) + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(dd, FTAG); + drba->drba_cookie->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + + dmu_buf_will_dirty(newds->ds_dbuf, tx); + newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the * objset in our new dataset. 
*/ - if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { + if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { (void) dmu_objset_create_impl(dp->dp_spa, - cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); + newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } - rbsa->ds = cds; - - spa_history_log_internal_ds(cds, "receive over existing", tx, ""); -} - -static boolean_t -dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) -{ - int featureflags; - - featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + drba->drba_cookie->drc_ds = newds; - /* Verify pool version supports SA if SA_SPILL feature set */ - return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); + spa_history_log_internal_ds(newds, "receive", tx, ""); } /* @@ -814,132 +903,55 @@ dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, - boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, char *origin, dmu_recv_cookie_t *drc) { - int err = 0; - boolean_t byteswap; - struct recvbeginsyncarg rbsa = { 0 }; - uint64_t versioninfo; - int flags; - dsl_dataset_t *ds; - - if (drrb->drr_magic == DMU_BACKUP_MAGIC) - byteswap = FALSE; - else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - byteswap = TRUE; - else - return (EINVAL); - - rbsa.tofs = tofs; - rbsa.tosnap = tosnap; - rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; - rbsa.fromguid = drrb->drr_fromguid; - rbsa.type = drrb->drr_type; - rbsa.tag = FTAG; - rbsa.dsflags = 0; - rbsa.cr = CRED(); - versioninfo = drrb->drr_versioninfo; - flags = drrb->drr_flags; - - if (byteswap) { - rbsa.type = BSWAP_32(rbsa.type); - rbsa.fromguid = BSWAP_64(rbsa.fromguid); - versioninfo = BSWAP_64(versioninfo); - flags = BSWAP_32(flags); - } - - if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || - rbsa.type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && origin == NULL)) - return (EINVAL); - - if (flags & DRR_FLAG_CI_DATA) - rbsa.dsflags = DS_FLAG_CI_DATASET; + dmu_recv_begin_arg_t drba = { 0 }; + dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; - drc->drc_top_ds = top_ds; + drc->drc_tofs = tofs; drc->drc_force = force; - /* - * Process the begin in syncing context. - */ - - /* open the dataset we are logically receiving into */ - err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); - if (err == 0) { - if (dmu_recv_verify_features(ds, drrb)) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (ENOTSUP); - } - /* target fs already exists; recv into temp clone */ - - /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (EINVAL); - } - - /* must not have an incremental recv already in progress */ - if (!mutex_tryenter(&ds->ds_recvlock)) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (EBUSY); - } - - /* tmp clone name is: tofs/%tosnap" */ - (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), - "%%%s", tosnap); - rbsa.force = force; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_existing_check, recv_existing_sync, ds, &rbsa, 5); - if (err) { - mutex_exit(&ds->ds_recvlock); - dsl_dataset_rele(ds, dmu_recv_tag); - return (err); - } - drc->drc_logical_ds = ds; - drc->drc_real_ds = rbsa.ds; - } else if (err == ENOENT) { - /* target fs does not exist; must be a 
full backup or clone */ - char *cp; - - /* - * If it's a non-clone incremental, we are missing the - * target fs, so fail the recv. - */ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) - return (ENOENT); - - /* Open the parent of tofs */ - cp = strrchr(tofs, '/'); - *cp = '\0'; - err = dsl_dataset_hold(tofs, FTAG, &ds); - *cp = '/'; - if (err) - return (err); + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + drc->drc_byteswap = B_TRUE; + else if (drrb->drr_magic != DMU_BACKUP_MAGIC) + return (EINVAL); - if (dmu_recv_verify_features(ds, drrb)) { - dsl_dataset_rele(ds, FTAG); - return (ENOTSUP); - } + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (drc->drc_byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; - drc->drc_newfs = B_TRUE; + if (drc->drc_byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } - return (err); + drba.drba_origin = origin; + drba.drba_cookie = drc; + drba.drba_cred = CRED(); + + return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5)); } struct restorearg { int err; - int byteswap; + boolean_t byteswap; vnode_t *vp; char *buf; uint64_t voff; @@ -975,7 +987,7 @@ free_guid_map_onexit(void *arg) guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, 
&cookie)) != NULL) { - dsl_dataset_rele(gmep->gme_ds, ca); + dsl_dataset_long_rele(gmep->gme_ds, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); @@ -1003,7 +1015,7 @@ restore_read(struct restorearg *ra, int len) ra->err = EINVAL; ra->voff += len - done - resid; done = len - resid; - if (ra->err) + if (ra->err != 0) return (NULL); } @@ -1124,7 +1136,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); - if (ra->err) + if (ra->err != 0) return (ra->err); } @@ -1133,7 +1145,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1147,14 +1159,14 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen); } - if (err) { + if (err != 0) { return (EINVAL); } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, drro->drr_object); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1202,7 +1214,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, continue; err = dmu_free_object(os, obj); - if (err) + if (err != 0) return (err); } return (0); @@ -1232,7 +1244,7 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1295,7 +1307,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1336,7 +1348,7 @@ restore_spill(struct restorearg *ra, 
objset_t *os, struct drr_spill *drrs) dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); @@ -1375,6 +1387,16 @@ restore_free(struct restorearg *ra, objset_t *os, return (err); } +/* used to destroy the drc_ds on error */ +static void +dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) +{ + char name[MAXNAMELEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); +} + /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -1388,52 +1410,24 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, zio_cksum_t pcksum; int featureflags; - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - ra.byteswap = TRUE; - - { - /* compute checksum of drr_begin record */ - dmu_replay_record_t *drr; - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } else { - fletcher_4_incremental_native(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - } - - if (ra.byteswap) { - struct drr_begin *drrb = drc->drc_drrb; - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - + ra.byteswap = drc->drc_byteswap; + ra.cksum = drc->drc_cksum; ra.vp = vp; ra.voff = *voffp; ra.bufsize = 1<<20; ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ - ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + 
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); - ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); + ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ - VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); + VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); - ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); @@ -1446,7 +1440,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, goto out; } ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (ra.err) { + if (ra.err != 0) { cleanup_fd = -1; goto out; } @@ -1460,12 +1454,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ra.err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); - if (ra.err) + if (ra.err != 0) goto out; } else { ra.err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&ra.guid_to_ds_map); - if (ra.err) + if (ra.err != 0) goto out; } @@ -1559,14 +1553,7 @@ out: * destroy what we created, so we don't leave it in the * inconsistent restoring state. 
*/ - txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, - B_FALSE); - if (drc->drc_real_ds != drc->drc_logical_ds) { - mutex_exit(&drc->drc_logical_ds->ds_recvlock); - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } + dmu_recv_cleanup_ds(drc); } vmem_free(ra.buf, ra.bufsize); @@ -1574,142 +1561,179 @@ out: return (ra.err); } -struct recvendsyncarg { - char *tosnap; - uint64_t creation_time; - uint64_t toguid; -}; - static int -recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_end_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int error; + + ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); + if (error != 0) + return (error); + error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, + origin_head, drc->drc_force); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + error = dsl_dataset_snapshot_check_impl(origin_head, + drc->drc_tosnap, tx); + dsl_dataset_rele(origin_head, FTAG); + if (error != 0) + return (error); - return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); + error = dsl_destroy_head_check_impl(drc->drc_ds, 1); + } else { + error = dsl_dataset_snapshot_check_impl(drc->drc_ds, + drc->drc_tosnap, tx); + } + return (error); } static void -recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + spa_history_log_internal_ds(drc->drc_ds, "finish receiving", + tx, "snap=%s", drc->drc_tosnap); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, + &origin_head)); 
+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds, + origin_head, tx); + dsl_dataset_snapshot_sync_impl(origin_head, + drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); + origin_head->ds_prev->ds_phys->ds_creation_time = + drc->drc_drrb->drr_creation_time; + origin_head->ds_prev->ds_phys->ds_guid = + drc->drc_drrb->drr_toguid; + origin_head->ds_prev->ds_phys->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dsl_dataset_rele(origin_head, FTAG); + dsl_destroy_head_sync_impl(drc->drc_ds, tx); + } else { + dsl_dataset_t *ds = drc->drc_ds; - dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); + dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; - ds->ds_prev->ds_phys->ds_guid = resa->toguid; - ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = + drc->drc_drrb->drr_creation_time; + ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - spa_history_log_internal_ds(ds, "finished receiving", tx, ""); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + } + drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; + /* + * Release the hold from dmu_recv_begin. This must be done before + * we return to open context, so that when we free the dataset's dnode, + * we can evict its bonus buffer. 
+ */ + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + drc->drc_ds = NULL; } static int -add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) +add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; + dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; int err; ASSERT(guid_map != NULL); - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snapds); if (err == 0) { gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); gmep->guid = snapds->ds_phys->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); + dsl_dataset_long_hold(snapds, gmep); + dsl_dataset_rele(snapds, FTAG); } - rw_exit(&dp->dp_config_rwlock); + dsl_pool_rele(dp, FTAG); return (err); } +static int dmu_recv_end_modified_blocks = 3; + static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { - struct recvendsyncarg resa; - dsl_dataset_t *ds = drc->drc_logical_ds; - int err, myerr; - - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { - err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, - drc->drc_force); - if (err) - goto out; - } else { - mutex_exit(&ds->ds_recvlock); - dsl_dataset_rele(ds, dmu_recv_tag); - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, - B_FALSE); - return (EBUSY); - } + int error; - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; +#ifdef _KERNEL + char *name; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, ds, &resa, 3); - if (err) { - /* swap back */ - (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); - } + /* + * We will be destroying the ds; make sure its origin is unmounted if + * necessary. 
+ */ + name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + dsl_dataset_name(drc->drc_ds, name); + zfs_destroy_unmount_origin(name); + kmem_free(name, MAXNAMELEN); +#endif -out: - mutex_exit(&ds->ds_recvlock); - if (err == 0 && drc->drc_guid_to_ds_map != NULL) - (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); - dsl_dataset_disown(ds, dmu_recv_tag); - myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); - ASSERT0(myerr); - return (err); + error = dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks); + + if (error != 0) + dmu_recv_cleanup_ds(drc); + return (error); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { - struct recvendsyncarg resa; - dsl_dataset_t *ds = drc->drc_logical_ds; - int err; - - /* - * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() - * expects it to have a ds_user_ptr (and zil), but clone_swap() - * can close it. - */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); + int error; - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; + error = dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, ds, &resa, 3); - if (err) { - /* clean up the fs we just recv'd into */ - (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); - } else { - if (drc->drc_guid_to_ds_map != NULL) - (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); - /* release the hold from dmu_recv_begin */ - dsl_dataset_disown(ds, dmu_recv_tag); + if (error != 0) { + dmu_recv_cleanup_ds(drc); + } else if (drc->drc_guid_to_ds_map != NULL) { + (void) add_ds_to_guidmap(drc->drc_tofs, + drc->drc_guid_to_ds_map, + drc->drc_newsnapobj); } - return (err); + return (error); } int dmu_recv_end(dmu_recv_cookie_t *drc) { - if (drc->drc_logical_ds != drc->drc_real_ds) - return 
(dmu_recv_existing_end(drc)); - else + if (drc->drc_newfs) return (dmu_recv_new_end(drc)); + else + return (dmu_recv_existing_end(drc)); } diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 1c3972371..32b3e50fc 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -265,7 +265,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) + if (err != 0) return (err); cbp = buf->b_data; @@ -282,7 +282,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &cbp[i], &czb); - if (err) { + if (err != 0) { if (!hard) break; lasterr = err; @@ -295,7 +295,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) + if (err != 0) return (err); dnp = buf->b_data; @@ -308,7 +308,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, for (i = 0; i < epb; i++) { err = traverse_dnode(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); - if (err) { + if (err != 0) { if (!hard) break; lasterr = err; @@ -321,7 +321,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) + if (err != 0) return (err); osp = buf->b_data; @@ -405,7 +405,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); - if (err) { + if (err != 0) { if (!hard) break; lasterr = err; @@ -415,7 +415,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); - if (err) { + if (err != 0) { if (!hard) return (err); lasterr = err; @@ -518,14 +518,20 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); /* See comment on ZIL traversal in dsl_scan_visitds. */ - if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { - objset_t *os; + if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; - err = dmu_objset_from_ds(ds, &os); - if (err) + err = arc_read(NULL, td->td_spa, rootbp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + if (err != 0) return (err); - traverse_zil(td, &os->os_zil_header); + osp = buf->b_data; + traverse_zil(td, &osp->os_zil_header); + (void) arc_buf_remove_ref(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || @@ -591,7 +597,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, /* visit the MOS */ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), txg_start, NULL, flags, func, arg); - if (err) + if (err != 0) return (err); /* visit each dataset */ @@ -600,7 +606,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); - if (err) { + if (err != 0) { if (!hard) return (err); lasterr = err; @@ -611,10 +617,10 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, dsl_dataset_t *ds; uint64_t txg = txg_start; - rw_enter(&dp->dp_config_rwlock, RW_READER); + dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (err) { + dsl_pool_config_exit(dp, FTAG); + if (err != 0) { if (!hard) return (err); lasterr = err; @@ -624,7 +630,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, txg = ds->ds_phys->ds_prev_snap_txg; err = traverse_dataset(ds, txg, 
flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err) { + if (err != 0) { if (!hard) return (err); lasterr = err; diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 30867f9d7..3e46a02f8 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -917,7 +917,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) #endif static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) { dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; @@ -986,15 +986,6 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } /* - * NB: This check must be after we've held the dnodes, so that - * the dmu_tx_unassign() logic will work properly - */ - if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) { - DMU_TX_STAT_BUMP(dmu_tx_how); - return (ERESTART); - } - - /* * If a snapshot has been taken since we made our estimates, * assume that we won't be able to free or overwrite anything. */ @@ -1076,29 +1067,28 @@ dmu_tx_unassign(dmu_tx_t *tx) * * (1) TXG_WAIT. If the current open txg is full, waits until there's * a new one. This should be used when you're not holding locks. - * If will only fail if we're truly out of space (or over quota). + * It will only fail if we're truly out of space (or over quota). * * (2) TXG_NOWAIT. If we can't assign into the current open txg without * blocking, returns immediately with ERESTART. This should be used * whenever you're holding locks. On an ERESTART error, the caller * should drop locks, do a dmu_tx_wait(tx), and try again. - * - * (3) A specific txg. Use this if you need to ensure that multiple - * transactions all sync in the same txg. Like TXG_NOWAIT, it - * returns ERESTART if it can't assign you into the requested txg. 
*/ int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { hrtime_t before, after; int err; ASSERT(tx->tx_txg == 0); - ASSERT(txg_how != 0); + ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); before = gethrtime(); + /* If we might wait, we must not hold the config lock. */ + ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); @@ -1124,6 +1114,7 @@ dmu_tx_wait(dmu_tx_t *tx) spa_t *spa = tx->tx_pool->dp_spa; ASSERT(tx->tx_txg == 0); + ASSERT(!dsl_pool_config_held(tx->tx_pool)); /* * It's possible that the pool has become active after this thread @@ -1250,6 +1241,14 @@ dmu_tx_get_txg(dmu_tx_t *tx) return (tx->tx_txg); } +dsl_pool_t * +dmu_tx_pool(dmu_tx_t *tx) +{ + ASSERT(tx->tx_pool != NULL); + return (tx->tx_pool); +} + + void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index d8d66513d..d88134d72 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -74,7 +74,11 @@ dnode_cons(void *arg, void *unused, int kmflag) mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); - refcount_create(&dn->dn_holds); + /* + * Every dbuf has a reference, and dropping a tracked reference is + * O(number of references), so don't track dn_holds. 
+ */ + refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 76e603753..a1c71d487 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -481,6 +481,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); /* * XXX - It would be nice to assert this, but we may still diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2eca2b204..5c0ca4d96 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -45,12 +45,8 @@ #include <sys/zvol.h> #include <sys/dsl_scan.h> #include <sys/dsl_deadlist.h> - -static char *dsl_reaper = "the grim reaper"; - -static dsl_checkfunc_t dsl_dataset_destroy_begin_check; -static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_syncfunc_t dsl_dataset_set_reservation_sync; +#include <sys/dsl_destroy.h> +#include <sys/dsl_userhold.h> #define SWITCH64(x, y) \ { \ @@ -63,9 +59,6 @@ static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE -#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) - - /* * Figure out how much of this delta should be propogated to the dsl_dir * layer. 
If there's a refreservation, that space has already been @@ -256,7 +249,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; - ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); + ASSERT(ds->ds_owner == NULL); unique_remove(ds->ds_fsid_guid); @@ -264,32 +257,26 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { - dsl_dataset_drop_ref(ds->ds_prev, ds); + dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_destroy(&ds->ds_pending_deadlist); - if (db != NULL) { + if (ds->ds_phys->ds_deadlist_obj != 0) dsl_deadlist_close(&ds->ds_deadlist); - } else { - ASSERT(ds->ds_deadlist.dl_dbuf == NULL); - ASSERT(!ds->ds_deadlist.dl_oldfmt); - } if (ds->ds_dir) - dsl_dir_close(ds->ds_dir, ds); + dsl_dir_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); + refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); } -static int +int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; @@ -305,7 +292,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &headdbuf); - if (err) + if (err != 0) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, @@ -334,8 +321,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) return (err); } -static int -dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +int +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; @@ -355,8 +342,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) return (err); } -static int -dsl_dataset_get_ref(dsl_pool_t 
*dp, uint64_t dsobj, void *tag, +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; @@ -365,11 +352,10 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, int err; dmu_object_info_t doi; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); - if (err) + if (err != 0) return (err); /* Make sure dsobj has the correct object type. */ @@ -388,12 +374,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); - - rw_init(&ds->ds_rwlock, NULL, RW_DEFAULT, NULL); - cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&ds->ds_longholds); bplist_create(&ds->ds_pending_deadlist); dsl_deadlist_open(&ds->ds_deadlist, @@ -403,15 +386,13 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, offsetof(dmu_sendarg_t, dsa_link)); if (err == 0) { - err = dsl_dir_open_obj(dp, + err = dsl_dir_hold_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); } - if (err) { + if (err != 0) { mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); + refcount_destroy(&ds->ds_longholds); bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); @@ -421,8 +402,8 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; - if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_get_ref(dp, + if (ds->ds_phys->ds_prev_snap_obj != 0) { + err = 
dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } @@ -438,29 +419,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { - /* - * In sync context, we're called with either no lock - * or with the write lock. If we're not syncing, - * we're always called with the read lock held. - */ - boolean_t need_lock = - !RW_WRITE_HELD(&dp->dp_config_rwlock) && - dsl_pool_sync_context(dp); - - if (need_lock) - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = dsl_prop_get_ds(ds, - "refreservation", sizeof (uint64_t), 1, - &ds->ds_reserved, NULL); + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &ds->ds_reserved); if (err == 0) { - err = dsl_prop_get_ds(ds, - "refquota", sizeof (uint64_t), 1, - &ds->ds_quota, NULL); + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + &ds->ds_quota); } - - if (need_lock) - rw_exit(&dp->dp_config_rwlock); } else { ds->ds_reserved = ds->ds_quota = 0; } @@ -473,15 +439,13 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) - dsl_dataset_drop_ref(ds->ds_prev, ds); - dsl_dir_close(ds->ds_dir, ds); + dsl_dataset_rele(ds->ds_prev, ds); + dsl_dir_rele(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); + refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); - if (err) { + if (err != 0) { dmu_buf_rele(dbuf, tag); return (err); } @@ -496,170 +460,118 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); - mutex_enter(&ds->ds_lock); - if (!dsl_pool_sync_context(dp) && 
DSL_DATASET_IS_DESTROYED(ds)) { - mutex_exit(&ds->ds_lock); - dmu_buf_rele(ds->ds_dbuf, tag); - return (ENOENT); - } - mutex_exit(&ds->ds_lock); *dsp = ds; return (0); } -static int -dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* - * In syncing context we don't want the rwlock lock: there - * may be an existing writer waiting for sync phase to - * finish. We don't need to worry about such writers, since - * sync phase is single-threaded, so the writer can't be - * doing anything while we are active. - */ - if (dsl_pool_sync_context(dp)) { - ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - return (0); - } - - /* - * Normal users will hold the ds_rwlock as a READER until they - * are finished (i.e., call dsl_dataset_rele()). "Owners" will - * drop their READER lock after they set the ds_owner field. - * - * If the dataset is being destroyed, the destroy thread will - * obtain a WRITER lock for exclusive access after it's done its - * open-context work and then change the ds_owner to - * dsl_reaper once destruction is assured. So threads - * may block here temporarily, until the "destructability" of - * the dataset is determined. - */ - ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); - mutex_enter(&ds->ds_lock); - while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { - rw_exit(&dp->dp_config_rwlock); - cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); - if (DSL_DATASET_IS_DESTROYED(ds)) { - mutex_exit(&ds->ds_lock); - dsl_dataset_drop_ref(ds, tag); - rw_enter(&dp->dp_config_rwlock, RW_READER); - return (ENOENT); - } - /* - * The dp_config_rwlock lives above the ds_lock. And - * we need to check DSL_DATASET_IS_DESTROYED() while - * holding the ds_lock, so we have to drop and reacquire - * the ds_lock here. 
- */ - mutex_exit(&ds->ds_lock); - rw_enter(&dp->dp_config_rwlock, RW_READER); - mutex_enter(&ds->ds_lock); - } - mutex_exit(&ds->ds_lock); - return (0); -} - int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -{ - int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); - - if (err) - return (err); - return (dsl_dataset_hold_ref(*dsp, tag)); -} - -int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, +dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); - if (err) - return (err); - if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { - dsl_dataset_rele(*dsp, tag); - *dsp = NULL; - return (EBUSY); - } - return (0); -} - -int -dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) -{ dsl_dir_t *dd; - dsl_pool_t *dp; const char *snapname; uint64_t obj; int err = 0; - err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); - if (err) + err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); + if (err != 0) return (err); - dp = dd->dd_pool; + ASSERT(dsl_pool_config_held(dp)); obj = dd->dd_phys->dd_head_dataset_obj; - rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj) - err = dsl_dataset_get_ref(dp, obj, tag, dsp); + if (obj != 0) + err = dsl_dataset_hold_obj(dp, obj, tag, dsp); else err = ENOENT; - if (err) - goto out; - - err = dsl_dataset_hold_ref(*dsp, tag); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { - dsl_dataset_t *ds = NULL; + dsl_dataset_t *ds; if (*snapname++ != '@') { dsl_dataset_rele(*dsp, tag); - err = ENOENT; - goto out; + dsl_dir_rele(dd, FTAG); + return (ENOENT); } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); if (err == 0) - err = dsl_dataset_get_ref(dp, obj, tag, &ds); + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); dsl_dataset_rele(*dsp, tag); - ASSERT3U((err == 0), ==, (ds != NULL)); - - if (ds) { + 
if (err == 0) { mutex_enter(&ds->ds_lock); if (ds->ds_snapname[0] == 0) (void) strlcpy(ds->ds_snapname, snapname, sizeof (ds->ds_snapname)); mutex_exit(&ds->ds_lock); - err = dsl_dataset_hold_ref(ds, tag); - *dsp = err ? NULL : ds; + *dsp = ds; } } -out: - rw_exit(&dp->dp_config_rwlock); - dsl_dir_close(dd, FTAG); + + dsl_dir_rele(dd, FTAG); return (err); } int -dsl_dataset_own(const char *name, boolean_t inconsistentok, +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag)) { + dsl_dataset_rele(*dsp, tag); + *dsp = NULL; + return (EBUSY); + } + return (0); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(name, tag, dsp); - if (err) + int err = dsl_dataset_hold(dp, name, tag, dsp); + if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + if (!dsl_dataset_tryown(*dsp, tag)) { dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); } +/* + * See the comment above dsl_pool_hold() for details. In summary, a long + * hold is used to prevent destruction of a dataset while the pool hold + * is dropped, allowing other concurrent operations (e.g. spa_sync()). + * + * The dataset and pool must be held when this function is called. After it + * is called, the pool hold may be released while the dataset is still held + * and accessed. + */ +void +dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + (void) refcount_add(&ds->ds_longholds, tag); +} + +void +dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +{ + (void) refcount_remove(&ds->ds_longholds, tag); +} + +/* Return B_TRUE if there are any long holds on this dataset. 
*/ +boolean_t +dsl_dataset_long_held(dsl_dataset_t *ds) +{ + return (!refcount_is_zero(&ds->ds_longholds)); +} + void dsl_dataset_name(dsl_dataset_t *ds, char *name) { @@ -667,7 +579,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) (void) strcpy(name, "mos"); } else { dsl_dir_name(ds->ds_dir, name); - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); /* @@ -685,90 +597,42 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) } } -static int -dsl_dataset_namelen(dsl_dataset_t *ds) -{ - int result; - - if (ds == NULL) { - result = 3; /* "mos" */ - } else { - result = dsl_dir_namelen(ds->ds_dir); - VERIFY(0 == dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { - ++result; /* adding one for the @-sign */ - if (!MUTEX_HELD(&ds->ds_lock)) { - mutex_enter(&ds->ds_lock); - result += strlen(ds->ds_snapname); - mutex_exit(&ds->ds_lock); - } else { - result += strlen(ds->ds_snapname); - } - } - } - - return (result); -} - -void -dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) -{ - dmu_buf_rele(ds->ds_dbuf, tag); -} - void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { - if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { - rw_exit(&ds->ds_rwlock); - } - dsl_dataset_drop_ref(ds, tag); + dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || - (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; - if (RW_WRITE_HELD(&ds->ds_rwlock)) { - rw_exit(&ds->ds_rwlock); - cv_broadcast(&ds->ds_exclusive_cv); - } mutex_exit(&ds->ds_lock); - if (ds->ds_dbuf) - dsl_dataset_drop_ref(ds, tag); + dsl_dataset_long_rele(ds, tag); + if (ds->ds_dbuf != NULL) + dsl_dataset_rele(ds, tag); else dsl_dataset_evict(NULL, ds); } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) 
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && - (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; - if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) - rw_exit(&ds->ds_rwlock); + dsl_dataset_long_hold(ds, tag); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } -void -dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) -{ - ASSERT3P(owner, ==, ds->ds_owner); - if (!RW_WRITE_HELD(&ds->ds_rwlock)) - rw_enter(&ds->ds_rwlock, RW_WRITER); -} - uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx) @@ -789,7 +653,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); @@ -807,7 +671,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { - dsl_dataset_t *ohds; + dsl_dataset_t *ohds; /* head of the origin snapshot */ dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = @@ -824,7 +688,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_buf_will_dirty(origin->ds_dbuf, tx); origin->ds_phys->ds_num_children++; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); @@ -836,9 +700,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, zap_create(mos, DMU_OT_NEXT_CLONES, 
DMU_OT_NONE, 0, tx); } - VERIFY(0 == zap_add_int(mos, - origin->ds_phys->ds_next_clones_obj, - dsobj, tx)); + VERIFY0(zap_add_int(mos, + origin->ds_phys->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -850,7 +713,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY3U(0, ==, zap_add_int(mos, + VERIFY0(zap_add_int(mos, origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); } } @@ -866,6 +729,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, return (dsobj); } +static void +dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + dsl_dataset_dirty(ds, tx); +} + uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) @@ -874,29 +747,28 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, uint64_t dsobj, ddobj; dsl_dir_t *dd; + ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); - VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); + VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); + dsobj = dsl_dataset_create_sync_dd(dd, origin, + flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. 
*/ - if (origin != NULL) { + if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { dsl_dataset_t *ds; - objset_t *os; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); - bzero(&os->os_zil_header, sizeof (os->os_zil_header)); - dsl_dataset_dirty(ds, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_zero_zil(ds, tx); dsl_dataset_rele(ds, FTAG); } @@ -904,332 +776,6 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, } /* - * The snapshots must all be in the same pool. - */ -int -dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, - nvlist_t *errlist) -{ - int err; - dsl_sync_task_t *dst; - spa_t *spa; - nvpair_t *pair; - dsl_sync_task_group_t *dstg; - - pair = nvlist_next_nvpair(snaps, NULL); - if (pair == NULL) - return (0); - - err = spa_open(nvpair_name(pair), &spa, FTAG); - if (err) - return (err); - dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - dsl_dataset_t *ds; - - err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); - if (err == 0) { - struct dsl_ds_destroyarg *dsda; - - dsl_dataset_make_exclusive(ds, dstg); - dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), - KM_SLEEP); - dsda->ds = ds; - dsda->defer = defer; - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, dsda, dstg, 0); - } else if (err == ENOENT) { - err = 0; - } else { - fnvlist_add_int32(errlist, nvpair_name(pair), err); - break; - } - } - - if (err == 0) - err = dsl_sync_task_group_wait(dstg); - - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - struct dsl_ds_destroyarg *dsda = dst->dst_arg1; - dsl_dataset_t *ds = dsda->ds; - - /* - * Return the snapshots that triggered the error. 
- */ - if (dst->dst_err != 0) { - char name[ZFS_MAXNAMELEN]; - dsl_dataset_name(ds, name); - fnvlist_add_int32(errlist, name, dst->dst_err); - } - ASSERT3P(dsda->rm_origin, ==, NULL); - dsl_dataset_disown(ds, dstg); - kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); - } - - dsl_sync_task_group_destroy(dstg); - spa_close(spa, FTAG); - return (err); - -} - -static boolean_t -dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) -{ - boolean_t might_destroy = B_FALSE; - - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && - DS_IS_DEFER_DESTROY(ds)) - might_destroy = B_TRUE; - mutex_exit(&ds->ds_lock); - - return (might_destroy); -} - -/* - * If we're removing a clone, and these three conditions are true: - * 1) the clone's origin has no other children - * 2) the clone's origin has no user references - * 3) the clone's origin has been marked for deferred destruction - * Then, prepare to remove the origin as part of this sync task group. - */ -static int -dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) -{ - dsl_dataset_t *ds = dsda->ds; - dsl_dataset_t *origin = ds->ds_prev; - - if (dsl_dataset_might_destroy_origin(origin)) { - char *name; - int namelen; - int error; - - namelen = dsl_dataset_namelen(origin) + 1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(origin, name); -#ifdef _KERNEL - error = zfs_unmount_snap(name, NULL); - if (error) { - kmem_free(name, namelen); - return (error); - } -#endif - error = dsl_dataset_own(name, B_TRUE, tag, &origin); - kmem_free(name, namelen); - if (error) - return (error); - dsda->rm_origin = origin; - dsl_dataset_make_exclusive(origin, tag); - } - - return (0); -} - -/* - * ds must be opened as OWNER. On return (whether successful or not), - * ds will be closed and caller can no longer dereference it. 
- */ -int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) -{ - int err; - dsl_sync_task_group_t *dstg; - objset_t *os; - dsl_dir_t *dd; - uint64_t obj; - struct dsl_ds_destroyarg dsda = { 0 }; - - dsda.ds = ds; - - if (dsl_dataset_is_snapshot(ds)) { - /* Destroying a snapshot is simpler */ - dsl_dataset_make_exclusive(ds, tag); - - dsda.defer = defer; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - &dsda, tag, 0); - ASSERT3P(dsda.rm_origin, ==, NULL); - goto out; - } else if (defer) { - err = EINVAL; - goto out; - } - - dd = ds->ds_dir; - - if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { - /* - * Check for errors and mark this ds as inconsistent, in - * case we crash while freeing the objects. - */ - err = dsl_sync_task_do(dd->dd_pool, - dsl_dataset_destroy_begin_check, - dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) - goto out; - - err = dmu_objset_from_ds(ds, &os); - if (err) - goto out; - - /* - * Remove all objects while in the open context so that - * there is less work to do in the syncing context. - */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); - } - if (err != ESRCH) - goto out; - - /* - * Sync out all in-flight IO. - */ - txg_wait_synced(dd->dd_pool, 0); - - /* - * If we managed to free all the objects in open - * context, the user space accounting should be zero. 
- */ - if (ds->ds_phys->ds_bp.blk_fill == 0 && - dmu_objset_userused_enabled(os)) { - ASSERTV(uint64_t count); - - ASSERT(zap_count(os, DMU_USERUSED_OBJECT, - &count) != 0 || count == 0); - ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, - &count) != 0 || count == 0); - } - } - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - if (err) - goto out; - - /* - * Blow away the dsl_dir + head dataset. - */ - dsl_dataset_make_exclusive(ds, tag); - /* - * If we're removing a clone, we might also need to remove its - * origin. - */ - do { - dsda.need_prep = B_FALSE; - if (dsl_dir_is_clone(dd)) { - err = dsl_dataset_origin_rm_prep(&dsda, tag); - if (err) { - dsl_dir_close(dd, FTAG); - goto out; - } - } - - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, &dsda, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - - /* - * We could be racing against 'zfs release' or 'zfs destroy -d' - * on the origin snap, in which case we can get EBUSY if we - * needed to destroy the origin snap but were not ready to - * do so. 
- */ - if (dsda.need_prep) { - ASSERT(err == EBUSY); - ASSERT(dsl_dir_is_clone(dd)); - ASSERT(dsda.rm_origin == NULL); - } - } while (dsda.need_prep); - - if (dsda.rm_origin != NULL) - dsl_dataset_disown(dsda.rm_origin, tag); - - /* if it is successful, dsl_dir_destroy_sync will close the dd */ - if (err) - dsl_dir_close(dd, FTAG); - -out: - dsl_dataset_disown(ds, tag); - return (err); -} - -blkptr_t * -dsl_dataset_get_blkptr(dsl_dataset_t *ds) -{ - return (&ds->ds_phys->ds_bp); -} - -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_bp = *bp; - } -} - -spa_t * -dsl_dataset_get_spa(dsl_dataset_t *ds) -{ - return (ds->ds_dir->dd_pool->dp_spa); -} - -void -dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp; - - if (ds == NULL) /* this is the meta-objset */ - return; - - ASSERT(ds->ds_objset != NULL); - - if (ds->ds_phys->ds_next_snap_obj != 0) - panic("dirtying snapshot!"); - - dp = ds->ds_dir->dd_pool; - - if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, ds); - } -} - -boolean_t -dsl_dataset_is_dirty(dsl_dataset_t *ds) -{ - int t; - - for (t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, - ds, t)) - return (B_TRUE); - } - return (B_FALSE); -} - -/* * The unique space in the head dataset can be calculated by subtracting * the space used in the most recent snapshot, that is still being used * in this file system, from the space currently in use. To figure out @@ -1237,7 +783,7 @@ dsl_dataset_is_dirty(dsl_dataset_t *ds) * the total space used in the snapshot and subtract out the space that * has been freed up since the snapshot was taken. 
*/ -static void +void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { uint64_t mrs_used; @@ -1261,234 +807,10 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } -struct killarg { - dsl_dataset_t *ds; - dmu_tx_t *tx; -}; - -/* ARGSUSED */ -static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct killarg *ka = arg; - dmu_tx_t *tx = ka->tx; - - if (bp == NULL) - return (0); - - if (zb->zb_level == ZB_ZIL_LEVEL) { - ASSERT(zilog != NULL); - /* - * It's a block in the intent log. It has no - * accounting, so just free it. - */ - dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); - } else { - ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); - } - - return (0); -} - -/* ARGSUSED */ -static int -dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; - - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) - */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EBUSY); - - /* - * This is really a dsl_dir thing, but check it here so that - * we'll be less likely to leave this dataset inconsistent & - * nearly destroyed. 
- */ - err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - /* Mark it as inconsistent on-disk, in case we crash */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_log_internal_ds(ds, "destroy begin", tx, ""); -} - -static int -dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, +void +dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { - dsl_dataset_t *ds = dsda->ds; - dsl_dataset_t *ds_prev = ds->ds_prev; - - if (dsl_dataset_might_destroy_origin(ds_prev)) { - struct dsl_ds_destroyarg ndsda = {0}; - - /* - * If we're not prepared to remove the origin, don't remove - * the clone either. - */ - if (dsda->rm_origin == NULL) { - dsda->need_prep = B_TRUE; - return (EBUSY); - } - - ndsda.ds = ds_prev; - ndsda.is_origin_rm = B_TRUE; - return (dsl_dataset_destroy_check(&ndsda, tag, tx)); - } - - /* - * If we're not going to remove the origin after all, - * undo the open context setup. - */ - if (dsda->rm_origin != NULL) { - dsl_dataset_disown(dsda->rm_origin, tag); - dsda->rm_origin = NULL; - } - - return (0); -} - -/* - * If you add new checks here, you may need to add - * additional checks to the "temporary" case in - * snapshot_check() in dmu_objset.c. - */ -/* ARGSUSED */ -int -dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - struct dsl_ds_destroyarg *dsda = arg1; - dsl_dataset_t *ds = dsda->ds; - - /* we have an owner hold, so noone else can destroy us */ - ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - - /* - * Only allow deferred destroy on pools that support it. - * NOTE: deferred destroy is only supported on snapshots. 
- */ - if (dsda->defer) { - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_USERREFS) - return (ENOTSUP); - ASSERT(dsl_dataset_is_snapshot(ds)); - return (0); - } - - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) - */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EBUSY); - - /* - * If we made changes this txg, traverse_dsl_dataset won't find - * them. Try again. - */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - if (dsl_dataset_is_snapshot(ds)) { - /* - * If this snapshot has an elevated user reference count, - * we can't destroy it yet. - */ - if (ds->ds_userrefs > 0 && !dsda->releasing) - return (EBUSY); - - mutex_enter(&ds->ds_lock); - /* - * Can't delete a branch point. However, if we're destroying - * a clone and removing its origin due to it having a user - * hold count of 0 and having been marked for deferred destroy, - * it's OK for the origin to have a single clone. - */ - if (ds->ds_phys->ds_num_children > - (dsda->is_origin_rm ? 2 : 1)) { - mutex_exit(&ds->ds_lock); - return (EEXIST); - } - mutex_exit(&ds->ds_lock); - } else if (dsl_dir_is_clone(ds->ds_dir)) { - return (dsl_dataset_origin_check(dsda, arg2, tx)); - } - - /* XXX we should do some i/o error checking... 
*/ - return (0); -} - -struct refsarg { - kmutex_t lock; - boolean_t gone; - kcondvar_t cv; -}; - -/* ARGSUSED */ -static void -dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) -{ - struct refsarg *arg = argv; - - mutex_enter(&arg->lock); - arg->gone = TRUE; - cv_signal(&arg->cv); - mutex_exit(&arg->lock); -} - -static void -dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) -{ - struct refsarg arg; - - mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); - arg.gone = FALSE; - (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, - dsl_dataset_refs_gone); - dmu_buf_rele(ds->ds_dbuf, tag); - mutex_enter(&arg.lock); - while (!arg.gone) - cv_wait(&arg.cv, &arg.lock); - ASSERT(arg.gone); - mutex_exit(&arg.lock); - ds->ds_dbuf = NULL; - ds->ds_phys = NULL; - mutex_destroy(&arg.lock); - cv_destroy(&arg.cv); -} - -static void -remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) -{ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; ASSERTV(uint64_t count); @@ -1505,491 +827,71 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) * too many entries in the next_clones_obj even after failing to * remove this one. */ - if (err != ENOENT) { + if (err != ENOENT) VERIFY0(err); - } - ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); } -static void -dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - - /* - * If it is the old version, dd_clones doesn't exist so we can't - * find the clones, but deadlist_remove_key() is a no-op so it - * doesn't matter. 
- */ - if (ds->ds_dir->dd_phys->dd_clones == 0) - return; - - for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - if (clone->ds_dir->dd_origin_txg > mintxg) { - dsl_deadlist_remove_key(&clone->ds_deadlist, - mintxg, tx); - dsl_dataset_remove_clones_key(clone, mintxg, tx); - } - dsl_dataset_rele(clone, FTAG); - } - zap_cursor_fini(&zc); +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) +{ + return (&ds->ds_phys->ds_bp); } -struct process_old_arg { - dsl_dataset_t *ds; - dsl_dataset_t *ds_prev; - boolean_t after_branch_point; - zio_t *pio; - uint64_t used, comp, uncomp; -}; - -static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +void +dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { - struct process_old_arg *poa = arg; - dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; - - if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); - if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > - poa->ds_prev->ds_phys->ds_prev_snap_txg) { - poa->ds_prev->ds_phys->ds_unique_bytes += - bp_get_dsize_sync(dp->dp_spa, bp); - } + ASSERT(dmu_tx_is_syncing(tx)); + /* If it's the meta-objset, set dp_meta_rootbp */ + if (ds == NULL) { + tx->tx_pool->dp_meta_rootbp = *bp; } else { - poa->used += bp_get_dsize_sync(dp->dp_spa, bp); - poa->comp += BP_GET_PSIZE(bp); - poa->uncomp += BP_GET_UCSIZE(bp); - dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_bp = *bp; } - return (0); -} - -static void -process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, - dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) -{ - struct process_old_arg poa = { 0 }; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = 
dp->dp_meta_objset; - - ASSERT(ds->ds_deadlist.dl_oldfmt); - ASSERT(ds_next->ds_deadlist.dl_oldfmt); - - poa.ds = ds; - poa.ds_prev = ds_prev; - poa.after_branch_point = after_branch_point; - poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, - process_old_cb, &poa, tx)); - VERIFY0(zio_wait(poa.pio)); - ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); - - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -poa.used, -poa.comp, -poa.uncomp, tx); - - /* swap next's deadlist to our deadlist */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_close(&ds_next->ds_deadlist); - SWITCH64(ds_next->ds_phys->ds_deadlist_obj, - ds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj); } -static int -old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) { - int err; - struct killarg ka; - - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. 
- */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, - ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, - kill_blkptr, &ka); - ASSERT0(err); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); - - return (err); + return (ds->ds_dir->dd_pool->dp_spa); } void -dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) { - struct dsl_ds_destroyarg *dsda = arg1; - dsl_dataset_t *ds = dsda->ds; - int err = 0; - int after_branch_point = FALSE; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - dsl_dataset_t *ds_prev = NULL; - boolean_t wont_destroy; - uint64_t obj; - - wont_destroy = (dsda->defer && - (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); - - ASSERT(ds->ds_owner || wont_destroy); - ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); - ASSERT(ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); - ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + dsl_pool_t *dp; - if (wont_destroy) { - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); + if (ds == NULL) /* this is the meta-objset */ return; - } - - /* We need to log before removing it from the namespace. 
*/ - spa_history_log_internal_ds(ds, "destroy", tx, ""); - - /* signal any waiters that this dataset is going away */ - mutex_enter(&ds->ds_lock); - ds->ds_owner = dsl_reaper; - cv_broadcast(&ds->ds_exclusive_cv); - mutex_exit(&ds->ds_lock); - - /* Remove our reservation */ - if (ds->ds_reserved != 0) { - dsl_prop_setarg_t psa; - uint64_t value = 0; - - dsl_prop_setarg_init_uint64(&psa, "refreservation", - (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), - &value); - psa.psa_effective_value = 0; /* predict default value */ - - dsl_dataset_set_reservation_sync(ds, &psa, tx); - ASSERT0(ds->ds_reserved); - } - - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - - dsl_scan_ds_destroyed(ds, tx); - - obj = ds->ds_object; - - if (ds->ds_phys->ds_prev_snap_obj != 0) { - if (ds->ds_prev) { - ds_prev = ds->ds_prev; - } else { - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); - } - after_branch_point = - (ds_prev->ds_phys->ds_next_snap_obj != obj); - - dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); - if (after_branch_point && - ds_prev->ds_phys->ds_next_clones_obj != 0) { - remove_from_next_clones(ds_prev, obj, tx); - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(0 == zap_add_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, - ds->ds_phys->ds_next_snap_obj, tx)); - } - } - if (after_branch_point && - ds->ds_phys->ds_next_snap_obj == 0) { - /* This clone is toast. */ - ASSERT(ds_prev->ds_phys->ds_num_children > 1); - ds_prev->ds_phys->ds_num_children--; - - /* - * If the clone's origin has no other clones, no - * user holds, and has been marked for deferred - * deletion, then we should have done the necessary - * destroy setup for it. 
- */ - if (ds_prev->ds_phys->ds_num_children == 1 && - ds_prev->ds_userrefs == 0 && - DS_IS_DEFER_DESTROY(ds_prev)) { - ASSERT3P(dsda->rm_origin, !=, NULL); - } else { - ASSERT3P(dsda->rm_origin, ==, NULL); - } - } else if (!after_branch_point) { - ds_prev->ds_phys->ds_next_snap_obj = - ds->ds_phys->ds_next_snap_obj; - } - } - - if (dsl_dataset_is_snapshot(ds)) { - dsl_dataset_t *ds_next; - uint64_t old_unique; - uint64_t used = 0, comp = 0, uncomp = 0; - - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); - ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - - old_unique = ds_next->ds_phys->ds_unique_bytes; - - dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - ds_next->ds_phys->ds_prev_snap_obj = - ds->ds_phys->ds_prev_snap_obj; - ds_next->ds_phys->ds_prev_snap_txg = - ds->ds_phys->ds_prev_snap_txg; - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); - - - if (ds_next->ds_deadlist.dl_oldfmt) { - process_old_deadlist(ds, ds_prev, ds_next, - after_branch_point, tx); - } else { - /* Adjust prev's unique space. */ - if (ds_prev && !after_branch_point) { - dsl_deadlist_space_range(&ds_next->ds_deadlist, - ds_prev->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_prev_snap_txg, - &used, &comp, &uncomp); - ds_prev->ds_phys->ds_unique_bytes += used; - } - - /* Adjust snapused. */ - dsl_deadlist_space_range(&ds_next->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &used, &comp, &uncomp); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -comp, -uncomp, tx); - - /* Move blocks to be freed to pool's free list. */ - dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, - &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, - tx); - dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, - DD_USED_HEAD, used, comp, uncomp, tx); - - /* Merge our deadlist into next's and free it. 
*/ - dsl_deadlist_merge(&ds_next->ds_deadlist, - ds->ds_phys->ds_deadlist_obj, tx); - } - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); - /* Collapse range in clone heads */ - dsl_dataset_remove_clones_key(ds, - ds->ds_phys->ds_creation_txg, tx); - - if (dsl_dataset_is_snapshot(ds_next)) { - dsl_dataset_t *ds_nextnext; - dsl_dataset_t *hds; - - /* - * Update next's unique to include blocks which - * were previously shared by only this snapshot - * and it. Those blocks will be born after the - * prev snap and before this snap, and will have - * died after the next snap and before the one - * after that (ie. be on the snap after next's - * deadlist). - */ - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, - FTAG, &ds_nextnext)); - dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_creation_txg, - &used, &comp, &uncomp); - ds_next->ds_phys->ds_unique_bytes += used; - dsl_dataset_rele(ds_nextnext, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); - - /* Collapse range in this head. */ - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &hds)); - dsl_deadlist_remove_key(&hds->ds_deadlist, - ds->ds_phys->ds_creation_txg, tx); - dsl_dataset_rele(hds, FTAG); - - } else { - ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); - ds_next->ds_prev = NULL; - if (ds_prev) { - VERIFY(0 == dsl_dataset_get_ref(dp, - ds->ds_phys->ds_prev_snap_obj, - ds_next, &ds_next->ds_prev)); - } - - dsl_dataset_recalc_head_uniq(ds_next); - - /* - * Reduce the amount of our unconsmed refreservation - * being charged to our parent by the amount of - * new unique data we have gained. 
- */ - if (old_unique < ds_next->ds_reserved) { - int64_t mrsdelta; - uint64_t new_unique = - ds_next->ds_phys->ds_unique_bytes; - - ASSERT(old_unique <= new_unique); - mrsdelta = MIN(new_unique - old_unique, - ds_next->ds_reserved - old_unique); - dsl_dir_diduse_space(ds->ds_dir, - DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); - } - } - dsl_dataset_rele(ds_next, FTAG); - } else { - zfeature_info_t *async_destroy = - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; - objset_t *os; - - /* - * There's no next snapshot, so this is a head dataset. - * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty. (If it's a clone, it's - * safe to ignore the deadlist contents.) - */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = 0; - - VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { - err = old_synchronous_dataset_destroy(ds, tx); - } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. 
- */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { - spa_feature_incr(dp->dp_spa, async_destroy, tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx) == 0); - } - - used = ds->ds_dir->dd_phys->dd_used_bytes; - comp = ds->ds_dir->dd_phys->dd_compressed_bytes; - uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == used); - - bptree_add(mos, dp->dp_bptree_obj, - &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, - used, comp, uncomp, tx); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); - } + ASSERT(ds->ds_objset != NULL); - if (ds->ds_prev != NULL) { - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY3U(0, ==, zap_remove_int(mos, - ds->ds_prev->ds_dir->dd_phys->dd_clones, - ds->ds_object, tx)); - } - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = ds_prev = NULL; - } - } + if (ds->ds_phys->ds_next_snap_obj != 0) + panic("dirtying snapshot!"); - /* - * This must be done after the dsl_traverse(), because it will - * re-open the objset. 
- */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } + dp = ds->ds_dir->dd_pool; - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dir */ - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } else { - /* remove from snapshot namespace */ - dsl_dataset_t *ds_head; - ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); - VERIFY(0 == dsl_dataset_get_snapname(ds)); -#ifdef ZFS_DEBUG - { - uint64_t val; - - err = dsl_dataset_snap_lookup(ds_head, - ds->ds_snapname, &val); - ASSERT0(err); - ASSERT3U(val, ==, obj); - } -#endif - err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); - ASSERT(err == 0); - dsl_dataset_rele(ds_head, FTAG); + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); } +} - if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_rele(ds_prev, FTAG); - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - - if (ds->ds_phys->ds_next_clones_obj != 0) { - ASSERTV(uint64_t count); - ASSERT(0 == zap_count(mos, - ds->ds_phys->ds_next_clones_obj, &count) && count == 0); - VERIFY(0 == dmu_object_free(mos, - ds->ds_phys->ds_next_clones_obj, tx)); - } - if (ds->ds_phys->ds_props_obj != 0) - VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); - if (ds->ds_phys->ds_userrefs_obj != 0) - VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); - dsl_dir_close(ds->ds_dir, ds); - ds->ds_dir = NULL; - dsl_dataset_drain_refs(ds, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); - - if (dsda->rm_origin) { - /* - * Remove the origin of the clone we just destroyed. 
- */ - struct dsl_ds_destroyarg ndsda = {0}; +boolean_t +dsl_dataset_is_dirty(dsl_dataset_t *ds) +{ + int t; - ndsda.ds = dsda->rm_origin; - dsl_dataset_destroy_sync(&ndsda, tag, tx); + for (t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, + ds, t)) + return (B_TRUE); } + return (B_FALSE); } static int @@ -2020,13 +922,24 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) return (0); } +typedef struct dsl_dataset_snapshot_arg { + nvlist_t *ddsa_snaps; + nvlist_t *ddsa_props; + nvlist_t *ddsa_errors; +} dsl_dataset_snapshot_arg_t; + int -dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname, +dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx) { - int err; + int error; uint64_t value; + ds->ds_trysnap_txg = tx->tx_txg; + + if (!dmu_tx_is_syncing(tx)) + return (0); + /* * We don't allow multiple snapshots of the same txg. If there * is already one, try again. @@ -2037,29 +950,65 @@ dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname, /* * Check for conflicting snapshot name. */ - err = dsl_dataset_snap_lookup(ds, snapname, &value); - if (err == 0) + error = dsl_dataset_snap_lookup(ds, snapname, &value); + if (error == 0) return (EEXIST); - if (err != ENOENT) - return (err); - - /* - * Check that the dataset's name is not too long. 
Name consists - * of the dataset's length + 1 for the @-sign + snapshot name's length - */ - if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) - return (ENAMETOOLONG); + if (error != ENOENT) + return (error); - err = dsl_dataset_snapshot_reserve_space(ds, tx); - if (err) - return (err); + error = dsl_dataset_snapshot_reserve_space(ds, tx); + if (error != 0) + return (error); - ds->ds_trysnap_txg = tx->tx_txg; return (0); } +static int +dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *name, *atp; + char dsname[MAXNAMELEN]; + + name = nvpair_name(pair); + if (strlen(name) >= MAXNAMELEN) + error = ENAMETOOLONG; + if (error == 0) { + atp = strchr(name, '@'); + if (atp == NULL) + error = EINVAL; + if (error == 0) + (void) strlcpy(dsname, name, atp - name + 1); + } + if (error == 0) + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_snapshot_check_impl(ds, + atp + 1, tx); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) { + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + } + rv = error; + } + } + return (rv); +} + void -dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, +dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -2067,9 +1016,20 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; - int err; + ASSERTV(static zil_header_t zero_zil); + ASSERTV(objset_t *os); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* + * If we are on an old pool, the zil must not be active, in which 
+ * case it will be zeroed. Usually zil_suspend() accomplishes this. + */ + ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || + dmu_objset_from_ds(ds, &os) != 0 || + bcmp(&os->os_phys->os_zil_header, &zero_zil, + sizeof (zero_zil)) == 0); - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); /* * The origin's ds_creation_txg has to be < TXG_INITIAL @@ -2081,7 +1041,7 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); @@ -2116,9 +1076,9 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { - remove_from_next_clones(ds->ds_prev, + dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); - VERIFY3U(0, ==, zap_add_int(mos, + VERIFY0(zap_add_int(mos, next_clones_obj, dsobj, tx)); } } @@ -2137,9 +1097,6 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, } dmu_buf_will_dirty(ds->ds_dbuf, tx); - zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", - ds->ds_dir->dd_myname, snapname, dsobj, - ds->ds_phys->ds_prev_snap_txg); ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); @@ -2154,13 +1111,12 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &dsobj, tx); - ASSERT(err == 0); + VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, 
&dsobj, tx)); if (ds->ds_prev) - dsl_dataset_drop_ref(ds->ds_prev, ds); - VERIFY(0 == dsl_dataset_get_ref(dp, + dsl_dataset_rele(ds->ds_prev, ds); + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); @@ -2170,6 +1126,198 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); } +static void +dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + dsl_dataset_t *ds; + char *name, *atp; + char dsname[MAXNAMELEN]; + + name = nvpair_name(pair); + atp = strchr(name, '@'); + (void) strlcpy(dsname, name, atp - name + 1); + VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); + if (ddsa->ddsa_props != NULL) { + dsl_props_set_sync_impl(ds->ds_prev, + ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); + } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The snapshots must all be in the same pool. + * All-or-nothing: if there are any failures, nothing will be modified. 
+ */ +int +dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) +{ + dsl_dataset_snapshot_arg_t ddsa; + nvpair_t *pair; + boolean_t needsuspend; + int error; + spa_t *spa; + char *firstname; + nvlist_t *suspended = NULL; + + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + firstname = nvpair_name(pair); + + error = spa_open(firstname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + suspended = fnvlist_alloc(); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char fsname[MAXNAMELEN]; + char *snapname = nvpair_name(pair); + char *atp; + void *cookie; + + atp = strchr(snapname, '@'); + if (atp == NULL) { + error = EINVAL; + break; + } + (void) strlcpy(fsname, snapname, atp - snapname + 1); + + error = zil_suspend(fsname, &cookie); + if (error != 0) + break; + fnvlist_add_uint64(suspended, fsname, + (uintptr_t)cookie); + } + } + + ddsa.ddsa_snaps = snaps; + ddsa.ddsa_props = props; + ddsa.ddsa_errors = errors; + + if (error == 0) { + error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, + fnvlist_num_pairs(snaps) * 3); + } + + if (suspended != NULL) { + for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; + pair = nvlist_next_nvpair(suspended, pair)) { + zil_resume((void *)(uintptr_t) + fnvpair_value_uint64(pair)); + } + fnvlist_free(suspended); + } + + return (error); +} + +typedef struct dsl_dataset_snapshot_tmp_arg { + const char *ddsta_fsname; + const char *ddsta_snapname; + minor_t ddsta_cleanup_minor; + const char *ddsta_htag; +} dsl_dataset_snapshot_tmp_arg_t; + +static int +dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, 
ddsta->ddsta_fsname, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, + B_TRUE, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); + dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, + ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); + dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag) +{ + dsl_dataset_snapshot_tmp_arg_t ddsta; + int error; + spa_t *spa; + boolean_t needsuspend; + void *cookie; + + ddsta.ddsta_fsname = fsname; + ddsta.ddsta_snapname = snapname; + ddsta.ddsta_cleanup_minor = cleanup_minor; + ddsta.ddsta_htag = htag; + + error = spa_open(fsname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + error = zil_suspend(fsname, &cookie); + if (error != 0) + return (error); + } + + error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, + dsl_dataset_snapshot_tmp_sync, &ddsta, 3); + + if (needsuspend) + zil_resume(cookie); + return (error); +} + + void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { @@ -2194,64 +1342,48 @@ get_clones_stat(dsl_dataset_t *ds, 
nvlist_t *nv) objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; - nvlist_t *propval; - nvlist_t *val; + nvlist_t *propval = fnvlist_alloc(); + nvlist_t *val = fnvlist_alloc(); - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* - * There may me missing entries in ds_next_clones_obj + * There may be missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ if (ds->ds_phys->ds_next_clones_obj != 0) { - ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); } - if (count != ds->ds_phys->ds_num_children - 1) { + if (count != ds->ds_phys->ds_num_children - 1) goto fail; - } for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; - /* - * Even though we hold the dp_config_rwlock, the dataset - * may fail to open, returning ENOENT. If there is a - * thread concurrently attempting to destroy this - * dataset, it will have the ds_rwlock held for - * RW_WRITER. Our call to dsl_dataset_hold_obj() -> - * dsl_dataset_hold_ref() will fail its - * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the - * dp_config_rwlock, and wait for the destroy progress - * and signal ds_exclusive_cv. If the destroy was - * successful, we will see that - * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 
- */ - if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone) != 0) - continue; + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); dsl_dir_name(clone->ds_dir, buf); - VERIFY(nvlist_add_boolean(val, buf) == 0); + fnvlist_add_boolean(val, buf); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); - VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), - propval) == 0); + fnvlist_add_nvlist(propval, ZPROP_VALUE, val); + fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); fail: nvlist_free(val); nvlist_free(propval); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { uint64_t refd, avail, uobjs, aobjs, ratio; + ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); + + ASSERT(dsl_pool_config_held(dp)); ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : (ds->ds_phys->ds_uncompressed_bytes * 100 / @@ -2297,10 +1429,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_dataset_t *prev; int err; - rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - rw_exit(&dp->dp_config_rwlock); if (err == 0) { err = dsl_dataset_space_written(prev, ds, &written, &comp, &uncomp); @@ -2317,6 +1447,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = ds->ds_phys->ds_guid; @@ -2328,16 +1461,14 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(ds->ds_dir)) { 
dsl_dataset_t *ods; - VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); - dsl_dataset_drop_ref(ods, FTAG); + dsl_dataset_rele(ods, FTAG); } - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } } @@ -2375,8 +1506,7 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) { ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > @@ -2398,237 +1528,225 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) return (B_FALSE); } +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + /* ARGSUSED */ static int -dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) { - dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - dsl_dataset_t *hds; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + int error; uint64_t val; - int err; - err = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); - if (err) - return (err); - - /* new name better not be in use */ - err = dsl_dataset_snap_lookup(hds, newsnapname, &val); - dsl_dataset_rele(hds, FTAG); + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + if (error != 0) { + /* ignore nonexistent snapshots */ + return (error == ENOENT ? 
0 : error); + } - if (err == 0) - err = EEXIST; - else if (err == ENOENT) - err = 0; + /* new name should not exist */ + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ - if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) - err = ENAMETOOLONG; + if (dsl_dir_namelen(hds->ds_dir) + 1 + + strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN) + error = ENAMETOOLONG; - return (err); + return (error); } -static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +static int +dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - const char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; - int err; - - ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + int error; - VERIFY(0 == dsl_dataset_get_snapname(ds)); - err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); - ASSERT0(err); - mutex_enter(&ds->ds_lock); - (void) strcpy(ds->ds_snapname, newsnapname); - mutex_exit(&ds->ds_lock); - err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &ds->ds_object, tx); - ASSERT0(err); + error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); + if (error != 0) + return (error); - spa_history_log_internal_ds(ds, "rename", tx, - "-> @%s", newsnapname); + if (ddrsa->ddrsa_recursive) { + error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_check_impl, ddrsa, + DS_FIND_CHILDREN); + } else { + error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); + } dsl_dataset_rele(hds, FTAG); + return (error); } -struct renamesnaparg { - 
dsl_sync_task_group_t *dstg; - char failed[MAXPATHLEN]; - char *oldsnap; - char *newsnap; -}; - static int -dsl_snapshot_rename_one(const char *name, void *arg) +dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) { - struct renamesnaparg *ra = arg; - dsl_dataset_t *ds = NULL; - char *snapname; - int err; - - snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); - (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_dataset_t *ds; + uint64_t val; + dmu_tx_t *tx = ddrsa->ddrsa_tx; + int error; - /* - * For recursive snapshot renames the parent won't be changing - * so we just pass name for both the to/from argument. - */ - err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); - if (err != 0) { - strfree(snapname); - return (err == ENOENT ? 0 : err); + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) { + /* ignore nonexistent snapshots */ + return (0); } -#ifdef _KERNEL - /* - * For all filesystems undergoing rename, we'll need to unmount it. - */ - (void) zfs_unmount_snap(snapname, NULL); -#endif - err = dsl_dataset_hold(snapname, ra->dstg, &ds); - strfree(snapname); - if (err != 0) - return (err == ENOENT ? 
0 : err); + VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); - dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); + /* log before we change the name */ + spa_history_log_internal_ds(ds, "rename", tx, + "-> @%s", ddrsa->ddrsa_newsnapname); + + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx)); + mutex_enter(&ds->ds_lock); + (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); + mutex_exit(&ds->ds_lock); + VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + dsl_dataset_rele(ds, FTAG); return (0); } -static int -dsl_recursive_rename(char *oldname, const char *newname) +static void +dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { - int err; - struct renamesnaparg *ra; - dsl_sync_task_t *dst; - spa_t *spa; - char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname) + 1; - - /* truncate the snapshot name to get the fsname */ - cp = strchr(fsname, '@'); - *cp = '\0'; - - err = spa_open(fsname, &spa, FTAG); - if (err) { - kmem_free(fsname, len); - return (err); - } - ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); - ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - - ra->oldsnap = strchr(oldname, '@') + 1; - ra->newsnap = strchr(newname, '@') + 1; - *ra->failed = '\0'; - - err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, - DS_FIND_CHILDREN); - kmem_free(fsname, len); - - if (err == 0) { - err = dsl_sync_task_group_wait(ra->dstg); - } + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; - for (dst = list_head(&ra->dstg->dstg_tasks); dst; - dst = list_next(&ra->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - if (dst->dst_err) { - dsl_dir_name(ds->ds_dir, ra->failed); - (void) strlcat(ra->failed, "@", sizeof (ra->failed)); - (void) strlcat(ra->failed, ra->newsnap, - sizeof (ra->failed)); - } 
- dsl_dataset_rele(ds, ra->dstg); + VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); + ddrsa->ddrsa_tx = tx; + if (ddrsa->ddrsa_recursive) { + VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_sync_impl, ddrsa, + DS_FIND_CHILDREN)); + } else { + VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); } - - if (err) - (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); - - dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamesnaparg)); - spa_close(spa, FTAG); - return (err); + dsl_dataset_rele(hds, FTAG); } -static int -dsl_valid_rename(const char *oldname, void *arg) +int +dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive) { - int delta = *(int *)arg; + dsl_dataset_rename_snapshot_arg_t ddrsa; - if (strlen(oldname) + delta >= MAXNAMELEN) - return (ENAMETOOLONG); + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = recursive; - return (0); + return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, 1)); } -#pragma weak dmu_objset_rename = dsl_dataset_rename -int -dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) +static int +dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd; + const char *fsname = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; - const char *tail; - int err; + int64_t unused_refres_delta; + int error; - err = dsl_dir_open(oldname, FTAG, &dd, &tail); - if (err) - return (err); + error = dsl_dataset_hold(dp, fsname, FTAG, &ds); + if (error != 0) + return (error); + + /* must not be a snapshot */ + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } - if (tail == NULL) { - int delta = strlen(newname) - strlen(oldname); + /* must have a most recent snapshot */ + if 
(ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } - /* if we're growing, validate child name lengths */ - if (delta > 0) - err = dmu_objset_find(oldname, dsl_valid_rename, - &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (dsl_dataset_long_held(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EBUSY); + } - if (err == 0) - err = dsl_dir_rename(dd, newname); - dsl_dir_close(dd, FTAG); - return (err); + /* + * Check if the snap we are rolling back to uses more than + * the refquota. + */ + if (ds->ds_quota != 0 && + ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) { + dsl_dataset_rele(ds, FTAG); + return (EDQUOT); } - if (tail[0] != '@') { - /* the name ended in a nonexistent component */ - dsl_dir_close(dd, FTAG); - return (ENOENT); + /* + * When we do the clone swap, we will temporarily use more space + * due to the refreservation (the head will no longer have any + * unique space, so the entire amount of the refreservation will need + * to be free). We will immediately destroy the clone, freeing + * this space, but the freeing happens over many txg's. 
+ */ + unused_refres_delta = (int64_t)MIN(ds->ds_reserved, + ds->ds_phys->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { + dsl_dataset_rele(ds, FTAG); + return (ENOSPC); } - dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); + return (0); +} - /* new name must be snapshot in same filesystem */ - tail = strchr(newname, '@'); - if (tail == NULL) - return (EINVAL); - tail++; - if (strncmp(oldname, newname, tail - newname) != 0) - return (EXDEV); +static void +dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) +{ + const char *fsname = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds, *clone; + uint64_t cloneobj; - if (recursive) { - err = dsl_recursive_rename(oldname, newname); - } else { - err = dsl_dataset_hold(oldname, FTAG, &ds); - if (err) - return (err); + VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds)); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); + cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", + ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); - dsl_dataset_rele(ds, FTAG); - } + VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); - return (err); + dsl_dataset_clone_swap_sync_impl(clone, ds, tx); + dsl_dataset_zero_zil(ds, tx); + + dsl_destroy_head_sync_impl(clone, tx); + + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_rollback(const char *fsname) +{ + return (dsl_sync_task(fsname, dsl_dataset_rollback_check, + dsl_dataset_rollback_sync, (void *)fsname, 1)); } struct promotenode { @@ -2636,48 +1754,66 @@ struct promotenode { dsl_dataset_t *ds; }; -struct promotearg { +typedef struct dsl_dataset_promote_arg { + const char *ddpa_clonename; + dsl_dataset_t *ddpa_clone; list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin; + dsl_dataset_t *origin_origin; /* origin of the origin */ 
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; -}; +} dsl_dataset_promote_arg_t; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); +static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, + void *tag); +static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); static int -dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->shared_snaps); - dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; int err; uint64_t unused; - /* Check that it is a real clone */ - if (!dsl_dir_is_clone(hds->ds_dir)) - return (EINVAL); + err = promote_hold(ddpa, dp, FTAG); + if (err != 0) + return (err); - /* Since this is so expensive, don't do the preliminary check */ - if (!dmu_tx_is_syncing(tx)) - return (0); + hds = ddpa->ddpa_clone; - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) + if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { + promote_rele(ddpa, FTAG); return (EXDEV); + } + + /* + * Compute and check the amount of space to transfer. Since this is + * so expensive, don't do the preliminary check. + */ + if (!dmu_tx_is_syncing(tx)) { + promote_rele(ddpa, FTAG); + return (0); + } + + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; /* compute origin's new unique space */ - snap = list_tail(&pa->clone_snaps); + snap = list_tail(&ddpa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &pa->unique, &unused, &unused); + &ddpa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. 
Consider the incremental changes - * to used for each snapshot: + * to used by each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) @@ -2688,18 +1824,28 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_referenced_bytes; - pa->comp = origin_ds->ds_phys->ds_compressed_bytes; - pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; - for (snap = list_head(&pa->shared_snaps); snap; - snap = list_next(&pa->shared_snaps, snap)) { + ddpa->used = origin_ds->ds_phys->ds_referenced_bytes; + ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes; + ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; + /* + * If there are long holds, we won't be able to evict + * the objset. + */ + if (dsl_dataset_long_held(ds)) { + err = EBUSY; + goto out; + } + /* Check that the snapshot name does not conflict */ - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { + (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname); err = EEXIST; goto out; } @@ -2712,26 +1858,27 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); - pa->used += dlused; - pa->comp += dlcomp; - pa->uncomp += dluncomp; + ddpa->used += dlused; + ddpa->comp += dlcomp; + ddpa->uncomp += dluncomp; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. 
*/ - if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; - pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; - pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; + if (ddpa->origin_origin) { + ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes; + ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes; + ddpa->uncomp -= + ddpa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - pa->used); - if (err) - return (err); + ddpa->used); + if (err != 0) + goto out; /* * Compute the amounts of space that will be used by snapshots @@ -2749,68 +1896,75 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * calls will be fast because they do not have to * iterate over all bps. */ - snap = list_head(&pa->origin_snaps); - err = snaplist_space(&pa->shared_snaps, - snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); - if (err) - return (err); + snap = list_head(&ddpa->origin_snaps); + err = snaplist_space(&ddpa->shared_snaps, + snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); + if (err != 0) + goto out; - err = snaplist_space(&pa->clone_snaps, + err = snaplist_space(&ddpa->clone_snaps, snap->ds->ds_dir->dd_origin_txg, &space); - if (err) - return (err); - pa->cloneusedsnap += space; + if (err != 0) + goto out; + ddpa->cloneusedsnap += space; } if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { - err = snaplist_space(&pa->origin_snaps, - origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); - if (err) - return (err); + err = snaplist_space(&ddpa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap); + if (err != 0) + goto out; } - return (0); out: - pa->err_ds = snap->ds->ds_snapname; + promote_rele(ddpa, FTAG); return (err); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->shared_snaps); - dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; dsl_dataset_t *origin_head; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; + dsl_dir_t *dd; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; - ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); + VERIFY0(promote_hold(ddpa, dp, FTAG)); + hds = ddpa->ddpa_clone; + + ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE); + + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; + dd = hds->ds_dir; - snap = list_head(&pa->origin_snaps); + snap = list_head(&ddpa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ - VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; - snap = list_tail(&pa->clone_snaps); + snap = list_tail(&ddpa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { - remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + dsl_dataset_remove_from_next_clones(origin_ds, + snap->ds->ds_object, tx); + VERIFY0(zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); } @@ -2827,39 +1981,43 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) /* change dd_clone entries */ if 
(spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + VERIFY0(zap_remove_int(dp->dp_meta_objset, odd->dd_phys->dd_clones, hds->ds_object, tx)); - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, - pa->origin_origin->ds_dir->dd_phys->dd_clones, + VERIFY0(zap_add_int(dp->dp_meta_objset, + ddpa->origin_origin->ds_dir->dd_phys->dd_clones, hds->ds_object, tx)); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - pa->origin_origin->ds_dir->dd_phys->dd_clones, + VERIFY0(zap_remove_int(dp->dp_meta_objset, + ddpa->origin_origin->ds_dir->dd_phys->dd_clones, origin_head->ds_object, tx)); if (dd->dd_phys->dd_clones == 0) { dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + VERIFY0(zap_add_int(dp->dp_meta_objset, dd->dd_phys->dd_clones, origin_head->ds_object, tx)); - } /* move snapshots to this dir */ - for (snap = list_head(&pa->shared_snaps); snap; - snap = list_next(&pa->shared_snaps, snap)) { + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; - /* unregister props as dsl_dir is changing */ + /* + * Property callbacks are registered to a particular + * dsl_dir. Since ours is changing, evict the objset + * so that they will be unregistered from the old dsl_dir. 
+ */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } + /* move snap name entry */ - VERIFY(0 == dsl_dataset_get_snapname(ds)); - VERIFY(0 == dsl_dataset_snap_remove(origin_head, + VERIFY0(dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, + VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); @@ -2868,8 +2026,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); - dsl_dir_close(ds->ds_dir, ds); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, + dsl_dir_rele(ds->ds_dir, ds); + VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); /* move any clone references */ @@ -2893,20 +2051,20 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) continue; } - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &cnds)); o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; - VERIFY3U(zap_remove_int(dp->dp_meta_objset, - odd->dd_phys->dd_clones, o, tx), ==, 0); - VERIFY3U(zap_add_int(dp->dp_meta_objset, - dd->dd_phys->dd_clones, o, tx), ==, 0); + VERIFY0(zap_remove_int(dp->dp_meta_objset, + odd->dd_phys->dd_clones, o, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dd->dd_phys->dd_clones, o, tx)); dsl_dataset_rele(cnds, FTAG); } zap_cursor_fini(&zc); } - ASSERT0(dsl_prop_numcb(ds)); + ASSERT(!dsl_prop_hascb(ds)); } /* @@ -2916,31 +2074,31 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) * is true for each of {clone,origin} independently. 
*/ - delta = pa->cloneusedsnap - + delta = ddpa->cloneusedsnap - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); - ASSERT3U(pa->used, >=, delta); + ASSERT3U(ddpa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(dd, DD_USED_HEAD, - pa->used - delta, pa->comp, pa->uncomp, tx); + ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); - delta = pa->originusedsnap - + delta = ddpa->originusedsnap - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); - ASSERT3U(pa->used, >=, -delta); + ASSERT3U(ddpa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, - -pa->used - delta, -pa->comp, -pa->uncomp, tx); + -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - origin_ds->ds_phys->ds_unique_bytes = pa->unique; + origin_ds->ds_phys->ds_unique_bytes = ddpa->unique; /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, ""); - dsl_dir_close(odd, FTAG); + dsl_dir_rele(odd, FTAG); + promote_rele(ddpa, FTAG); } -static char *snaplist_tag = "snaplist"; /* * Make a list of dsl_dataset_t's for the snapshots between first_obj * (exclusive) and last_obj (inclusive). The list will be in reverse @@ -2948,13 +2106,11 @@ static char *snaplist_tag = "snaplist"; * snapshots back to this dataset's origin. 
*/ static int -snaplist_make(dsl_pool_t *dp, boolean_t own, - uint64_t first_obj, uint64_t last_obj, list_t *l) +snaplist_make(dsl_pool_t *dp, + uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) { uint64_t obj = last_obj; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); - list_create(l, sizeof (struct promotenode), offsetof(struct promotenode, link)); @@ -2963,28 +2119,15 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, struct promotenode *snap; int err; - if (own) { - err = dsl_dataset_own_obj(dp, obj, - 0, snaplist_tag, &ds); - if (err == 0) - dsl_dataset_make_exclusive(ds, snaplist_tag); - } else { - err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); - } - if (err == ENOENT) { - /* lost race with snapshot destroy */ - struct promotenode *last = list_tail(l); - ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); - obj = last->ds->ds_phys->ds_prev_snap_obj; - continue; - } else if (err) { + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + ASSERT(err != ENOENT); + if (err != 0) return (err); - } if (first_obj == 0) first_obj = ds->ds_dir->dd_phys->dd_origin_obj; - snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap = kmem_alloc(sizeof (*snap), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); obj = ds->ds_phys->ds_prev_snap_obj; @@ -3009,209 +2152,209 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) } static void -snaplist_destroy(list_t *l, boolean_t own) +snaplist_destroy(list_t *l, void *tag) { struct promotenode *snap; - if (!l || !list_link_active(&l->list_head)) + if (l == NULL || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { list_remove(l, snap); - if (own) - dsl_dataset_disown(snap->ds, snaplist_tag); - else - dsl_dataset_rele(snap->ds, snaplist_tag); - kmem_free(snap, sizeof (struct promotenode)); + dsl_dataset_rele(snap->ds, tag); + kmem_free(snap, sizeof (*snap)); } list_destroy(l); } -/* - * Promote a clone. 
Nomenclature note: - * "clone" or "cds": the original clone which is being promoted - * "origin" or "ods": the snapshot which is originally clone's origin - * "origin head" or "ohds": the dataset which is the head - * (filesystem/volume) for the origin - * "origin origin": the origin of the origin's filesystem (typically - * NULL, indicating that the clone is not a clone of a clone). - */ -int -dsl_dataset_promote(const char *name, char *conflsnap) +static int +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) { - dsl_dataset_t *ds; + int error; dsl_dir_t *dd; - dsl_pool_t *dp; - dmu_object_info_t doi; - struct promotearg pa; struct promotenode *snap; - int err; - bzero(&pa, sizeof(struct promotearg)); - err = dsl_dataset_hold(name, FTAG, &ds); - if (err) - return (err); - dd = ds->ds_dir; - dp = dd->dd_pool; - - err = dmu_object_info(dp->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, &doi); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } + error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, + &ddpa->ddpa_clone); + if (error != 0) + return (error); + dd = ddpa->ddpa_clone->ds_dir; - if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { - dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || + !dsl_dir_is_clone(dd)) { + dsl_dataset_rele(ddpa->ddpa_clone, tag); return (EINVAL); } - /* - * We are going to inherit all the snapshots taken before our - * origin (i.e., our new origin will be our parent's origin). - * Take ownership of them so that we can rename them into our - * namespace. 
- */ - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, - &pa.shared_snaps); - if (err != 0) + error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj, + &ddpa->shared_snaps, tag); + if (error != 0) goto out; - err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); - if (err != 0) + error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, + &ddpa->clone_snaps, tag); + if (error != 0) goto out; - snap = list_head(&pa.shared_snaps); + snap = list_head(&ddpa->shared_snaps); ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); - err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, - snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); - if (err != 0) + error = snaplist_make(dp, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, + &ddpa->origin_snaps, tag); + if (error != 0) goto out; if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { - err = dsl_dataset_hold_obj(dp, + error = dsl_dataset_hold_obj(dp, snap->ds->ds_dir->dd_phys->dd_origin_obj, - FTAG, &pa.origin_origin); - if (err != 0) + tag, &ddpa->origin_origin); + if (error != 0) goto out; } - out: - rw_exit(&dp->dp_config_rwlock); + if (error != 0) + promote_rele(ddpa, tag); + return (error); +} + +static void +promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) +{ + snaplist_destroy(&ddpa->shared_snaps, tag); + snaplist_destroy(&ddpa->clone_snaps, tag); + snaplist_destroy(&ddpa->origin_snaps, tag); + if (ddpa->origin_origin != NULL) + dsl_dataset_rele(ddpa->origin_origin, tag); + dsl_dataset_rele(ddpa->ddpa_clone, tag); +} + +/* + * Promote a clone. + * + * If it fails due to a conflicting snapshot name, "conflsnap" will be filled + * in with the name. (It must be at least MAXNAMELEN bytes long.) 
+ */ +int +dsl_dataset_promote(const char *name, char *conflsnap) +{ + dsl_dataset_promote_arg_t ddpa = { 0 }; + uint64_t numsnaps; + int error; + objset_t *os; /* - * Add in 128x the snapnames zapobj size, since we will be moving - * a bunch of snapnames to the promoted ds, and dirtying their - * bonus buffers. + * We will modify space proportional to the number of + * snapshots. Compute numsnaps. */ - if (err == 0) { - err = dsl_sync_task_do(dp, dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, - 2 + 2 * doi.doi_physical_blocks_512); - if (err && pa.err_ds && conflsnap) - (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); - } + error = dmu_objset_hold(name, FTAG, &os); + if (error != 0) + return (error); + error = zap_count(dmu_objset_pool(os)->dp_meta_objset, + dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps); + dmu_objset_rele(os, FTAG); + if (error != 0) + return (error); - snaplist_destroy(&pa.shared_snaps, B_TRUE); - snaplist_destroy(&pa.clone_snaps, B_FALSE); - snaplist_destroy(&pa.origin_snaps, B_FALSE); - if (pa.origin_origin) - dsl_dataset_rele(pa.origin_origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); -} + ddpa.ddpa_clonename = name; + ddpa.err_ds = conflsnap; -struct cloneswaparg { - dsl_dataset_t *cds; /* clone dataset */ - dsl_dataset_t *ohds; /* origin's head dataset */ - boolean_t force; - int64_t unused_refres_delta; /* change in unconsumed refreservation */ -}; + return (dsl_sync_task(name, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, 2 + numsnaps)); +} -/* ARGSUSED */ -static int -dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) +int +dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force) { - struct cloneswaparg *csa = arg1; + int64_t unused_refres_delta; /* they should both be heads */ - if (dsl_dataset_is_snapshot(csa->cds) || - dsl_dataset_is_snapshot(csa->ohds)) + if (dsl_dataset_is_snapshot(clone) || + 
dsl_dataset_is_snapshot(origin_head)) return (EINVAL); /* the branch point should be just before them */ - if (csa->cds->ds_prev != csa->ohds->ds_prev) + if (clone->ds_prev != origin_head->ds_prev) return (EINVAL); - /* cds should be the clone (unless they are unrelated) */ - if (csa->cds->ds_prev != NULL && - csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && - csa->ohds->ds_object != - csa->cds->ds_prev->ds_phys->ds_next_snap_obj) + /* clone should be the clone (unless they are unrelated) */ + if (clone->ds_prev != NULL && + clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && + origin_head->ds_object != + clone->ds_prev->ds_phys->ds_next_snap_obj) return (EINVAL); /* the clone should be a child of the origin */ - if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) + if (clone->ds_dir->dd_parent != origin_head->ds_dir) return (EINVAL); - /* ohds shouldn't be modified unless 'force' */ - if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) + /* origin_head shouldn't be modified unless 'force' */ + if (!force && dsl_dataset_modified_since_lastsnap(origin_head)) return (ETXTBSY); - /* adjust amount of any unconsumed refreservation */ - csa->unused_refres_delta = - (int64_t)MIN(csa->ohds->ds_reserved, - csa->ohds->ds_phys->ds_unique_bytes) - - (int64_t)MIN(csa->ohds->ds_reserved, - csa->cds->ds_phys->ds_unique_bytes); + /* origin_head should have no long holds (e.g. 
is not mounted) */ + if (dsl_dataset_long_held(origin_head)) + return (EBUSY); - if (csa->unused_refres_delta > 0 && - csa->unused_refres_delta > - dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) + /* check amount of any unconsumed refreservation */ + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + origin_head->ds_phys->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + clone->ds_phys->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) return (ENOSPC); - if (csa->ohds->ds_quota != 0 && - csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) + /* clone can't be over the head's refquota */ + if (origin_head->ds_quota != 0 && + clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota) return (EDQUOT); return (0); } -/* ARGSUSED */ -static void -dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) +void +dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx) { - struct cloneswaparg *csa = arg1; - dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; + dsl_pool_t *dp = dmu_tx_pool(tx); + int64_t unused_refres_delta; - ASSERT(csa->cds->ds_reserved == 0); - ASSERT(csa->ohds->ds_quota == 0 || - csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); + ASSERT(clone->ds_reserved == 0); + ASSERT(origin_head->ds_quota == 0 || + clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota); - dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); - dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); + dmu_buf_will_dirty(clone->ds_dbuf, tx); + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - if (csa->cds->ds_objset != NULL) { - dmu_objset_evict(csa->cds->ds_objset); - csa->cds->ds_objset = NULL; + if (clone->ds_objset != NULL) { + dmu_objset_evict(clone->ds_objset); + clone->ds_objset = NULL; } - if (csa->ohds->ds_objset != NULL) { - dmu_objset_evict(csa->ohds->ds_objset); - csa->ohds->ds_objset = NULL; + if 
(origin_head->ds_objset != NULL) { + dmu_objset_evict(origin_head->ds_objset); + origin_head->ds_objset = NULL; } + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + origin_head->ds_phys->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + clone->ds_phys->ds_unique_bytes); + /* * Reset origin's unique bytes, if it exists. */ - if (csa->cds->ds_prev) { - dsl_dataset_t *origin = csa->cds->ds_prev; + if (clone->ds_prev) { + dsl_dataset_t *origin = clone->ds_prev; uint64_t comp, uncomp; dmu_buf_will_dirty(origin->ds_dbuf, tx); - dsl_deadlist_space_range(&csa->cds->ds_deadlist, + dsl_deadlist_space_range(&clone->ds_deadlist, origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); } @@ -3219,9 +2362,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) /* swap blkptrs */ { blkptr_t tmp; - tmp = csa->ohds->ds_phys->ds_bp; - csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; - csa->cds->ds_phys->ds_bp = tmp; + tmp = origin_head->ds_phys->ds_bp; + origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp; + clone->ds_phys->ds_bp = tmp; } /* set dd_*_bytes */ @@ -3230,25 +2373,25 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) uint64_t cdl_used, cdl_comp, cdl_uncomp; uint64_t odl_used, odl_comp, odl_uncomp; - ASSERT3U(csa->cds->ds_dir->dd_phys-> + ASSERT3U(clone->ds_dir->dd_phys-> dd_used_breakdown[DD_USED_SNAP], ==, 0); - dsl_deadlist_space(&csa->cds->ds_deadlist, + dsl_deadlist_space(&clone->ds_deadlist, &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space(&csa->ohds->ds_deadlist, + dsl_deadlist_space(&origin_head->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); - dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - - (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); - dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); - duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + + dused = 
clone->ds_phys->ds_referenced_bytes + cdl_used - + (origin_head->ds_phys->ds_referenced_bytes + odl_used); + dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp - + (origin_head->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = clone->ds_phys->ds_uncompressed_bytes + cdl_uncomp - - (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); + (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp); - dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); - dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, + dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, -dused, -dcomp, -duncomp, tx); /* @@ -3257,86 +2400,46 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) * deadlist (since that's the only thing that's * changing that affects the snapused). */ - dsl_deadlist_space_range(&csa->cds->ds_deadlist, - csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, + dsl_deadlist_space_range(&clone->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space_range(&csa->ohds->ds_deadlist, - csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, + dsl_deadlist_space_range(&origin_head->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used, &odl_comp, &odl_uncomp); - dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, + dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, - csa->cds->ds_phys->ds_referenced_bytes); - SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, - csa->cds->ds_phys->ds_compressed_bytes); - SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, - csa->cds->ds_phys->ds_uncompressed_bytes); - SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, - csa->cds->ds_phys->ds_unique_bytes); + SWITCH64(origin_head->ds_phys->ds_referenced_bytes, + 
clone->ds_phys->ds_referenced_bytes); + SWITCH64(origin_head->ds_phys->ds_compressed_bytes, + clone->ds_phys->ds_compressed_bytes); + SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes, + clone->ds_phys->ds_uncompressed_bytes); + SWITCH64(origin_head->ds_phys->ds_unique_bytes, + clone->ds_phys->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ - dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, - csa->unused_refres_delta, 0, 0, tx); + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, + unused_refres_delta, 0, 0, tx); /* * Swap deadlists. */ - dsl_deadlist_close(&csa->cds->ds_deadlist); - dsl_deadlist_close(&csa->ohds->ds_deadlist); - SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, - csa->cds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, - csa->cds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, - csa->ohds->ds_phys->ds_deadlist_obj); - - dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); - - spa_history_log_internal_ds(csa->cds, "clone swap", tx, - "parent=%s", csa->ohds->ds_dir->dd_myname); -} + dsl_deadlist_close(&clone->ds_deadlist); + dsl_deadlist_close(&origin_head->ds_deadlist); + SWITCH64(origin_head->ds_phys->ds_deadlist_obj, + clone->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, + clone->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, + origin_head->ds_phys->ds_deadlist_obj); -/* - * Swap 'clone' with its origin head datasets. Used at the end of "zfs - * recv" into an existing fs to swizzle the file system to the new - * version, and by "zfs rollback". Can also be used to swap two - * independent head datasets if neither has any snapshots. 
- */ -int -dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, - boolean_t force) -{ - struct cloneswaparg csa; - int error; + dsl_scan_ds_clone_swapped(origin_head, clone, tx); - ASSERT(clone->ds_owner); - ASSERT(origin_head->ds_owner); -retry: - /* - * Need exclusive access for the swap. If we're swapping these - * datasets back after an error, we already hold the locks. - */ - if (!RW_WRITE_HELD(&clone->ds_rwlock)) - rw_enter(&clone->ds_rwlock, RW_WRITER); - if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && - !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { - rw_exit(&clone->ds_rwlock); - rw_enter(&origin_head->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { - rw_exit(&origin_head->ds_rwlock); - goto retry; - } - } - csa.cds = clone; - csa.ohds = origin_head; - csa.force = force; - error = dsl_sync_task_do(clone->ds_dir->dd_pool, - dsl_dataset_clone_swap_check, - dsl_dataset_clone_swap_sync, &csa, NULL, 9); - return (error); + spa_history_log_internal_ds(clone, "clone swap", tx, + "parent=%s", origin_head->ds_dir->dd_myname); } /* @@ -3346,21 +2449,20 @@ retry: int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { - spa_t *spa; dsl_pool_t *dp; dsl_dataset_t *ds; int error; - if ((error = spa_open(pname, &spa, FTAG)) != 0) + error = dsl_pool_hold(pname, FTAG, &dp); + if (error != 0) return (error); - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + if (error == 0) { dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -3407,113 +2509,140 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, error = ERESTART; else error = EDQUOT; - - DMU_TX_STAT_BUMP(dmu_tx_quota); } mutex_exit(&ds->ds_lock); return (error); } +typedef struct 
dsl_dataset_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dataset_set_qr_arg_t; + + /* ARGSUSED */ static int -dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - int err; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t newval; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) + if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) return (ENOTSUP); - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } - if (psa->psa_effective_value == 0) + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); return (0); + } - if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || - psa->psa_effective_value < ds->ds_reserved) + if (newval < ds->ds_phys->ds_referenced_bytes || + newval < ds->ds_reserved) { + dsl_dataset_rele(ds, FTAG); return (ENOSPC); + } + dsl_dataset_rele(ds, FTAG); return (0); } -extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); - -void -dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) +static void +dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - dsl_prop_set_sync(ds, psa, tx); - 
DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - if (ds->ds_quota != effective_value) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_quota = effective_value; + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - spa_history_log_internal_ds(ds, "set refquota", tx, - "refquota=%lld", (longlong_t)ds->ds_quota); + if (ds->ds_quota != newval) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = newval; } + dsl_dataset_rele(ds, FTAG); } int -dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) +dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t refquota) { - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota); + dsl_dataset_set_qr_arg_t ddsqra; - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect.
- */ - txg_wait_open(ds->ds_dir->dd_pool, 0); - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, - ds, &psa, 0); + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refquota; - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, + dsl_dataset_set_refquota_sync, &ddsqra, 0)); } static int -dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value; - uint64_t unique; - int err; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t newval, unique; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_REFRESERVATION) + if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) return (ENOTSUP); - if (dsl_dataset_is_snapshot(ds)) - return (EINVAL); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } - effective_value = psa->psa_effective_value; + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. 
*/ - if (!dmu_tx_is_syncing(tx)) + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); return (0); + } mutex_enter(&ds->ds_lock); if (!DS_UNIQUE_IS_ACCURATE(ds)) @@ -3521,638 +2650,76 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) unique = ds->ds_phys->ds_unique_bytes; mutex_exit(&ds->ds_lock); - if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { - uint64_t delta = MAX(unique, effective_value) - + if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, newval) - MAX(unique, ds->ds_reserved); - if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (ds->ds_quota > 0 && - effective_value > ds->ds_quota) + if (delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || + (ds->ds_quota > 0 && newval > ds->ds_quota)) { + dsl_dataset_rele(ds, FTAG); return (ENOSPC); + } } + dsl_dataset_rele(ds, FTAG); return (0); } -static void -dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) +void +dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; + uint64_t newval; uint64_t unique; int64_t delta; - dsl_prop_set_sync(ds, psa, tx); - DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + source, sizeof (value), 1, &value, tx); - dmu_buf_will_dirty(ds->ds_dbuf, tx); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); unique = ds->ds_phys->ds_unique_bytes; - delta = MAX(0, (int64_t)(effective_value - unique)) - + delta = MAX(0, (int64_t)(newval - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = 
effective_value; + ds->ds_reserved = newval; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - - spa_history_log_internal_ds(ds, "set refreservation", tx, - "refreservation=%lld", (longlong_t)effective_value); -} - -int -dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, - uint64_t reservation) -{ - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "refreservation", source, - &reservation); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_reservation_check, - dsl_dataset_set_reservation_sync, ds, &psa, 0); - - dsl_dataset_rele(ds, FTAG); - return (err); -} - -typedef struct zfs_hold_cleanup_arg { - dsl_pool_t *dp; - uint64_t dsobj; - char htag[MAXNAMELEN]; -} zfs_hold_cleanup_arg_t; - -static void -dsl_dataset_user_release_onexit(void *arg) -{ - zfs_hold_cleanup_arg_t *ca = arg; - - (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, - B_TRUE); - kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); -} - -void -dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, - minor_t minor) -{ - zfs_hold_cleanup_arg_t *ca; - - ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); - ca->dp = ds->ds_dir->dd_pool; - ca->dsobj = ds->ds_object; - (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); - VERIFY3U(0, ==, zfs_onexit_add_cb(minor, - dsl_dataset_user_release_onexit, ca, NULL)); -} - -/* - * If you add new checks here, you may need to add - * additional checks to the "temporary" case in - * snapshot_check() in dmu_objset.c. 
- */ -static int -dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct dsl_ds_holdarg *ha = arg2; - const char *htag = ha->htag; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int error = 0; - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); - - if (!dsl_dataset_is_snapshot(ds)) - return (EINVAL); - - /* tags must be unique */ - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj) { - error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, - 8, 1, tx); - if (error == 0) - error = EEXIST; - else if (error == ENOENT) - error = 0; - } - mutex_exit(&ds->ds_lock); - - if (error == 0 && ha->temphold && - strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - error = E2BIG; - - return (error); -} - -void -dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct dsl_ds_holdarg *ha = arg2; - const char *htag = ha->htag; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t now = gethrestime_sec(); - uint64_t zapobj; - - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj == 0) { - /* - * This is the first user hold for this dataset. Create - * the userrefs zap object. 
- */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - zapobj = ds->ds_phys->ds_userrefs_obj = - zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); - } else { - zapobj = ds->ds_phys->ds_userrefs_obj; - } - ds->ds_userrefs++; - mutex_exit(&ds->ds_lock); - - VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); - - if (ha->temphold) { - VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, - htag, &now, tx)); - } - - spa_history_log_internal_ds(ds, "hold", tx, - "tag = %s temp = %d holds now = %llu", - htag, (int)ha->temphold, ds->ds_userrefs); -} - -static int -dsl_dataset_user_hold_one(const char *dsname, void *arg) -{ - struct dsl_ds_holdarg *ha = arg; - dsl_dataset_t *ds; - int error; - char *name; - - /* alloc a buffer to hold dsname@snapname plus terminating NULL */ - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = dsl_dataset_hold(name, ha->dstg, &ds); - strfree(name); - if (error == 0) { - ha->gotone = B_TRUE; - dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, - dsl_dataset_user_hold_sync, ds, ha, 0); - } else if (error == ENOENT && ha->recursive) { - error = 0; - } else { - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - } - return (error); -} - -int -dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, - boolean_t temphold) -{ - struct dsl_ds_holdarg *ha; - int error; - - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); - ha->htag = htag; - ha->temphold = temphold; - error = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, - ds, ha, 0); - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - - return (error); -} - -int -dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold, int cleanup_fd) -{ - struct dsl_ds_holdarg *ha; - dsl_sync_task_t *dst; - spa_t *spa; - int error; - minor_t minor = 0; - - if (cleanup_fd != -1) { - /* Currently we only support cleanup-on-exit of tempholds. 
*/ - if (!temphold) - return (EINVAL); - error = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (error) - return (error); - } - - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); - - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - - error = spa_open(dsname, &spa, FTAG); - if (error) { - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - if (cleanup_fd != -1) - zfs_onexit_fd_rele(cleanup_fd); - return (error); - } - - ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - ha->htag = htag; - ha->snapname = snapname; - ha->recursive = recursive; - ha->temphold = temphold; - - if (recursive) { - error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, - ha, DS_FIND_CHILDREN); - } else { - error = dsl_dataset_user_hold_one(dsname, ha); - } - if (error == 0) - error = dsl_sync_task_group_wait(ha->dstg); - - for (dst = list_head(&ha->dstg->dstg_tasks); dst; - dst = list_next(&ha->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - - if (dst->dst_err) { - dsl_dataset_name(ds, ha->failed); - *strchr(ha->failed, '@') = '\0'; - } else if (error == 0 && minor != 0 && temphold) { - /* - * If this hold is to be released upon process exit, - * register that action now. - */ - dsl_register_onexit_hold_cleanup(ds, htag, minor); - } - dsl_dataset_rele(ds, ha->dstg); - } - - if (error == 0 && recursive && !ha->gotone) - error = ENOENT; - - if (error) - (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); - - dsl_sync_task_group_destroy(ha->dstg); - - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - spa_close(spa, FTAG); - if (cleanup_fd != -1) - zfs_onexit_fd_rele(cleanup_fd); - return (error); -} - -struct dsl_ds_releasearg { - dsl_dataset_t *ds; - const char *htag; - boolean_t own; /* do we own or just hold ds? 
*/ -}; - -static int -dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, - boolean_t *might_destroy) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj; - uint64_t tmp; - int error; - - *might_destroy = B_FALSE; - - mutex_enter(&ds->ds_lock); - zapobj = ds->ds_phys->ds_userrefs_obj; - if (zapobj == 0) { - /* The tag can't possibly exist */ - mutex_exit(&ds->ds_lock); - return (ESRCH); - } - - /* Make sure the tag exists */ - error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); - if (error) { - mutex_exit(&ds->ds_lock); - if (error == ENOENT) - error = ESRCH; - return (error); - } - - if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)) - *might_destroy = B_TRUE; - - mutex_exit(&ds->ds_lock); - return (0); -} - -static int -dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) -{ - struct dsl_ds_releasearg *ra = arg1; - dsl_dataset_t *ds = ra->ds; - boolean_t might_destroy; - int error; - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); - - error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); - if (error) - return (error); - - if (might_destroy) { - struct dsl_ds_destroyarg dsda = {0}; - - if (dmu_tx_is_syncing(tx)) { - /* - * If we're not prepared to remove the snapshot, - * we can't allow the release to happen right now. 
- */ - if (!ra->own) - return (EBUSY); - } - dsda.ds = ds; - dsda.releasing = B_TRUE; - return (dsl_dataset_destroy_check(&dsda, tag, tx)); - } - - return (0); } static void -dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) { - struct dsl_ds_releasearg *ra = arg1; - dsl_dataset_t *ds = ra->ds; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj; - uint64_t refs; - int error; - - mutex_enter(&ds->ds_lock); - ds->ds_userrefs--; - refs = ds->ds_userrefs; - mutex_exit(&ds->ds_lock); - error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); - VERIFY(error == 0 || error == ENOENT); - zapobj = ds->ds_phys->ds_userrefs_obj; - VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); - if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)) { - struct dsl_ds_destroyarg dsda = {0}; - - ASSERT(ra->own); - dsda.ds = ds; - dsda.releasing = B_TRUE; - /* We already did the destroy_check */ - dsl_dataset_destroy_sync(&dsda, tag, tx); - } -} - -static int -dsl_dataset_user_release_one(const char *dsname, void *arg) -{ - struct dsl_ds_holdarg *ha = arg; - struct dsl_ds_releasearg *ra; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; - int error; - void *dtag = ha->dstg; - char *name; - boolean_t own = B_FALSE; - boolean_t might_destroy; - - /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = dsl_dataset_hold(name, dtag, &ds); - strfree(name); - if (error == ENOENT && ha->recursive) - return (0); - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - if (error) - return (error); - - ha->gotone = B_TRUE; - - ASSERT(dsl_dataset_is_snapshot(ds)); - - error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); - if (error) { - dsl_dataset_rele(ds, dtag); - return (error); - } - - 
if (might_destroy) { -#ifdef _KERNEL - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = zfs_unmount_snap(name, NULL); - strfree(name); - if (error) { - dsl_dataset_rele(ds, dtag); - return (error); - } -#endif - if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { - dsl_dataset_rele(ds, dtag); - return (EBUSY); - } else { - own = B_TRUE; - dsl_dataset_make_exclusive(ds, dtag); - } - } - - ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); - ra->ds = ds; - ra->htag = ha->htag; - ra->own = own; - dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, ra, dtag, 0); - - return (0); -} -int -dsl_dataset_user_release(char *dsname, char *snapname, char *htag, - boolean_t recursive) -{ - struct dsl_ds_holdarg *ha; - dsl_sync_task_t *dst; - spa_t *spa; - int error; - -top: - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); - - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - - error = spa_open(dsname, &spa, FTAG); - if (error) { - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - return (error); - } - - ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - ha->htag = htag; - ha->snapname = snapname; - ha->recursive = recursive; - if (recursive) { - error = dmu_objset_find(dsname, dsl_dataset_user_release_one, - ha, DS_FIND_CHILDREN); - } else { - error = dsl_dataset_user_release_one(dsname, ha); - } - if (error == 0) - error = dsl_sync_task_group_wait(ha->dstg); - - for (dst = list_head(&ha->dstg->dstg_tasks); dst; - dst = list_next(&ha->dstg->dstg_tasks, dst)) { - struct dsl_ds_releasearg *ra = dst->dst_arg1; - dsl_dataset_t *ds = ra->ds; - - if (dst->dst_err) - dsl_dataset_name(ds, ha->failed); - - if (ra->own) - dsl_dataset_disown(ds, ha->dstg); - else - dsl_dataset_rele(ds, ha->dstg); - - kmem_free(ra, sizeof (struct dsl_ds_releasearg)); - } - - if (error == 0 && recursive && !ha->gotone) - error = ENOENT; - - if (error && error != EBUSY) - (void) strlcpy(dsname, ha->failed, sizeof 
(ha->failed)); - - dsl_sync_task_group_destroy(ha->dstg); - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - spa_close(spa, FTAG); - - /* - * We can get EBUSY if we were racing with deferred destroy and - * dsl_dataset_user_release_check() hadn't done the necessary - * open context setup. We can also get EBUSY if we're racing - * with destroy and that thread is the ds_owner. Either way - * the busy condition should be transient, and we should retry - * the release operation. - */ - if (error == EBUSY) - goto top; - - return (error); -} - -/* - * Called at spa_load time (with retry == B_FALSE) to release a stale - * temporary user hold. Also called by the onexit code (with retry == B_TRUE). - */ -int -dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, - boolean_t retry) -{ - dsl_dataset_t *ds; - char *snap; - char *name; - int namelen; - int error; - - do { - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) - return (error); - namelen = dsl_dataset_namelen(ds)+1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(ds, name); - dsl_dataset_rele(ds, FTAG); - - snap = strchr(name, '@'); - *snap = '\0'; - ++snap; - error = dsl_dataset_user_release(name, snap, htag, B_FALSE); - kmem_free(name, namelen); - - /* - * The object can't have been destroyed because we have a hold, - * but it might have been renamed, resulting in ENOENT. Retry - * if we've been requested to do so. - * - * It would be nice if we could use the dsobj all the way - * through and avoid ENOENT entirely. But we might need to - * unmount the snapshot, and there's currently no way to lookup - * a vfsp using a ZFS object id. 
- */ - } while ((error == ENOENT) && retry); - - return (error); -} - -int -dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) -{ - dsl_dataset_t *ds; - int err; - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); - if (ds->ds_phys->ds_userrefs_obj != 0) { - zap_attribute_t *za; - zap_cursor_t zc; - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_userrefs_obj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, - za->za_first_integer)); - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); - } + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + dsl_dataset_set_refreservation_sync_impl(ds, + ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); dsl_dataset_rele(ds, FTAG); - return (0); } -/* - * Note, this function is used as the callback for dmu_objset_find(). We - * always return 0 so that we will continue to find and process - * inconsistent datasets, even if we encounter an error trying to - * process one of them. 
- */ -/* ARGSUSED */ int -dsl_destroy_inconsistent(const char *dsname, void *arg) +dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t refreservation) { - dsl_dataset_t *ds; + dsl_dataset_set_qr_arg_t ddsqra; - if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { - if (DS_IS_INCONSISTENT(ds)) - (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); - else - dsl_dataset_disown(ds, FTAG); - } - return (0); -} + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refreservation; + return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, + dsl_dataset_set_refreservation_sync, &ddsqra, 0)); +} /* * Return (in *usedp) the amount of space written in new that is not @@ -4179,6 +2746,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t snapobj; dsl_pool_t *dp = new->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + *usedp = 0; *usedp += new->ds_phys->ds_referenced_bytes; *usedp -= oldsnap->ds_phys->ds_referenced_bytes; @@ -4191,7 +2760,6 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, *uncompp += new->ds_phys->ds_uncompressed_bytes; *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; - rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = new->ds_object; while (snapobj != oldsnap->ds_object) { dsl_dataset_t *snap; @@ -4240,7 +2808,6 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, } } - rw_exit(&dp->dp_config_rwlock); return (err); } @@ -4282,7 +2849,6 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, *usedp = *compp = *uncompp = 0; - rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = lastsnap->ds_phys->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; @@ -4303,12 +2869,47 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } - rw_exit(&dp->dp_config_rwlock); return (err); } +/* + * Return TRUE if 'earlier' is an earlier 
snapshot in 'later's timeline. + * For example, they could both be snapshots of the same filesystem, and + * 'earlier' is before 'later'. Or 'earlier' could be the origin of + * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's + * filesystem. Or 'earlier' could be the origin's origin. + */ +boolean_t +dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier) +{ + dsl_pool_t *dp = later->ds_dir->dd_pool; + int error; + boolean_t ret; + dsl_dataset_t *origin; + + ASSERT(dsl_pool_config_held(dp)); + + if (earlier->ds_phys->ds_creation_txg >= + later->ds_phys->ds_creation_txg) + return (B_FALSE); + + if (later->ds_dir == earlier->ds_dir) + return (B_TRUE); + if (!dsl_dir_is_clone(later->ds_dir)) + return (B_FALSE); + + if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) + return (B_TRUE); + error = dsl_dataset_hold_obj(dp, + later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); + if (error != 0) + return (B_FALSE); + ret = dsl_dataset_is_before(origin, earlier); + dsl_dataset_rele(origin, FTAG); + return (ret); +} + #if defined(_KERNEL) && defined(HAVE_SPL) -EXPORT_SYMBOL(dmu_snapshots_destroy_nvl); EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_obj); EXPORT_SYMBOL(dsl_dataset_own); @@ -4316,22 +2917,14 @@ EXPORT_SYMBOL(dsl_dataset_own_obj); EXPORT_SYMBOL(dsl_dataset_name); EXPORT_SYMBOL(dsl_dataset_rele); EXPORT_SYMBOL(dsl_dataset_disown); -EXPORT_SYMBOL(dsl_dataset_drop_ref); EXPORT_SYMBOL(dsl_dataset_tryown); -EXPORT_SYMBOL(dsl_dataset_make_exclusive); EXPORT_SYMBOL(dsl_dataset_create_sync); EXPORT_SYMBOL(dsl_dataset_create_sync_dd); -EXPORT_SYMBOL(dsl_dataset_destroy); -EXPORT_SYMBOL(dsl_dataset_destroy_check); -EXPORT_SYMBOL(dsl_dataset_destroy_sync); EXPORT_SYMBOL(dsl_dataset_snapshot_check); EXPORT_SYMBOL(dsl_dataset_snapshot_sync); -EXPORT_SYMBOL(dsl_dataset_rename); EXPORT_SYMBOL(dsl_dataset_promote); -EXPORT_SYMBOL(dsl_dataset_clone_swap); EXPORT_SYMBOL(dsl_dataset_user_hold); 
EXPORT_SYMBOL(dsl_dataset_user_release); -EXPORT_SYMBOL(dsl_dataset_user_release_tmp); EXPORT_SYMBOL(dsl_dataset_get_holds); EXPORT_SYMBOL(dsl_dataset_get_blkptr); EXPORT_SYMBOL(dsl_dataset_set_blkptr); @@ -4351,8 +2944,6 @@ EXPORT_SYMBOL(dsl_dataset_space); EXPORT_SYMBOL(dsl_dataset_fsid_guid); EXPORT_SYMBOL(dsl_dsobj_to_dsname); EXPORT_SYMBOL(dsl_dataset_check_quota); -EXPORT_SYMBOL(dsl_dataset_set_quota); -EXPORT_SYMBOL(dsl_dataset_set_quota_sync); -EXPORT_SYMBOL(dsl_dataset_set_reservation); -EXPORT_SYMBOL(dsl_destroy_inconsistent); +EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); +EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); #endif diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index 48c261e63..d09e79f1c 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -147,28 +147,37 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) return (0); } +typedef struct dsl_deleg_arg { + const char *dda_name; + nvlist_t *dda_nvlist; +} dsl_deleg_arg_t; + static void -dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - nvlist_t *nvp = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj; + + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dd->dd_phys->dd_deleg_zapobj; if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } - while ((whopair = nvlist_next_nvpair(nvp, whopair))) { + while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; uint64_t jumpobj; - VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + perms = 
fnvpair_value_nvlist(whopair); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, @@ -185,21 +194,27 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) "%s %s", whokey, perm); } } + dsl_dir_rele(dd, FTAG); } static void -dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - nvlist_t *nvp = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj; - if (zapobj == 0) + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dd->dd_phys->dd_deleg_zapobj; + if (zapobj == 0) { + dsl_dir_rele(dd, FTAG); return; + } - while ((whopair = nvlist_next_nvpair(nvp, whopair))) { + while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; @@ -234,35 +249,40 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) "%s %s", whokey, perm); } } + dsl_dir_rele(dd, FTAG); } -int -dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +static int +dsl_deleg_check(void *arg, dmu_tx_t *tx) { + dsl_deleg_arg_t *dda = arg; dsl_dir_t *dd; int error; - nvpair_t *whopair = NULL; - int blocks_modified = 0; - error = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (error) - return (error); - - if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + if (spa_version(dmu_tx_pool(tx)->dp_spa) < SPA_VERSION_DELEGATED_PERMS) { - dsl_dir_close(dd, FTAG); return (ENOTSUP); } - while ((whopair = nvlist_next_nvpair(nvp, whopair))) - blocks_modified++; + error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); + if (error == 0) + dsl_dir_rele(dd, FTAG); + return (error); +} - error = dsl_sync_task_do(dd->dd_pool, 
NULL, - unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, - dd, nvp, blocks_modified); - dsl_dir_close(dd, FTAG); +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_deleg_arg_t dda; - return (error); + /* nvp must already have been verified to be valid */ + + dda.dda_name = ddname; + dda.dda_nvlist = nvp; + + return (dsl_sync_task(ddname, dsl_deleg_check, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + &dda, fnvlist_num_pairs(nvp))); } /* @@ -293,9 +313,15 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) zap_attribute_t *baseza, *za; char *source; - error = dsl_dir_open(ddname, FTAG, &startdd, NULL); - if (error) + error = dsl_pool_hold(ddname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); + } dp = startdd->dd_pool; mos = dp->dp_meta_objset; @@ -307,20 +333,16 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) source = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - rw_enter(&dp->dp_config_rwlock, RW_READER); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { nvlist_t *sp_nvp; uint64_t n; - if (dd->dd_phys->dd_deleg_zapobj && - (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, - &n) == 0) && n) { - VERIFY(nvlist_alloc(&sp_nvp, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - } else { + if (dd->dd_phys->dd_deleg_zapobj == 0 || + zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 || + n == 0) continue; - } + sp_nvp = fnvlist_alloc(); for (zap_cursor_init(basezc, mos, dd->dd_phys->dd_deleg_zapobj); zap_cursor_retrieve(basezc, baseza) == 0; @@ -330,27 +352,23 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) ASSERT(baseza->za_integer_length == 8); ASSERT(baseza->za_num_integers == 1); - VERIFY(nvlist_alloc(&perms_nvp, - NV_UNIQUE_NAME, KM_SLEEP) == 0); + perms_nvp = fnvlist_alloc(); for (zap_cursor_init(zc, mos, baseza->za_first_integer); 
zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { - VERIFY(nvlist_add_boolean(perms_nvp, - za->za_name) == 0); + fnvlist_add_boolean(perms_nvp, za->za_name); } zap_cursor_fini(zc); - VERIFY(nvlist_add_nvlist(sp_nvp, baseza->za_name, - perms_nvp) == 0); - nvlist_free(perms_nvp); + fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp); + fnvlist_free(perms_nvp); } zap_cursor_fini(basezc); dsl_dir_name(dd, source); - VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + fnvlist_add_nvlist(*nvp, source, sp_nvp); nvlist_free(sp_nvp); } - rw_exit(&dp->dp_config_rwlock); kmem_free(source, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); kmem_free(baseza, sizeof(zap_attribute_t)); @@ -358,7 +376,8 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) kmem_free(za, sizeof(zap_attribute_t)); kmem_free(zc, sizeof(zap_cursor_t)); - dsl_dir_close(startdd, FTAG); + dsl_dir_rele(startdd, FTAG); + dsl_pool_rele(dp, FTAG); return (0); } @@ -564,7 +583,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, checkflag = ZFS_DELEG_DESCENDENT) { uint64_t zapobj; @@ -625,7 +644,6 @@ again: } error = EPERM; success: - rw_exit(&dp->dp_config_rwlock); cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) @@ -637,15 +655,19 @@ success: int dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) { + dsl_pool_t *dp; dsl_dataset_t *ds; int error; - error = dsl_dataset_hold(dsname, FTAG, &ds); - if (error) + error = dsl_pool_hold(dsname, FTAG, &dp); + if (error != 0) return (error); - - error = dsl_deleg_access_impl(ds, perm, cr); - dsl_dataset_rele(ds, FTAG); + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + } + 
dsl_pool_rele(dp, FTAG); return (error); } diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c new file mode 100644 index 000000000..1fb3859ac --- /dev/null +++ b/module/zfs/dsl_destroy.c @@ -0,0 +1,940 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dsl_userhold.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_scan.h> +#include <sys/dmu_objset.h> +#include <sys/zap.h> +#include <sys/zfeature.h> +#include <sys/zfs_ioctl.h> +#include <sys/dsl_deleg.h> + +typedef struct dmu_snapshots_destroy_arg { + nvlist_t *dsda_snaps; + nvlist_t *dsda_successful_snaps; + boolean_t dsda_defer; + nvlist_t *dsda_errlist; +} dmu_snapshots_destroy_arg_t; + +/* + * ds must be owned. 
+ */ +static int +dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) +{ + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + if (dsl_dataset_long_held(ds)) + return (EBUSY); + + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + return (0); + } + + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0) + return (EBUSY); + + /* + * Can't delete a branch point. + */ + if (ds->ds_phys->ds_num_children > 1) + return (EEXIST); + + return (0); +} + +static int +dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dmu_snapshots_destroy_arg_t *dsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int error = 0; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) { + dsl_dataset_t *ds; + + error = dsl_dataset_hold(dp, nvpair_name(pair), + FTAG, &ds); + + /* + * If the snapshot does not exist, silently ignore it + * (it's "already destroyed"). 
+ */ + if (error == ENOENT) + continue; + + if (error == 0) { + error = dsl_destroy_snapshot_check_impl(ds, + dsda->dsda_defer); + dsl_dataset_rele(ds, FTAG); + } + + if (error == 0) { + fnvlist_add_boolean(dsda->dsda_successful_snaps, + nvpair_name(pair)); + } else { + fnvlist_add_int32(dsda->dsda_errlist, + nvpair_name(pair), error); + } + } + + pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); + if (pair != NULL) + return (fnvpair_value_int32(pair)); + return (0); +} + +struct process_old_arg { + dsl_dataset_t *ds; + dsl_dataset_t *ds_prev; + boolean_t after_branch_point; + zio_t *pio; + uint64_t used, comp, uncomp; +}; + +static int +process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct process_old_arg *poa = arg; + dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + + if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + if (poa->ds_prev && !poa->after_branch_point && + bp->blk_birth > + poa->ds_prev->ds_phys->ds_prev_snap_txg) { + poa->ds_prev->ds_phys->ds_unique_bytes += + bp_get_dsize_sync(dp->dp_spa, bp); + } + } else { + poa->used += bp_get_dsize_sync(dp->dp_spa, bp); + poa->comp += BP_GET_PSIZE(bp); + poa->uncomp += BP_GET_UCSIZE(bp); + dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + } + return (0); +} + +static void +process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, + dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) +{ + struct process_old_arg poa = { 0 }; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t deadlist_obj; + + ASSERT(ds->ds_deadlist.dl_oldfmt); + ASSERT(ds_next->ds_deadlist.dl_oldfmt); + + poa.ds = ds; + poa.ds_prev = ds_prev; + poa.after_branch_point = after_branch_point; + poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, + process_old_cb, &poa, tx)); + VERIFY0(zio_wait(poa.pio)); + ASSERT3U(poa.used, ==, 
ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -poa.used, -poa.comp, -poa.uncomp, tx); + + /* swap next's deadlist to our deadlist */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_close(&ds_next->ds_deadlist); + deadlist_obj = ds->ds_phys->ds_deadlist_obj; + ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj; + ds_next->ds_phys->ds_deadlist_obj = deadlist_obj; + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj); +} + +static void +dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (ds->ds_dir->dd_phys->dd_clones == 0) + return; + + for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + dsl_dataset_remove_clones_key(clone, mintxg, tx); + } + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); +} + +void +dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) +{ +#ifdef ZFS_DEBUG + int err; +#endif + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj, old_unique, used = 0, comp = 0, uncomp = 0; + dsl_dataset_t *ds_next, *ds_head, *hds; + + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + 
ASSERT(refcount_is_zero(&ds->ds_longholds)); + + if (defer && + (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); + return; + } + + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + ASSERT3P(ds->ds_prev, ==, NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); + after_branch_point = + (ds_prev->ds_phys->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + ds_prev->ds_phys->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY0(zap_add_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, + ds->ds_phys->ds_next_snap_obj, tx)); + } + } + if (!after_branch_point) { + ds_prev->ds_phys->ds_next_snap_obj = + ds->ds_phys->ds_next_snap_obj; + } + } + + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + + old_unique = ds_next->ds_phys->ds_unique_bytes; + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + ds_next->ds_phys->ds_prev_snap_obj = + ds->ds_phys->ds_prev_snap_obj; + ds_next->ds_phys->ds_prev_snap_txg = + ds->ds_phys->ds_prev_snap_txg; + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); + + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. 
*/ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds_prev->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_prev_snap_txg, + &used, &comp, &uncomp); + ds_prev->ds_phys->ds_unique_bytes += used; + } + + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + ds->ds_phys->ds_deadlist_obj, tx); + } + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + /* Collapse range in clone heads */ + dsl_dataset_remove_clones_key(ds, + ds->ds_phys->ds_creation_txg, tx); + + if (dsl_dataset_is_snapshot(ds_next)) { + dsl_dataset_t *ds_nextnext; + + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + */ + VERIFY0(dsl_dataset_hold_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + ds_next->ds_phys->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. 
*/ + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + dsl_deadlist_remove_key(&hds->ds_deadlist, + ds->ds_phys->ds_creation_txg, tx); + dsl_dataset_rele(hds, FTAG); + + } else { + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_rele(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; + if (ds_prev) { + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); + } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* remove from snapshot namespace */ + ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); + VERIFY0(dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); + ASSERT0(err); + ASSERT3U(val, ==, obj); + } +#endif + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx)); + dsl_dataset_rele(ds_head, FTAG); + + if (ds_prev != NULL) + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + if (ds->ds_phys->ds_next_clones_obj != 0) { + ASSERTV(uint64_t count); + ASSERT0(zap_count(mos, + ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + VERIFY0(dmu_object_free(mos, + ds->ds_phys->ds_next_clones_obj, tx)); + } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + VERIFY0(dmu_object_free(mos, obj, tx)); +} + +static void +dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dmu_snapshots_destroy_arg_t *dsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + + dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The semantics of this function are described in the comment above + * lzc_destroy_snaps(). To summarize: + * + * The snapshots must all be in the same pool. 
+ * + * Snapshots that don't exist will be silently ignored (considered to be + * "already deleted"). + * + * On success, all snaps will be destroyed and this will return 0. + * On failure, no snaps will be destroyed, the errlist will be filled in, + * and this will return an errno. + */ +int +dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, + nvlist_t *errlist) +{ + dmu_snapshots_destroy_arg_t dsda; + int error; + nvpair_t *pair; + + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + + dsda.dsda_snaps = snaps; + VERIFY0(nvlist_alloc(&dsda.dsda_successful_snaps, NV_UNIQUE_NAME, KM_PUSHPAGE)); + dsda.dsda_defer = defer; + dsda.dsda_errlist = errlist; + + error = dsl_sync_task(nvpair_name(pair), + dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync, + &dsda, 0); + fnvlist_free(dsda.dsda_successful_snaps); + + return (error); +} + +int +dsl_destroy_snapshot(const char *name, boolean_t defer) +{ + int error; + nvlist_t *nvl; + nvlist_t *errlist; + + VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); + VERIFY0(nvlist_alloc(&errlist, NV_UNIQUE_NAME, KM_PUSHPAGE)); + + fnvlist_add_boolean(nvl, name); + error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); + fnvlist_free(errlist); + fnvlist_free(nvl); + return (error); +} + +struct killarg { + dsl_dataset_t *ds; + dmu_tx_t *tx; +}; + +/* ARGSUSED */ +static int +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; + + if (bp == NULL) + return (0); + + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); + /* + * It's a block in the intent log. It has no + * accounting, so just free it. 
+ */ + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); + } else { + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); + } + + return (0); +} + +static void +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + VERIFY0(traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka)); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); +} + +typedef struct dsl_destroy_head_arg { + const char *ddha_name; +} dsl_destroy_head_arg_t; + +int +dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) +{ + int error; + uint64_t count; + objset_t *mos; + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + if (refcount_count(&ds->ds_longholds) != expected_holds) + return (EBUSY); + + mos = ds->ds_dir->dd_pool->dp_meta_objset; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + return (EBUSY); + + /* + * Can't delete if there are children of this fs. + */ + error = zap_count(mos, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + if (error != 0) + return (error); + if (count != 0) + return (EEXIST); + + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && + ds->ds_prev->ds_phys->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0) { + /* We need to remove the origin snapshot as well. 
*/ + if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) + return (EBUSY); + } + return (0); +} + +static int +dsl_destroy_head_check(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(ds, 0); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) +{ + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dd_used_t t; + + ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); + + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + ASSERT0(dd->dd_phys->dd_head_dataset_obj); + + /* + * Remove our reservation. The impl() routine avoids setting the + * actual property, which would require the (already destroyed) ds. + */ + dsl_dir_set_reservation_sync_impl(dd, 0, tx); + + ASSERT0(dd->dd_phys->dd_used_bytes); + ASSERT0(dd->dd_phys->dd_reserved); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT0(dd->dd_phys->dd_used_breakdown[t]); + + VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); + VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); + VERIFY0(zap_remove(mos, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); + + dsl_dir_rele(dd, FTAG); + VERIFY0(dmu_object_free(mos, ddobj, tx)); +} + +void +dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + uint64_t obj, ddobj, prevobj = 0; + boolean_t rmorigin; + zfeature_info_t *async_destroy; + objset_t *os; + + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); + ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + 
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + rmorigin = (dsl_dir_is_clone(ds->ds_dir) && + DS_IS_DEFER_DESTROY(ds->ds_prev) && + ds->ds_prev->ds_phys->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0); + + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + dsl_dataset_set_refreservation_sync_impl(ds, + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + 0, tx); + ASSERT0(ds->ds_reserved); + } + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + /* This is a clone */ + ASSERT(ds->ds_prev != NULL); + ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj); + ASSERT0(ds->ds_phys->ds_next_snap_obj); + + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + obj, tx); + } + + ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1); + ds->ds_prev->ds_phys->ds_num_children--; + } + + async_destroy = + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + + /* + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty. (If it's a clone, it's + * safe to ignore the deadlist contents.) + */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { + old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. 
+ */ + uint64_t used, comp, uncomp; + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { + spa_feature_incr(dp->dp_spa, async_destroy, tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + } + + used = ds->ds_dir->dd_phys->dd_used_bytes; + comp = ds->ds_dir->dd_phys->dd_compressed_bytes; + uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + ds->ds_phys->ds_unique_bytes == used); + + bptree_add(mos, dp->dp_bptree_obj, + &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + used, comp, uncomp, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } + + if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(mos, + ds->ds_prev->ds_dir->dd_phys->dd_clones, + ds->ds_object, tx)); + } + prevobj = ds->ds_prev->ds_object; + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* Erase the link in the dir */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; + ddobj = ds->ds_dir->dd_object; + ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); + VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx)); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + ASSERT0(ds->ds_phys->ds_next_clones_obj); + ASSERT0(ds->ds_phys->ds_props_obj); + ASSERT0(ds->ds_phys->ds_userrefs_obj); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + VERIFY0(dmu_object_free(mos, obj, tx)); + + dsl_dir_destroy_sync(ddobj, tx); + + if (rmorigin) { + dsl_dataset_t *prev; + VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); + dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); + dsl_dataset_rele(prev, FTAG); + } +} + +static void +dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + dsl_destroy_head_sync_impl(ds, tx); + dsl_dataset_rele(ds, FTAG); +} + +static void +dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + + /* Mark it as inconsistent on-disk, in case we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_log_internal_ds(ds, "destroy begin", tx, ""); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_destroy_head(const char *name) +{ + dsl_destroy_head_arg_t ddha; + int error; + spa_t *spa; + boolean_t isenabled; + +#ifdef _KERNEL + zfs_destroy_unmount_origin(name); +#endif + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + isenabled = spa_feature_is_enabled(spa, + 
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]); + spa_close(spa, FTAG); + + ddha.ddha_name = name; + + if (!isenabled) { + objset_t *os; + + error = dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_begin_sync, &ddha, 0); + if (error != 0) + return (error); + + /* + * Head deletion is processed in one txg on old pools; + * remove the objects from open context so that the txg sync + * is not too long. + */ + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error == 0) { + uint64_t obj; + uint64_t prev_snap_txg = + dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg; + for (obj = 0; error == 0; + error = dmu_object_next(os, &obj, FALSE, + prev_snap_txg)) + (void) dmu_free_object(os, obj); + /* sync out all frees */ + txg_wait_synced(dmu_objset_pool(os), 0); + dmu_objset_disown(os, FTAG); + } + } + + return (dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, 0)); +} + +/* + * Note, this function is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. 
+ */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + objset_t *os; + + if (dmu_objset_hold(dsname, FTAG, &os) == 0) { + boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + dmu_objset_rele(os, FTAG); + if (inconsistent) + (void) dsl_destroy_head(dsname); + } + return (0); +} + + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(dsl_destroy_head); +EXPORT_SYMBOL(dsl_destroy_head_sync_impl); +EXPORT_SYMBOL(dsl_dataset_user_hold_check_one); +EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl); +EXPORT_SYMBOL(dsl_destroy_inconsistent); +EXPORT_SYMBOL(dsl_dataset_user_release_tmp); +EXPORT_SYMBOL(dsl_destroy_head_check_impl); +#endif diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 45c73c363..ccae3f270 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -40,8 +40,6 @@ #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, - uint64_t value, dmu_tx_t *tx); /* ARGSUSED */ static void @@ -58,7 +56,7 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); spa_close(dd->dd_pool->dp_spa, dd); @@ -72,18 +70,17 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } int -dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, +dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); - if (err) + if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG @@ -110,9 +107,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_snap_cmtime_update(dd); if (dd->dd_phys->dd_parent_obj) { - err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, + err = dsl_dir_hold_obj(dp, 
dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); - if (err) + if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG @@ -129,7 +126,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys->dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } - if (err) + if (err != 0) goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); @@ -146,7 +143,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, */ err = dmu_bonus_hold(dp->dp_meta_objset, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); - if (err) + if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = @@ -158,7 +155,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_evict); if (winner) { if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; @@ -185,7 +182,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, errout: if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); @@ -193,7 +190,7 @@ errout: } void -dsl_dir_close(dsl_dir_t *dd, void *tag) +dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); @@ -250,6 +247,7 @@ static int getcomponent(const char *path, char *component, const char **nextp) { char *p; + if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... 
*/ @@ -272,10 +270,10 @@ getcomponent(const char *path, char *component, const char **nextp) (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { - if (p-path >= MAXNAMELEN) + if (p - path >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(component, path, p - path); - component[p-path] = '\0'; + component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* @@ -284,66 +282,57 @@ getcomponent(const char *path, char *component, const char **nextp) */ if (strchr(path, '/')) return (EINVAL); - if (p-path >= MAXNAMELEN) + if (p - path >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(component, path, p - path); - component[p-path] = '\0'; + component[p - path] = '\0'; } else { - ASSERT(!"invalid p"); + panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* - * same as dsl_dir_open, ignore the first component of name and use the - * spa instead + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. The name must be in the specified dsl_pool_t. This + * thread must hold the dp_config_rwlock for the pool. Returns NULL if the + * path is bogus, or if tail==NULL and we couldn't parse the whole name. + * (*tail)[0] == '@' means that the last component is a snapshot. 
*/ int -dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, +dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; - const char *next, *nextnext = NULL; + const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; - dsl_pool_t *dp; uint64_t ddobj; - int openedspa = FALSE; - - dprintf("%s\n", name); buf = kmem_alloc(MAXNAMELEN, KM_PUSHPAGE); err = getcomponent(name, buf, &next); - if (err) + if (err != 0) goto error; - if (spa == NULL) { - err = spa_open(buf, &spa, FTAG); - if (err) { - dprintf("spa_open(%s) failed\n", buf); - goto error; - } - openedspa = TRUE; - /* XXX this assertion belongs in spa_open */ - ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); + /* Make sure the name is in the specified pool. */ + spaname = spa_name(dp->dp_spa); + if (strcmp(buf, spaname) != 0) { + err = EINVAL; + goto error; } - dp = spa_get_dsl(spa); + ASSERT(dsl_pool_config_held(dp)); - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); - if (err) { - rw_exit(&dp->dp_config_rwlock); - if (openedspa) - spa_close(spa, FTAG); + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err != 0) { goto error; } while (next != NULL) { dsl_dir_t *child_ds; err = getcomponent(next, buf, &nextnext); - if (err) + if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') @@ -354,25 +343,22 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, err = zap_lookup(dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); - if (err) { + if (err != 0) { if (err == ENOENT) err = 0; break; } - err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); - if (err) + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); + if (err != 0) break; - dsl_dir_close(dd, tag); + dsl_dir_rele(dd, tag); dd = child_ds; next = nextnext; } - rw_exit(&dp->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, tag); - if (openedspa) - 
spa_close(spa, FTAG); + if (err != 0) { + dsl_dir_rele(dd, tag); goto error; } @@ -383,32 +369,18 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ - dsl_dir_close(dd, tag); + dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = ENOENT; } - if (tailp) + if (tailp != NULL) *tailp = next; - if (openedspa) - spa_close(spa, FTAG); *ddp = dd; error: kmem_free(buf, MAXNAMELEN); return (err); } -/* - * Return the dsl_dir_t, and possibly the last component which couldn't - * be found in *tail. Return NULL if the path is bogus, or if - * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' - * means that the last component is a snapshot. - */ -int -dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) -{ - return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); -} - uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) @@ -446,71 +418,6 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, return (ddobj); } -/* ARGSUSED */ -int -dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t count; - - /* - * There should be exactly two holds, both from - * dsl_dataset_destroy: one on the dd directory, and one on its - * head ds. If there are more holds, then a concurrent thread is - * performing a lookup inside this dir while we're trying to destroy - * it. To minimize this possibility, we perform this check only - * in syncing context and fail the operation if we encounter - * additional holds. The dp_config_rwlock ensures that nobody else - * opens it after we check. 
- */ - if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2) - return (EBUSY); - - err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); - - return (0); -} - -void -dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t obj; - dd_used_t t; - - ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); - ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); - - /* - * Remove our reservation. The impl() routine avoids setting the - * actual property, which would require the (already destroyed) ds. - */ - dsl_dir_set_reservation_sync_impl(dd, 0, tx); - - ASSERT0(dd->dd_phys->dd_used_bytes); - ASSERT0(dd->dd_phys->dd_reserved); - for (t = 0; t < DD_USED_NUM; t++) - ASSERT0(dd->dd_phys->dd_used_breakdown[t]); - - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); - VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); - VERIFY(0 == zap_remove(mos, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); - - obj = dd->dd_object; - dsl_dir_close(dd, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); -} - boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { @@ -546,18 +453,16 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) } mutex_exit(&dd->dd_lock); - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } - rw_exit(&dd->dd_pool->dp_config_rwlock); } void @@ -567,7 +472,7 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) ASSERT(dd->dd_phys); - if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { + if 
(txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } @@ -854,7 +759,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, FALSE, asize > usize, tr_list, tx, TRUE); } - if (err) + if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; @@ -1007,118 +912,123 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, mutex_exit(&dd->dd_lock); } +typedef struct dsl_dir_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dir_set_qr_arg_t; + static int -dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - int err; - uint64_t towrite; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t towrite, newval; - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_prop_predict(ds->ds_dir, "quota", + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } - if (psa->psa_effective_value == 0) + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); return (0); + } - mutex_enter(&dd->dd_lock); + mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. 
*/ - towrite = dsl_dir_space_towrite(dd); + towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (psa->psa_effective_value < dd->dd_phys->dd_reserved || - psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { - err = ENOSPC; + (newval < ds->ds_dir->dd_phys->dd_reserved || + newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) { + error = ENOSPC; } - mutex_exit(&dd->dd_lock); - return (err); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); + return (error); } -extern dsl_syncfunc_t dsl_prop_set_sync; - static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - dsl_prop_set_sync(ds, psa, tx); - DSL_PROP_CHECK_PREDICTION(dd, psa); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = effective_value; - mutex_exit(&dd->dd_lock); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); - spa_history_log_internal_dd(dd, "set quota", tx, - "quota=%lld", (longlong_t)effective_value); + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + ds->ds_dir->dd_phys->dd_quota = newval; + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); - - err = 
dsl_dataset_hold(ddname, FTAG, &ds); - if (err) - return (err); - - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } + dsl_dir_set_qr_arg_t ddsqra; - ASSERT(ds->ds_dir == dd); + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = quota; - /* - * If someone removes a file, then tries to set the quota, we want to - * make sure the file freeing takes effect. - */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, ds, &psa, 0); - - dsl_dir_close(dd, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(ddname, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, &ddsqra, 0)); } int -dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value; - uint64_t used, avail; - int err; - - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + uint64_t newval, used, avail; + int error; - effective_value = psa->psa_effective_value; + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. 
*/ - if (!dmu_tx_is_syncing(tx)) + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); return (0); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; @@ -1131,21 +1041,21 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { - uint64_t delta = MAX(used, effective_value) - + if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, newval) - MAX(used, dd->dd_phys->dd_reserved); - if (delta > avail) - return (ENOSPC); - if (dd->dd_phys->dd_quota > 0 && - effective_value > dd->dd_phys->dd_quota) - return (ENOSPC); + if (delta > avail || + (dd->dd_phys->dd_quota > 0 && + newval > dd->dd_phys->dd_quota)) + error = ENOSPC; } - return (0); + dsl_dataset_rele(ds, FTAG); + return (error); } -static void +void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; @@ -1167,51 +1077,38 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) } static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t value = psa->psa_effective_value; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - dsl_prop_set_sync(ds, psa, tx); - DSL_PROP_CHECK_PREDICTION(dd, psa); - - dsl_dir_set_reservation_sync_impl(dd, value, tx); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), + 
ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - spa_history_log_internal_dd(dd, "set reservation", tx, - "reservation=%lld", (longlong_t)value); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); + + dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); + dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); - - err = dsl_dataset_hold(ddname, FTAG, &ds); - if (err) - return (err); - - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } + dsl_dir_set_qr_arg_t ddsqra; - ASSERT(ds->ds_dir == dd); + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = reservation; - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, ds, &psa, 0); - - dsl_dir_close(dd, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, &ddsqra, 0)); } static dsl_dir_t * @@ -1243,79 +1140,123 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) return (would_change(dd->dd_parent, delta, ancestor)); } -struct renamearg { - dsl_dir_t *newparent; - const char *mynewname; -}; +typedef struct dsl_dir_rename_arg { + const char *ddra_oldname; + const char *ddra_newname; +} dsl_dir_rename_arg_t; +/* ARGSUSED */ static int -dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t val; + int *deltap = arg; + char namebuf[MAXNAMELEN]; - /* - * There should only be 
one reference, from dmu_objset_rename(). - * Fleeting holds are also possible (eg, from "zfs list" getting - * stats), but any that are present in open context will likely - * be gone by syncing context, so only fail from syncing - * context. - */ - if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) - return (EBUSY); + dsl_dataset_name(ds, namebuf); + + if (strlen(namebuf) + *deltap >= MAXNAMELEN) + return (ENAMETOOLONG); + return (0); +} + +static int +dsl_dir_rename_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; + int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); - /* check for existing name */ - err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - ra->mynewname, 8, 1, &val); - if (err == 0) + /* target dir should exist */ + error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); + if (error != 0) + return (error); + + /* new parent should exist */ + error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, + &newparent, &mynewname); + if (error != 0) { + dsl_dir_rele(dd, FTAG); + return (error); + } + + /* can't rename to different pool */ + if (dd->dd_pool != newparent->dd_pool) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (ENXIO); + } + + /* new name should not already exist */ + if (mynewname == NULL) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (EEXIST); - if (err != ENOENT) - return (err); + } + + /* if the name length is growing, validate child name lengths */ + if (delta > 0) { + error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } - if (ra->newparent != dd->dd_parent) { + if (newparent != dd->dd_parent) { /* is there enough space? 
*/ uint64_t myspace = MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ - if (closest_common_ancestor(dd, ra->newparent) == dd) + if (closest_common_ancestor(dd, newparent) == dd) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (EINVAL); + } - if ((err = dsl_dir_transfer_possible(dd->dd_parent, - ra->newparent, myspace))) - return (err); + error = dsl_dir_transfer_possible(dd->dd_parent, + newparent, myspace); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } } + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (0); } static void -dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; objset_t *mos = dp->dp_meta_objset; - int err; - char namebuf[MAXNAMELEN]; - ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, + &mynewname)); /* Log this before we change the name. 
*/ - dsl_dir_name(ra->newparent, namebuf); spa_history_log_internal_dd(dd, "rename", tx, - "-> %s/%s", namebuf, ra->mynewname); + "-> %s", ddra->ddra_newname); - if (ra->newparent != dd->dd_parent) { + if (newparent != dd->dd_parent) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, + dsl_dir_diduse_space(newparent, DD_USED_CHILD, dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); @@ -1326,7 +1267,7 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); - dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } @@ -1334,52 +1275,36 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ - err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx); - ASSERT0(err); + ASSERT0(error); - (void) strcpy(dd->dd_myname, ra->mynewname); - dsl_dir_close(dd->dd_parent, dd); - dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, - ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); + (void) strcpy(dd->dd_myname, mynewname); + dsl_dir_rele(dd->dd_parent, dd); + dd->dd_phys->dd_parent_obj = newparent->dd_object; + VERIFY0(dsl_dir_hold_obj(dp, + newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ - err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, 8, 1, &dd->dd_object, tx); - ASSERT0(err); + VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx)); + + dsl_prop_notify_all(dd); + 
dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); } int -dsl_dir_rename(dsl_dir_t *dd, const char *newname) +dsl_dir_rename(const char *oldname, const char *newname) { - struct renamearg ra; - int err; + dsl_dir_rename_arg_t ddra; - /* new parent should exist */ - err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); - if (err) - return (err); + ddra.ddra_oldname = oldname; + ddra.ddra_newname = newname; - /* can't rename to different pool */ - if (dd->dd_pool != ra.newparent->dd_pool) { - err = ENXIO; - goto out; - } - - /* new name should not already exist */ - if (ra.mynewname == NULL) { - err = EEXIST; - goto out; - } - - err = dsl_sync_task_do(dd->dd_pool, - dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); - -out: - dsl_dir_close(ra.newparent, FTAG); - return (err); + return (dsl_sync_task(oldname, + dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); } int @@ -1424,6 +1349,4 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); -EXPORT_SYMBOL(dsl_dir_open); -EXPORT_SYMBOL(dsl_dir_close); #endif diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 7795d8045..b59e056bf 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -43,6 +43,7 @@ #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> +#include <sys/dsl_userhold.h> int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -264,7 +265,7 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) if (err) return (err); - return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); + return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * @@ -276,7 +277,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; - rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); + 
rrw_init(&dp->dp_config_rwlock, B_TRUE); dp->dp_write_limit = zfs_write_limit_min; txg_init(dp, txg); @@ -287,7 +288,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_dirty_dirs, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, - offsetof(dsl_sync_task_group_t, dstg_node)); + offsetof(dsl_sync_task_t, dst_node)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); @@ -324,14 +325,14 @@ dsl_pool_open(dsl_pool_t *dp) dsl_dataset_t *ds; uint64_t obj; - rw_enter(&dp->dp_config_rwlock, RW_WRITER); + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; @@ -352,7 +353,7 @@ dsl_pool_open(dsl_pool_t *dp) &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } - dsl_dir_close(dd, dp); + dsl_dir_rele(dd, dp); if (err) goto out; } @@ -367,7 +368,7 @@ dsl_pool_open(dsl_pool_t *dp) DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; - VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } @@ -400,7 +401,7 @@ dsl_pool_open(dsl_pool_t *dp) err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: - rw_exit(&dp->dp_config_rwlock); + rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } @@ -415,13 +416,13 @@ dsl_pool_close(dsl_pool_t *dp) * and not a hold, so just drop that here. 
*/ if (dp->dp_origin_snap) - dsl_dataset_drop_ref(dp->dp_origin_snap, dp); + dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) - dsl_dir_close(dp->dp_mos_dir, dp); + dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir) - dsl_dir_close(dp->dp_free_dir, dp); + dsl_dir_rele(dp->dp_free_dir, dp); if (dp->dp_root_dir) - dsl_dir_close(dp->dp_root_dir, dp); + dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); @@ -439,7 +440,7 @@ dsl_pool_close(dsl_pool_t *dp) dsl_scan_fini(dp); dsl_pool_tx_assign_destroy(dp); dsl_pool_txg_history_destroy(dp); - rw_destroy(&dp->dp_config_rwlock); + rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); if (dp->dp_blkstats) @@ -457,6 +458,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) dsl_dataset_t *ds; uint64_t obj; + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); @@ -467,30 +470,30 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) ASSERT0(err); /* Initialize scan structures */ - VERIFY3U(0, ==, dsl_scan_init(dp, txg)); + VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); - VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { /* create and open the free dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* create and 
open the free_bplist */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); - VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } @@ -501,7 +504,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); /* create the root objset */ - VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx))); #ifdef _KERNEL @@ -511,6 +514,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) dmu_tx_commit(tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + return (dp); } @@ -533,10 +538,7 @@ static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; - dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); - rw_enter(&dp->dp_config_rwlock, RW_READER); dsl_deadlist_insert(dl, bp, tx); - rw_exit(&dp->dp_config_rwlock); return (0); } @@ -558,7 +560,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) /* * We need to copy dp_space_towrite() before doing - * dsl_sync_task_group_sync(), because + * dsl_sync_task_sync(), because * dsl_dataset_snapshot_reserve_space() will increase * dp_space_towrite but not actually write anything. */ @@ -673,14 +675,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ DTRACE_PROBE(pool_sync__3task); if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { - dsl_sync_task_group_t *dstg; + dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. 
*/ ASSERT(spa_sync_pass(dp->dp_spa) == 1); - while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) - dsl_sync_task_group_sync(dstg, tx); + while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg))) + dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); @@ -857,14 +859,13 @@ dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) /* ARGSUSED */ static int -upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; - dsl_pool_t *dp = spa_get_dsl(spa); - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); @@ -890,7 +891,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) * The $ORIGIN can't have any data, or the accounting * will be wrong. */ - ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); + ASSERT0(prev->ds_phys->ds_bp.blk_birth); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { @@ -910,13 +911,13 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) if (ds->ds_phys->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); - VERIFY(0 == dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); } } - ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); - ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); + ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); + ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); if (prev->ds_phys->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); @@ -924,7 +925,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY(0 == zap_add_int(dp->dp_meta_objset, + VERIFY0(zap_add_int(dp->dp_meta_objset, 
prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); @@ -939,25 +940,21 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); - VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, tx, DS_FIND_CHILDREN)); } /* ARGSUSED */ static int -upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dmu_tx_t *tx = arg; - dsl_dataset_t *ds; - dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - if (ds->ds_dir->dd_phys->dd_origin_obj) { + if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { dsl_dataset_t *origin; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); if (origin->ds_dir->dd_phys->dd_clones == 0) { @@ -966,13 +963,11 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, - origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } - - dsl_dataset_rele(ds, FTAG); return (0); } @@ -984,7 +979,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* @@ -994,12 +989,11 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); - VERIFY3U(0, 
==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); - VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); - VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); } @@ -1011,17 +1005,16 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); + ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ - rw_enter(&dp->dp_config_rwlock, RW_WRITER); dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); - VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); - VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); - rw_exit(&dp->dp_config_rwlock); } taskq_t * @@ -1056,7 +1049,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) *htag = '\0'; ++htag; dsobj = strtonum(za.za_name, NULL); - (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); + dsl_dataset_user_release_tmp(dp, dsobj, htag); } zap_cursor_fini(&zc); } @@ -1078,7 +1071,7 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) + const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; @@ -1103,7 +1096,7 @@ 
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) - error = zap_add(mos, zapobj, name, 8, 1, now, tx); + error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); strfree(name); @@ -1116,7 +1109,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - uint64_t *now, dmu_tx_t *tx) + uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } @@ -1128,10 +1121,113 @@ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { - return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } +/* + * DSL Pool Configuration Lock + * + * The dp_config_rwlock protects against changes to DSL state (e.g. dataset + * creation / destruction / rename / property setting). It must be held for + * read to hold a dataset or dsl_dir. I.e. you must call + * dsl_pool_config_enter() or dsl_pool_hold() before calling + * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock + * must be held continuously until all datasets and dsl_dirs are released. + * + * The only exception to this rule is that if a "long hold" is placed on + * a dataset, then the dp_config_rwlock may be dropped while the dataset + * is still held. The long hold will prevent the dataset from being + * destroyed -- the destroy will fail with EBUSY. A long hold can be + * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset + * (by calling dsl_{dataset,objset}_{try}own{_obj}). + * + * Legitimate long-holders (including owners) should be long-running, cancelable + * tasks that should cause "zfs destroy" to fail. This includes DMU + * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), + * "zfs send", and "zfs diff". 
There are several other long-holders whose + * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). + * + * The usual formula for long-holding would be: + * dsl_pool_hold() + * dsl_dataset_hold() + * ... perform checks ... + * dsl_dataset_long_hold() + * dsl_pool_rele() + * ... perform long-running task ... + * dsl_dataset_long_rele() + * dsl_dataset_rele() + * + * Note that when the long hold is released, the dataset is still held but + * the pool is not held. The dataset may change arbitrarily during this time + * (e.g. it could be destroyed). Therefore you shouldn't do anything to the + * dataset except release it. + * + * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only + * or modifying operations. + * + * Modifying operations should generally use dsl_sync_task(). The synctask + * infrastructure enforces proper locking strategy with respect to the + * dp_config_rwlock. See the comment above dsl_sync_task() for details. + * + * Read-only operations will manually hold the pool, then the dataset, obtain + * information from the dataset, then release the pool and dataset. + * dmu_objset_{hold,rele}() are convenience routines that also do the pool + * hold/rele. + */ + +int +dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, tag); + if (error == 0) { + *dp = spa_get_dsl(spa); + dsl_pool_config_enter(*dp, tag); + } + return (error); +} + +void +dsl_pool_rele(dsl_pool_t *dp, void *tag) +{ + dsl_pool_config_exit(dp, tag); + spa_close(dp->dp_spa, tag); +} + +void +dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +{ + /* + * We use a "reentrant" reader-writer lock, but not reentrantly. + * + * The rrwlock can (with the track_all flag) track all reading threads, + * which is very useful for debugging which code path failed to release + * the lock, and for verifying that the *current* thread does hold + * the lock. 
+ * + * (Unlike a rwlock, which knows that N threads hold it for + * read, but not *which* threads, so rw_held(RW_READER) returns TRUE + * if any thread holds it for read, even if this thread doesn't). + */ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); +} + +void +dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +{ + rrw_exit(&dp->dp_config_rwlock, tag); +} + +boolean_t +dsl_pool_config_held(dsl_pool_t *dp) +{ + return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); +} + #if defined(_KERNEL) && defined(HAVE_SPL) module_param(zfs_no_write_throttle, int, 0644); MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling"); diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 153420ccf..1d981a7ee 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -82,7 +82,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, char *inheritstr; char *recvdstr; - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + ASSERT(dsl_pool_config_held(dd->dd_pool)); if (setpoint) setpoint[0] = '\0'; @@ -97,8 +97,6 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - if (dd != target || snapshot) { if (!inheritable) break; @@ -167,7 +165,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, boolean_t snapshot; uint64_t zapobj; - ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); zapobj = (ds->ds_phys == NULL ? 
0 : ds->ds_phys->ds_props_obj); @@ -231,22 +229,16 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; - dsl_pool_t *dp = dd->dd_pool; uint64_t value; dsl_prop_cb_record_t *cbr; int err; - int need_rwlock; + ASSERTV(dsl_pool_t *dp = dd->dd_pool); - need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); - if (need_rwlock) - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); - err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); - if (err != 0) { - if (need_rwlock) - rw_exit(&dp->dp_config_rwlock); + err = dsl_prop_get_int_ds(ds, propname, &value); + if (err != 0) return (err); - } cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_PUSHPAGE); cbr->cbr_ds = ds; @@ -259,9 +251,6 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, mutex_exit(&dd->dd_lock); cbr->cbr_func(cbr->cbr_arg, value); - - if (need_rwlock) - rw_exit(&dp->dp_config_rwlock); return (0); } @@ -269,19 +258,18 @@ int dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { - dsl_dataset_t *ds; - int err; + objset_t *os; + int error; - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + error = dsl_prop_get_ds(dmu_objset_ds(os), propname, + intsz, numints, buf, setpoint); - dsl_dataset_rele(ds, FTAG); - return (err); + dmu_objset_rele(os, FTAG); + return (error); } /* @@ -299,17 +287,11 @@ dsl_prop_get_integer(const char *ddname, const char *propname, return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } -void -dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, - zprop_source_t source, uint64_t *value) +int 
+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, + uint64_t *valuep) { - psa->psa_name = propname; - psa->psa_source = source; - psa->psa_intsz = 8; - psa->psa_numints = 1; - psa->psa_value = value; - - psa->psa_effective_value = -1ULL; + return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); } /* @@ -323,11 +305,10 @@ dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, * a property not handled by this function. */ int -dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp) { - const char *propname = psa->psa_name; zfs_prop_t prop = zfs_name_to_prop(propname); - zprop_source_t source = psa->psa_source; objset_t *mos; uint64_t zapobj; uint64_t version; @@ -359,36 +340,33 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) switch ((int)source) { case ZPROP_SRC_NONE: /* Revert to the received value, if any. */ - err = zap_lookup(mos, zapobj, recvdstr, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = 0; + *newvalp = 0; break; case ZPROP_SRC_LOCAL: - psa->psa_effective_value = *(uint64_t *)psa->psa_value; + *newvalp = value; break; case ZPROP_SRC_RECEIVED: /* * If there's no local setting, then the new received value will * be the effective value. */ - err = zap_lookup(mos, zapobj, propname, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = *(uint64_t *)psa->psa_value; + *newvalp = value; break; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * We're clearing the received value, so the local setting (if * it exists) remains the effective value. 
*/ - err = zap_lookup(mos, zapobj, propname, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = 0; + *newvalp = 0; break; default: - cmn_err(CE_PANIC, "unexpected property source: %d", source); + panic("unexpected property source: %d", source); } strfree(recvdstr); @@ -399,39 +377,6 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) return (err); } -#ifdef ZFS_DEBUG -void -dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) -{ - zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); - uint64_t intval; - char setpoint[MAXNAMELEN]; - uint64_t version = spa_version(dd->dd_pool->dp_spa); - int err; - - if (version < SPA_VERSION_RECVD_PROPS) { - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_RESERVATION: - return; - default: - break; - } - } - - err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, - setpoint, B_FALSE); - if (err == 0 && intval != psa->psa_effective_value) { - cmn_err(CE_PANIC, "%s property, source: %x, " - "predicted effective value: %llu, " - "actual effective value: %llu (setpoint: %s)", - psa->psa_name, psa->psa_source, - (unsigned long long)psa->psa_effective_value, - (unsigned long long)intval, setpoint); - } -} -#endif - /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, ENOMSG if no matching callback registered. @@ -466,25 +411,57 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, return (0); } -/* - * Return the number of callbacks that are registered for this dataset. 
- */ -int -dsl_prop_numcb(dsl_dataset_t *ds) +boolean_t +dsl_prop_hascb(dsl_dataset_t *ds) { dsl_dir_t *dd = ds->ds_dir; + boolean_t rv = B_FALSE; dsl_prop_cb_record_t *cbr; - int num = 0; mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds) - num++; + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds) { + rv = B_TRUE; + break; + } } mutex_exit(&dd->dd_lock); + return (rv); +} - return (num); +/* ARGSUSED */ +static int +dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + uint64_t value; + + if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname, + sizeof (value), 1, &value, NULL) == 0) + cbr->cbr_func(cbr->cbr_arg, value); + } + mutex_exit(&dd->dd_lock); + + return (0); +} + +/* + * Update all property values for ddobj & its descendants. This is used + * when renaming the dir. 
+ */ +void +dsl_prop_notify_all(dsl_dir_t *dd) +{ + dsl_pool_t *dp = dd->dd_pool; + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, + NULL, DS_FIND_CHILDREN); } static void @@ -498,8 +475,8 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, zap_attribute_t *za; int err; - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err) return; @@ -510,7 +487,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, */ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); if (err == 0) { - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); return; } ASSERT3U(err, ==, ENOENT); @@ -545,26 +522,24 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, } kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); } void -dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; - char *valstr = NULL; + const char *valstr = NULL; char *inheritstr; char *recvdstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); - const char *propname = psa->psa_name; - zprop_source_t source = psa->psa_source; isint = (dodefault(propname, 8, 1, &intval) == 0); @@ -614,8 +589,8 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - VERIFY(0 == zap_update(mos, zapobj, propname, - psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); + 
VERIFY0(zap_update(mos, zapobj, propname, + intsz, numints, value, tx)); break; case ZPROP_SRC_INHERITED: /* @@ -626,12 +601,10 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && - dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, - NULL) == 0) { + dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; - err = zap_update(mos, zapobj, inheritstr, - 8, 1, &dummy, tx); - ASSERT(err == 0); + VERIFY0(zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx)); } break; case ZPROP_SRC_RECEIVED: @@ -639,7 +612,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) * set propname$recvd -> value */ err = zap_update(mos, zapobj, recvdstr, - psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); + intsz, numints, value, tx); ASSERT(err == 0); break; case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): @@ -669,7 +642,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) strfree(recvdstr); if (isint) { - VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); + VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { dsl_prop_cb_record_t *cbr; @@ -696,7 +669,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) valstr = valbuf; } else { if (source == ZPROP_SRC_LOCAL) { - valstr = (char *)psa->psa_value; + valstr = value; } else { tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_PUSHPAGE); if (dsl_prop_get_ds(ds, propname, 1, @@ -713,118 +686,73 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) kmem_free(tbuf, ZAP_MAXVALUELEN); } -void -dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +int +dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value) { - dsl_dataset_t *ds = arg1; - dsl_props_arg_t *pa = arg2; - nvlist_t *props = pa->pa_props; - dsl_prop_setarg_t psa; - nvpair_t *elem = NULL; - - 
psa.psa_source = pa->pa_source; - - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - nvpair_t *pair = elem; - - psa.psa_name = nvpair_name(pair); + nvlist_t *nvl = fnvlist_alloc(); + int error; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * dsl_prop_get_all_impl() returns properties in this - * format. - */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - if (nvpair_type(pair) == DATA_TYPE_STRING) { - VERIFY(nvpair_value_string(pair, - (char **)&psa.psa_value) == 0); - psa.psa_intsz = 1; - psa.psa_numints = strlen(psa.psa_value) + 1; - } else { - uint64_t intval; - VERIFY(nvpair_value_uint64(pair, &intval) == 0); - psa.psa_intsz = sizeof (intval); - psa.psa_numints = 1; - psa.psa_value = &intval; - } - dsl_prop_set_sync(ds, &psa, tx); - } + fnvlist_add_uint64(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); } int -dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, - int intsz, int numints, const void *buf) +dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value) { - dsl_dataset_t *ds; - uint64_t version; - int err; - dsl_prop_setarg_t psa; - - /* - * We must do these checks before we get to the syncfunc, since - * it can't fail. - */ - if (strlen(propname) >= ZAP_MAXNAMELEN) - return (ENAMETOOLONG); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - version = spa_version(ds->ds_dir->dd_pool->dp_spa); - if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? 
- ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { - dsl_dataset_rele(ds, FTAG); - return (E2BIG); - } - if (dsl_dataset_is_snapshot(ds) && - version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); - return (ENOTSUP); - } + nvlist_t *nvl = fnvlist_alloc(); + int error; - psa.psa_name = propname; - psa.psa_source = source; - psa.psa_intsz = intsz; - psa.psa_numints = numints; - psa.psa_value = buf; - psa.psa_effective_value = -1ULL; + fnvlist_add_string(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_prop_set_sync, ds, &psa, 2); +int +dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; - dsl_dataset_rele(ds, FTAG); - return (err); + fnvlist_add_boolean(nvl, propname); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); } -int -dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +typedef struct dsl_props_set_arg { + const char *dpsa_dsname; + zprop_source_t dpsa_source; + nvlist_t *dpsa_props; +} dsl_props_set_arg_t; + +static int +dsl_props_set_check(void *arg, dmu_tx_t *tx) { + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t version; nvpair_t *elem = NULL; - dsl_props_arg_t pa; int err; - if ((err = dsl_dataset_hold(dsname, FTAG, &ds))) + err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); + if (err != 0) return (err); - /* - * Do these checks before the syncfunc, since it can't fail. 
- */ + version = spa_version(ds->ds_dir->dd_pool->dp_spa); - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { dsl_dataset_rele(ds, FTAG); return (ENAMETOOLONG); } if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr; - VERIFY(nvpair_value_string(elem, &valstr) == 0); + char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { @@ -834,20 +762,83 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) } } - if (dsl_dataset_is_snapshot(ds) && - version < SPA_VERSION_SNAP_PROPS) { + if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx) +{ + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; - pa.pa_props = props; - pa.pa_source = source; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
+ */ + nvlist_t *attrs = fnvpair_value_nvlist(pair); + pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + const char *value = fnvpair_value_string(pair); + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, 1, strlen(value) + 1, value, tx); + } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { + uint64_t intval = fnvpair_value_uint64(pair); + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, sizeof (intval), 1, &intval, tx); + } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, 0, 0, NULL, tx); + } else { + panic("invalid nvpair type"); + } + } +} - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_props_set_sync, ds, &pa, 2); +static void +dsl_props_set_sync(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); + dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); dsl_dataset_rele(ds, FTAG); - return (err); +} + +/* + * All-or-nothing; if any prop can't be set, nothing will be modified. + */ +int +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +{ + dsl_props_set_arg_t dpsa; + int nblks = 0; + + dpsa.dpsa_dsname = dsname; + dpsa.dpsa_source = source; + dpsa.dpsa_props = props; + + /* + * If the source includes NONE, then we will only be removing entries + * from the ZAP object. In that case don't check for ENOSPC. 
+ */ + if ((source & ZPROP_SRC_NONE) == 0) + nblks = 2 * fnvlist_num_pairs(props); + + return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, + &dpsa, nblks)); } typedef enum dsl_prop_getflags { @@ -997,7 +988,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, if (dsl_dataset_is_snapshot(ds)) flags |= DSL_PROP_GET_SNAPSHOT; - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); if (ds->ds_phys->ds_props_obj != 0) { ASSERT(flags & DSL_PROP_GET_SNAPSHOT); @@ -1022,58 +1013,51 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, break; } out: - rw_exit(&dp->dp_config_rwlock); return (err); } boolean_t -dsl_prop_get_hasrecvd(objset_t *os) +dsl_prop_get_hasrecvd(const char *dsname) { - dsl_dataset_t *ds = os->os_dsl_dataset; - int rc; uint64_t dummy; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); - return (rc == 0); + return (0 == + dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); } -static void -dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) +static int +dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) { - dsl_dataset_t *ds = os->os_dsl_dataset; - uint64_t dummy = 0; - dsl_prop_setarg_t psa; - - if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) - return; + uint64_t version; + spa_t *spa; + int error = 0; - dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); + VERIFY0(spa_open(dsname, &spa, FTAG)); + version = spa_version(spa); + spa_close(spa, FTAG); - (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, - dsl_prop_set_sync, ds, &psa, 2); + if (version >= SPA_VERSION_RECVD_PROPS) + error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); + return (error); } /* * Call after successfully receiving properties to ensure that only the first * 
receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. */ -void -dsl_prop_set_hasrecvd(objset_t *os) +int +dsl_prop_set_hasrecvd(const char *dsname) { - if (dsl_prop_get_hasrecvd(os)) { - ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); - return; - } - dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); + int error = 0; + if (!dsl_prop_get_hasrecvd(dsname)) + error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); + return (error); } void -dsl_prop_unset_hasrecvd(objset_t *os) +dsl_prop_unset_hasrecvd(const char *dsname) { - dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); + VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); } int @@ -1083,16 +1067,25 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) } int -dsl_prop_get_received(objset_t *os, nvlist_t **nvp) +dsl_prop_get_received(const char *dsname, nvlist_t **nvp) { + objset_t *os; + int error; + /* * Received properties are not distinguishable from local properties * until the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. */ - dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? 
DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); - return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); + dmu_objset_rele(os, FTAG); + return (error); } void @@ -1138,8 +1131,6 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dsl_prop_register); EXPORT_SYMBOL(dsl_prop_unregister); -EXPORT_SYMBOL(dsl_prop_numcb); -EXPORT_SYMBOL(dsl_prop_set); EXPORT_SYMBOL(dsl_prop_get); EXPORT_SYMBOL(dsl_prop_get_integer); EXPORT_SYMBOL(dsl_prop_get_all); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 90ca7b256..2e5034bdf 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -53,7 +53,7 @@ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scan_cb_t dsl_scan_scrub_cb; -static dsl_syncfunc_t dsl_scan_cancel_sync; +static void dsl_scan_cancel_sync(void *, dmu_tx_t *); static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ @@ -150,9 +150,9 @@ dsl_scan_fini(dsl_pool_t *dp) /* ARGSUSED */ static int -dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state == DSS_SCANNING) return (EBUSY); @@ -160,12 +160,11 @@ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void -dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; - pool_scan_func_t *funcp = arg2; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; @@ -312,9 +311,9 @@ dsl_scan_done(dsl_scan_t 
*scn, boolean_t complete, dmu_tx_t *tx) /* ARGSUSED */ static int -dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state != DSS_SCANNING) return (ENOENT); @@ -323,9 +322,9 @@ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx); @@ -334,12 +333,8 @@ dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { - boolean_t complete = B_FALSE; - int err; - - err = dsl_sync_task_do(dp, dsl_scan_cancel_check, - dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); - return (err); + return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, + dsl_scan_cancel_sync, NULL, 3)); } static void dsl_scan_visitbp(blkptr_t *bp, @@ -375,7 +370,7 @@ dsl_scan_ds_maxtxg(dsl_dataset_t *ds) static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { - VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); @@ -959,33 +954,33 @@ struct enqueue_clones_arg { /* ARGSUSED */ static int -enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { struct enqueue_clones_arg *eca = arg; dsl_dataset_t *ds; int err; - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj) + return (0); + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - if 
(ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1075,17 +1070,17 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) } if (usenext) { - VERIFY(zap_join_key(dp->dp_meta_objset, + VERIFY0(zap_join_key(dp->dp_meta_objset, ds->ds_phys->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, - ds->ds_phys->ds_creation_txg, tx) == 0); + ds->ds_phys->ds_creation_txg, tx)); } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; - (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, - NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); } } @@ -1095,15 +1090,14 @@ out: /* ARGSUSED */ static int -enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); @@ -1261,8 +1255,8 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) return; 
if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, - NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_cb, tx, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); @@ -1402,7 +1396,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); - dsl_scan_setup_sync(scn, &func, tx); + dsl_scan_setup_sync(&func, tx); } if (!dsl_scan_active(scn) || @@ -1436,21 +1430,21 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); - VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); - if (err != 0) - return; - - /* disable async destroy feature */ - spa_feature_decr(spa, - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); - ASSERT(!spa_feature_is_active(spa, - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); - VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, tx)); - VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, - dp->dp_bptree_obj, tx)); - dp->dp_bptree_obj = 0; + VERIFY0(zio_wait(scn->scn_zio_root)); + + if (err == 0) { + zfeature_info_t *feat = &spa_feature_table + [SPA_FEATURE_ASYNC_DESTROY]; + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, feat, tx); + ASSERT(!spa_feature_is_active(spa, feat)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " @@ -1497,7 +1491,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, 
FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; @@ -1734,8 +1730,8 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); - return (dsl_sync_task_do(dp, dsl_scan_setup_check, - dsl_scan_setup_sync, dp->dp_scan, &func, 0)); + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0)); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 2ed47fe0c..6eb712314 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -34,138 +34,115 @@ /* ARGSUSED */ static int -dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_null_checkfunc(void *arg, dmu_tx_t *tx) { return (0); } -dsl_sync_task_group_t * -dsl_sync_task_group_create(dsl_pool_t *dp) -{ - dsl_sync_task_group_t *dstg; - - dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); - list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), - offsetof(dsl_sync_task_t, dst_node)); - dstg->dstg_pool = dp; - - return (dstg); -} - -void -dsl_sync_task_create(dsl_sync_task_group_t *dstg, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_t *dst; - - if (checkfunc == NULL) - checkfunc = dsl_null_checkfunc; - dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); - dst->dst_checkfunc = checkfunc; - dst->dst_syncfunc = syncfunc; - dst->dst_arg1 = arg1; - dst->dst_arg2 = arg2; - list_insert_tail(&dstg->dstg_tasks, dst); - - dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; -} - +/* + * Called from open context to perform a callback in syncing context. Waits + * for the operation to complete. + * + * The checkfunc will be called from open context as a preliminary check + * which can quickly fail. If it succeeds, it will be called again from + * syncing context. 
The checkfunc should generally be designed to work + * properly in either context, but if necessary it can check + * dmu_tx_is_syncing(tx). + * + * The synctask infrastructure enforces proper locking strategy with respect + * to the dp_config_rwlock -- the lock will always be held when the callbacks + * are called. It will be held for read during the open-context (preliminary) + * call to the checkfunc, and then held for write from syncing context during + * the calls to the check and sync funcs. + * + * A dataset or pool name can be passed as the first argument. Typically, + * the check func will hold, check the return value of the hold, and then + * release the dataset. The sync func will VERIFY0(hold()) the dataset. + * This is safe because no changes can be made between the check and sync funcs, + * and the sync func will only be called if the check func successfully opened + * the dataset. + */ int -dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) +dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified) { + spa_t *spa; dmu_tx_t *tx; - uint64_t txg; - dsl_sync_task_t *dst; - -top: - tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - - txg = dmu_tx_get_txg(tx); + int err; + dsl_sync_task_t dst = { { { NULL } } }; + dsl_pool_t *dp; - /* Do a preliminary error check. */ - dstg->dstg_err = 0; -#ifdef ZFS_DEBUG - /* - * Only check half the time, otherwise, the sync-context - * check will almost never fail.
- */ - if (spa_get_random(2) == 0) - goto skip; -#endif - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); + err = spa_open(pool, &spa, FTAG); + if (err != 0) + return (err); + dp = spa_get_dsl(spa); - if (dstg->dstg_err) { +top: + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + dst.dst_pool = dp; + dst.dst_txg = dmu_tx_get_txg(tx); + dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; + dst.dst_syncfunc = syncfunc; + dst.dst_arg = arg; + dst.dst_error = 0; + dst.dst_nowaiter = B_FALSE; + + dsl_pool_config_enter(dp, FTAG); + err = dst.dst_checkfunc(arg, tx); + dsl_pool_config_exit(dp, FTAG); + + if (err != 0) { dmu_tx_commit(tx); - return (dstg->dstg_err); + spa_close(spa, FTAG); + return (err); } -#ifdef ZFS_DEBUG -skip: -#endif - /* - * We don't generally have many sync tasks, so pay the price of - * add_tail to get the tasks executed in the right order. 
- */ - VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, - dstg, txg)); + VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); dmu_tx_commit(tx); - txg_wait_synced(dstg->dstg_pool, txg); + txg_wait_synced(dp, dst.dst_txg); - if (dstg->dstg_err == EAGAIN) { - txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); + if (dst.dst_error == EAGAIN) { + txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); goto top; } - return (dstg->dstg_err); + spa_close(spa, FTAG); + return (dst.dst_error); } void -dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, dmu_tx_t *tx) { - uint64_t txg; + dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - dstg->dstg_nowaiter = B_TRUE; - txg = dmu_tx_get_txg(tx); - /* - * We don't generally have many sync tasks, so pay the price of - * add_tail to get the tasks executed in the right order. - */ - VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, - dstg, txg)); -} - -void -dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) -{ - dsl_sync_task_t *dst; + dst->dst_pool = dp; + dst->dst_txg = dmu_tx_get_txg(tx); + dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst->dst_checkfunc = dsl_null_checkfunc; + dst->dst_syncfunc = syncfunc; + dst->dst_arg = arg; + dst->dst_error = 0; + dst->dst_nowaiter = B_TRUE; - while ((dst = list_head(&dstg->dstg_tasks))) { - list_remove(&dstg->dstg_tasks, dst); - kmem_free(dst, sizeof (dsl_sync_task_t)); - } - kmem_free(dstg, sizeof (dsl_sync_task_group_t)); + VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); } +/* + * Called in syncing context to execute the synctask. 
+ */ void -dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) { - dsl_sync_task_t *dst; - dsl_pool_t *dp = dstg->dstg_pool; + dsl_pool_t *dp = dst->dst_pool; uint64_t quota, used; - ASSERT0(dstg->dstg_err); + ASSERT0(dst->dst_error); /* * Check for sufficient space. We just check against what's @@ -177,70 +154,24 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); used = dp->dp_root_dir->dd_phys->dd_used_bytes; /* MOS space is triple-dittoed, so we multiply by 3. */ - if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) { - dstg->dstg_err = ENOSPC; + if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { + dst->dst_error = ENOSPC; + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); return; } /* - * Check for errors by calling checkfuncs. + * Check for errors by calling checkfunc. */ - rw_enter(&dp->dp_config_rwlock, RW_WRITER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - - if (dstg->dstg_err == 0) { - /* - * Execute sync tasks. 
- */ - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); - } - } - rw_exit(&dp->dp_config_rwlock); - - if (dstg->dstg_nowaiter) - dsl_sync_task_group_destroy(dstg); -} - -int -dsl_sync_task_do(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_group_t *dstg; - int err; - - ASSERT(spa_writeable(dp->dp_spa)); - - dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - return (err); -} - -void -dsl_sync_task_do_nowait(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) -{ - dsl_sync_task_group_t *dstg; - - dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - dsl_sync_task_group_nowait(dstg, tx); + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); + if (dst->dst_error == 0) + dst->dst_syncfunc(dst->dst_arg, tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); } #if defined(_KERNEL) && defined(HAVE_SPL) -EXPORT_SYMBOL(dsl_sync_task_do); -EXPORT_SYMBOL(dsl_sync_task_do_nowait); #endif diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c new file mode 100644 index 000000000..c8bc4424f --- /dev/null +++ b/module/zfs/dsl_userhold.c @@ -0,0 +1,537 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dsl_userhold.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/zfs_onexit.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> + +typedef struct dsl_dataset_user_hold_arg { + nvlist_t *dduha_holds; + nvlist_t *dduha_errlist; + minor_t dduha_minor; +} dsl_dataset_user_hold_arg_t; + +/* + * If you add new checks here, you may need to add additional checks to the + * "temporary" case in snapshot_check() in dmu_objset.c. 
+ */ +int +dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, + boolean_t temphold, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + int error = 0; + + if (strlen(htag) > MAXNAMELEN) + return (E2BIG); + /* Tempholds have a more restricted length */ + if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (E2BIG); + + /* tags must be unique (if ds already exists) */ + if (ds != NULL) { + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj != 0) { + uint64_t value; + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, + htag, 8, 1, &value); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + } + + return (error); +} + +static int +dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *htag; + + /* must be a snapshot */ + if (strchr(nvpair_name(pair), '@') == NULL) + error = EINVAL; + + if (error == 0) + error = nvpair_value_string(pair, &htag); + if (error == 0) { + error = dsl_dataset_hold(dp, + nvpair_name(pair), FTAG, &ds); + } + if (error == 0) { + error = dsl_dataset_user_hold_check_one(ds, htag, + dduha->dduha_minor != 0, tx); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + rv = error; + fnvlist_add_int32(dduha->dduha_errlist, + nvpair_name(pair), error); + } + } + return (rv); +} + +void +dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, + minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if 
(ds->ds_phys->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (minor != 0) { + VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, + htag, now, tx)); + dsl_register_onexit_hold_cleanup(ds, htag, minor); + } + + spa_history_log_internal_ds(ds, "hold", tx, + "tag=%s temp=%d refs=%llu", + htag, minor != 0, ds->ds_userrefs); +} + +static void +dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + uint64_t now = gethrestime_sec(); + + for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_hold_sync_one(ds, fnvpair_value_string(pair), + dduha->dduha_minor, now, tx); + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * holds is nvl of snapname -> holdname + * errlist will be filled in with snapname -> error + * if cleanup_minor is not 0, the holds will be temporary, cleaned up + * when the process exits. + * + * if any fails, all will fail. 
+ */ +int +dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) +{ + dsl_dataset_user_hold_arg_t dduha; + nvpair_t *pair; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + dduha.dduha_holds = holds; + dduha.dduha_errlist = errlist; + dduha.dduha_minor = cleanup_minor; + + return (dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds))); +} + +typedef struct dsl_dataset_user_release_arg { + nvlist_t *ddura_holds; + nvlist_t *ddura_todelete; + nvlist_t *ddura_errlist; +} dsl_dataset_user_release_arg_t; + +static int +dsl_dataset_user_release_check_one(dsl_dataset_t *ds, + nvlist_t *holds, boolean_t *todelete) +{ + uint64_t zapobj; + nvpair_t *pair; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error; + int numholds = 0; + + *todelete = B_FALSE; + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) + return (ESRCH); + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + /* Make sure the hold exists */ + uint64_t tmp; + error = zap_lookup(mos, zapobj, nvpair_name(pair), 8, 1, &tmp); + if (error == ENOENT) + error = ESRCH; + if (error != 0) + return (error); + numholds++; + } + + if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 && + ds->ds_userrefs == numholds) { + /* we need to destroy the snapshot as well */ + + if (dsl_dataset_long_held(ds)) + return (EBUSY); + *todelete = B_TRUE; + } + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + const char *name = 
nvpair_name(pair); + int error; + dsl_dataset_t *ds; + nvlist_t *holds; + + error = nvpair_value_nvlist(pair, &holds); + if (error != 0) + return (EINVAL); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + boolean_t deleteme; + error = dsl_dataset_user_release_check_one(ds, + holds, &deleteme); + if (error == 0 && deleteme) { + fnvlist_add_boolean(ddura->ddura_todelete, + name); + } + dsl_dataset_rele(ds, FTAG); + } + if (error != 0) { + if (ddura->ddura_errlist != NULL) { + fnvlist_add_int32(ddura->ddura_errlist, + name, error); + } + rv = error; + } + } + return (rv); +} + +static void +dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + int error; + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + ds->ds_userrefs--; + error = dsl_pool_user_release(dp, ds->ds_object, + nvpair_name(pair), tx); + VERIFY(error == 0 || error == ENOENT); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY0(zap_remove(mos, zapobj, nvpair_name(pair), tx)); + + spa_history_log_internal_ds(ds, "release", tx, + "tag=%s refs=%lld", nvpair_name(pair), + (longlong_t)ds->ds_userrefs); + } +} + +static void +dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_release_sync_one(ds, + fnvpair_value_nvlist(pair), tx); + if (nvlist_exists(ddura->ddura_todelete, + nvpair_name(pair))) { + ASSERT(ds->ds_userrefs == 0 && + ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)); + dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); 
+ } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + * + * if any fails, all will fail. + */ +int +dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) +{ + dsl_dataset_user_release_arg_t ddura; + nvpair_t *pair; + int error; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + ddura.ddura_holds = holds; + ddura.ddura_errlist = errlist; + ddura.ddura_todelete = fnvlist_alloc(); + + error = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, &ddura, fnvlist_num_pairs(holds)); + fnvlist_free(ddura.ddura_todelete); + return (error); +} + +typedef struct dsl_dataset_user_release_tmp_arg { + uint64_t ddurta_dsobj; + nvlist_t *ddurta_holds; + boolean_t ddurta_deleteme; +} dsl_dataset_user_release_tmp_arg_t; + +static int +dsl_dataset_user_release_tmp_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_tmp_arg_t *ddurta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + error = dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds); + if (error) + return (error); + + error = dsl_dataset_user_release_check_one(ds, + ddurta->ddurta_holds, &ddurta->ddurta_deleteme); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dataset_user_release_tmp_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_tmp_arg_t *ddurta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds)); + dsl_dataset_user_release_sync_one(ds, ddurta->ddurta_holds, tx); + if (ddurta->ddurta_deleteme) { + ASSERT(ds->ds_userrefs == 0 && + ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)); + dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); + } + dsl_dataset_rele(ds, FTAG); +} + +/* + * Called at spa_load time to release a stale 
temporary user hold. + * Also called by the onexit code. + */ +void +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag) +{ + dsl_dataset_user_release_tmp_arg_t ddurta; + +#ifdef _KERNEL + dsl_dataset_t *ds; + int error; + + /* Make sure it is not mounted. */ + dsl_pool_config_enter(dp, FTAG); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (error == 0) { + char name[MAXNAMELEN]; + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + zfs_unmount_snap(name); + } else { + dsl_pool_config_exit(dp, FTAG); + } +#endif + + ddurta.ddurta_dsobj = dsobj; + ddurta.ddurta_holds = fnvlist_alloc(); + fnvlist_add_boolean(ddurta.ddurta_holds, htag); + + (void) dsl_sync_task(spa_name(dp->dp_spa), + dsl_dataset_user_release_tmp_check, + dsl_dataset_user_release_tmp_sync, &ddurta, 1); + fnvlist_free(ddurta.ddurta_holds); +} + +typedef struct zfs_hold_cleanup_arg { + char zhca_spaname[MAXNAMELEN]; + uint64_t zhca_spa_load_guid; + uint64_t zhca_dsobj; + char zhca_htag[MAXNAMELEN]; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + spa_t *spa; + int error; + + error = spa_open(ca->zhca_spaname, &spa, FTAG); + if (error != 0) { + zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s " + "because pool is no longer loaded", + ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag); + return; + } + if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { + zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s " + "because pool is no longer loaded (guid doesn't match)", + ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag); + spa_close(spa, FTAG); + return; + } + + dsl_dataset_user_release_tmp(spa_get_dsl(spa), + ca->zhca_dsobj, ca->zhca_htag); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + spa_close(spa, FTAG); +} + +void +dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor) +{ + zfs_hold_cleanup_arg_t 
*ca = kmem_alloc(sizeof (*ca), KM_SLEEP); + spa_t *spa = dsl_dataset_get_spa(ds); + (void) strlcpy(ca->zhca_spaname, spa_name(spa), + sizeof (ca->zhca_spaname)); + ca->zhca_spa_load_guid = spa_load_guid(spa); + ca->zhca_dsobj = ds->ds_object; + (void) strlcpy(ca->zhca_htag, htag, sizeof (ca->zhca_htag)); + VERIFY0(zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + fnvlist_add_uint64(nvl, za->za_name, + za->za_first_integer); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (0); +} diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index cd1b6ce73..f9cb8cead 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1928,6 +1928,46 @@ void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) spa_config_exit(spa, SCL_VDEV, FTAG); } +static void +checkmap(space_map_t *sm, uint64_t off, uint64_t size) +{ + space_seg_t *ss; + avl_index_t where; + + mutex_enter(sm->sm_lock); + ss = space_map_find(sm, off, size, &where); + if (ss != NULL) + panic("freeing free block; ss=%p", (void *)ss); + mutex_exit(sm->sm_lock); +} + +void +metaslab_check_free(spa_t *spa, const blkptr_t *bp) +{ + int i, j; + + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (i = 0; i < 
BP_GET_NDVAS(bp); i++) { + uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]); + vdev_t *vd = vdev_lookup_top(spa, vdid); + uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); + metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift]; + + if (ms->ms_map->sm_loaded) + checkmap(ms->ms_map, off, size); + + for (j = 0; j < TXG_SIZE; j++) + checkmap(ms->ms_freemap[j], off, size); + for (j = 0; j < TXG_DEFER_SIZE; j++) + checkmap(ms->ms_defermap[j], off, size); + } + spa_config_exit(spa, SCL_VDEV, FTAG); +} + #if defined(_KERNEL) && defined(HAVE_SPL) module_param(metaslab_debug, int, 0644); MODULE_PARM_DESC(metaslab_debug, "keep space maps in core to verify frees"); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index e43807c8e..49980efcc 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -32,7 +33,7 @@ int reference_tracking_enable = FALSE; /* runs out of memory too easily */ #else int reference_tracking_enable = TRUE; #endif -int reference_history = 4; /* tunable */ +int reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; static kmem_cache_t *reference_history_cache; @@ -64,6 +65,14 @@ refcount_create(refcount_t *rc) offsetof(reference_t, ref_link)); rc->rc_count = 0; rc->rc_removed_count = 0; + rc->rc_tracked = reference_tracking_enable; +} + +void +refcount_create_untracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_FALSE; } void @@ -96,14 +105,12 @@ refcount_destroy(refcount_t *rc) int refcount_is_zero(refcount_t *rc) { - ASSERT(rc->rc_count >= 0); return (rc->rc_count == 0); } int64_t refcount_count(refcount_t *rc) { - ASSERT(rc->rc_count >= 0); return (rc->rc_count); } @@ -113,14 +120,14 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) reference_t *ref = NULL; int64_t count; - if (reference_tracking_enable) { + if (rc->rc_tracked) { ref = kmem_cache_alloc(reference_cache, KM_PUSHPAGE); ref->ref_holder = holder; ref->ref_number = number; } mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= 0); - if (reference_tracking_enable) + if (rc->rc_tracked) list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; @@ -144,7 +151,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= number); - if (!reference_tracking_enable) { + if (!rc->rc_tracked) { rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -161,7 +168,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) KM_PUSHPAGE); list_insert_head(&rc->rc_removed, ref); rc->rc_removed_count++; - if (rc->rc_removed_count >= reference_history) { + if (rc->rc_removed_count > reference_history) { ref = list_tail(&rc->rc_removed); list_remove(&rc->rc_removed, ref); 
kmem_cache_free(reference_history_cache, diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c index 7f9290bd4..8e80166c7 100644 --- a/module/zfs/rrwlock.c +++ b/module/zfs/rrwlock.c @@ -75,8 +75,9 @@ uint_t rrw_tsd_key; typedef struct rrw_node { - struct rrw_node *rn_next; - rrwlock_t *rn_rrl; + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; + void *rn_tag; } rrw_node_t; static rrw_node_t * @@ -98,13 +99,14 @@ rrn_find(rrwlock_t *rrl) * Add a node to the head of the singly linked list. */ static void -rrn_add(rrwlock_t *rrl) +rrn_add(rrwlock_t *rrl, void *tag) { rrw_node_t *rn; rn = kmem_alloc(sizeof (*rn), KM_SLEEP); rn->rn_rrl = rrl; rn->rn_next = tsd_get(rrw_tsd_key); + rn->rn_tag = tag; VERIFY(tsd_set(rrw_tsd_key, rn) == 0); } @@ -113,7 +115,7 @@ rrn_add(rrwlock_t *rrl) * thread's list and return TRUE; otherwise return FALSE. */ static boolean_t -rrn_find_and_remove(rrwlock_t *rrl) +rrn_find_and_remove(rrwlock_t *rrl, void *tag) { rrw_node_t *rn; rrw_node_t *prev = NULL; @@ -122,7 +124,7 @@ rrn_find_and_remove(rrwlock_t *rrl) return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) { + if (rn->rn_rrl == rrl && rn->rn_tag == tag) { if (prev) prev->rn_next = rn->rn_next; else @@ -136,7 +138,7 @@ rrn_find_and_remove(rrwlock_t *rrl) } void -rrw_init(rrwlock_t *rrl) +rrw_init(rrwlock_t *rrl, boolean_t track_all) { mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); @@ -144,6 +146,7 @@ rrw_init(rrwlock_t *rrl) refcount_create(&rrl->rr_anon_rcount); refcount_create(&rrl->rr_linked_rcount); rrl->rr_writer_wanted = B_FALSE; + rrl->rr_track_all = track_all; } void @@ -156,12 +159,13 @@ rrw_destroy(rrwlock_t *rrl) refcount_destroy(&rrl->rr_linked_rcount); } -static void +void rrw_enter_read(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(DEBUG) && defined(_KERNEL) - if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + if (rrl->rr_writer == NULL && 
!rrl->rr_writer_wanted && + !rrl->rr_track_all) { rrl->rr_anon_rcount.rc_count++; mutex_exit(&rrl->rr_lock); return; @@ -171,14 +175,14 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); - while (rrl->rr_writer || (rrl->rr_writer_wanted && + while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && refcount_is_zero(&rrl->rr_anon_rcount) && rrn_find(rrl) == NULL)) cv_wait(&rrl->rr_cv, &rrl->rr_lock); - if (rrl->rr_writer_wanted) { + if (rrl->rr_writer_wanted || rrl->rr_track_all) { /* may or may not be a re-entrant enter */ - rrn_add(rrl); + rrn_add(rrl, tag); (void) refcount_add(&rrl->rr_linked_rcount, tag); } else { (void) refcount_add(&rrl->rr_anon_rcount, tag); @@ -187,7 +191,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) mutex_exit(&rrl->rr_lock); } -static void +void rrw_enter_write(rrwlock_t *rrl) { mutex_enter(&rrl->rr_lock); @@ -233,10 +237,12 @@ rrw_exit(rrwlock_t *rrl, void *tag) if (rrl->rr_writer == NULL) { int64_t count; - if (rrn_find_and_remove(rrl)) + if (rrn_find_and_remove(rrl, tag)) { count = refcount_remove(&rrl->rr_linked_rcount, tag); - else + } else { + ASSERT(!rrl->rr_track_all); count = refcount_remove(&rrl->rr_anon_rcount, tag); + } if (count == 0) cv_broadcast(&rrl->rr_cv); } else { @@ -249,6 +255,11 @@ rrw_exit(rrwlock_t *rrl, void *tag) mutex_exit(&rrl->rr_lock); } +/* + * If the lock was created with track_all, rrw_held(RW_READER) will return + * B_TRUE iff the current thread has the lock for reader. Otherwise it may + * return B_TRUE if any thread has the lock for reader. 
+ */ boolean_t rrw_held(rrwlock_t *rrl, krw_t rw) { @@ -259,7 +270,7 @@ rrw_held(rrwlock_t *rrl, krw_t rw) held = (rrl->rr_writer == curthread); } else { held = (!refcount_is_zero(&rrl->rr_anon_rcount) || - !refcount_is_zero(&rrl->rr_linked_rcount)); + rrn_find(rrl) != NULL); } mutex_exit(&rrl->rr_lock); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 581cf4b0d..bad6123aa 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1019,10 +1019,10 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t *tb; int error; - mutex_enter(&os->os_lock); + mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); - mutex_exit(&os->os_lock); + mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; @@ -1035,7 +1035,7 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, os->os_sa = sa; mutex_enter(&sa->sa_lock); - mutex_exit(&os->os_lock); + mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7c37ca426..fcb1711a2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -64,6 +64,7 @@ #include <sys/zfs_ioctl.h> #include <sys/dsl_scan.h> #include <sys/zfeature.h> +#include <sys/dsl_destroy.h> #include <sys/zvol.h> #ifdef _KERNEL @@ -131,10 +132,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static dsl_syncfunc_t spa_sync_version; -static dsl_syncfunc_t spa_sync_props; -static dsl_checkfunc_t spa_change_guid_check; -static dsl_syncfunc_t spa_change_guid_sync; +static void spa_sync_version(void *arg, dmu_tx_t *tx); +static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int 
spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, @@ -329,10 +328,10 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); + dsl_pool_config_enter(dp, FTAG); if ((err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds))) { - rw_exit(&dp->dp_config_rwlock); + dsl_pool_config_exit(dp, FTAG); break; } @@ -341,7 +340,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) KM_PUSHPAGE); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); - rw_exit(&dp->dp_config_rwlock); + dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; @@ -495,9 +494,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (dmu_objset_type(os) != DMU_OST_ZFS) { error = ENOTSUP; - } else if ((error = dsl_prop_get_integer(strval, + } else if ((error = + dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &compress, NULL)) == 0 && + &compress)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { error = ENOTSUP; } else { @@ -661,8 +661,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) * read object, the features for write object, or the * feature descriptions object. 
*/ - error = dsl_sync_task_do(spa_get_dsl(spa), NULL, - spa_sync_version, spa, &ver, 6); + error = dsl_sync_task(spa->spa_name, NULL, + spa_sync_version, &ver, 6); if (error) return (error); continue; @@ -673,8 +673,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) } if (need_sync) { - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 6)); + return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, + nvp, 6)); } return (0); @@ -696,12 +696,12 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) /*ARGSUSED*/ static int -spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) +spa_change_guid_check(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; - ASSERTV(uint64_t *newguid = arg2); + ASSERTV(uint64_t *newguid = arg); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; @@ -716,10 +716,10 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) +spa_change_guid_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - uint64_t *newguid = arg2; + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; @@ -753,8 +753,8 @@ spa_change_guid(spa_t *spa) mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); - error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, - spa_change_guid_sync, spa, &guid, 5); + error = dsl_sync_task(spa->spa_name, spa_change_guid_check, + spa_change_guid_sync, &guid, 5); if (error == 0) { spa_config_sync(spa, B_FALSE, B_TRUE); @@ -1729,23 +1729,24 @@ spa_config_valid(spa_t *spa, nvlist_t *config) /* * Check for missing log devices */ -static int +static boolean_t spa_check_logs(spa_t *spa) { + boolean_t rv = B_FALSE; + switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been 
restored */ case SPA_LOG_UNKNOWN: - if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, - DS_FIND_CHILDREN)) { + rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, + NULL, DS_FIND_CHILDREN) != 0); + if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); - return (1); - } break; } - return (0); + return (rv); } static boolean_t @@ -1793,11 +1794,11 @@ spa_activate_log(spa_t *spa) int spa_offline_log(spa_t *spa) { - int error = 0; - - if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN)) == 0) { + int error; + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed @@ -3610,7 +3611,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(spa, props, tx); + spa_sync_props(props, tx); } dmu_tx_commit(tx); @@ -3844,7 +3845,7 @@ out: * Import a non-root pool into the system. */ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) +spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; @@ -5878,10 +5879,11 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) } static void -spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) +spa_sync_version(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - uint64_t version = *(uint64_t *)arg2; + uint64_t *versionp = arg; + uint64_t version = *versionp; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. @@ -5900,11 +5902,11 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) * Set zpool properties. 
*/ static void -spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) +spa_sync_props(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; + nvlist_t *nvp = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; - nvlist_t *nvp = arg2; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); @@ -6056,6 +6058,8 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) ASSERT(spa->spa_sync_pass == 1); + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); @@ -6081,6 +6085,7 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } + rrw_exit(&dp->dp_config_rwlock, FTAG); } /* diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 79d48620c..bbcd697e0 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -197,10 +197,10 @@ spa_history_zone(void) */ /*ARGSUSED*/ static void -spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) +spa_history_log_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - nvlist_t *nvl = arg2; + nvlist_t *nvl = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; spa_history_phys_t *shpp; @@ -222,7 +222,7 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) * Get the offset of where we need to write via the bonus buffer. * Update the offset when the write completes. */ - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); @@ -326,8 +326,8 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); /* Kick this off asynchronously; errors are ignored. 
*/ - dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, - spa_history_log_sync, spa, nvarg, 0, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, + nvarg, 0, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ @@ -465,10 +465,10 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(spa, nvl, tx); + spa_history_log_sync(nvl, tx); } else { - dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, - spa_history_log_sync, spa, nvl, 0, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_history_log_sync, nvl, 0, tx); } /* spa_history_log_sync() will free nvl */ } @@ -544,17 +544,11 @@ spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, void spa_history_log_version(spa_t *spa, const char *operation) { -#ifdef _KERNEL - uint64_t current_vers = spa_version(spa); - spa_history_log_internal(spa, operation, NULL, "pool version %llu; software version %llu/%d; uts %s %s %s %s", - (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, utsname.nodename, utsname.release, utsname.version, utsname.machine); - cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", operation, - (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); -#endif } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 0ca9f3a7a..a5e13b5fb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -268,7 +268,7 @@ spa_config_lock_init(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - refcount_create(&scl->scl_count); + refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } @@ -326,6 +326,8 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) int wlocks_held = 0; int i; + 
ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); + for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) @@ -406,27 +408,22 @@ spa_lookup(const char *name) static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; - char c = 0; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + /* * If it's a full dataset name, figure out the pool name and * just use that. */ - cp = strpbrk(name, "/@"); - if (cp) { - c = *cp; + cp = strpbrk(search.spa_name, "/@"); + if (cp != NULL) *cp = '\0'; - } - (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); spa = avl_find(&spa_namespace_avl, &search, &where); - if (cp) - *cp = c; - return (spa); } @@ -539,6 +536,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) KM_SLEEP) == 0); } + spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); + return (spa); } diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index a031f3a20..2cf1d2a18 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -102,7 +102,7 @@ void space_map_add(space_map_t *sm, uint64_t start, uint64_t size) { avl_index_t where; - space_seg_t ssearch, *ss_before, *ss_after, *ss; + space_seg_t *ss_before, *ss_after, *ss; uint64_t end = start + size; int merge_before, merge_after; @@ -115,11 +115,8 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { + ss = space_map_find(sm, start, size, &where); + if (ss != NULL) { zfs_panic_recover("zfs: allocating allocated segment" "(offset=%llu size=%llu)\n", (longlong_t)start, (longlong_t)size); @@ -171,19 +168,12 @@ void 
space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) { avl_index_t where; - space_seg_t ssearch, *ss, *newseg; + space_seg_t *ss, *newseg; uint64_t end = start + size; int left_over, right_over; - ASSERT(MUTEX_HELD(sm->sm_lock)); VERIFY(!sm->sm_condensing); - VERIFY(size != 0); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); + ss = space_map_find(sm, start, size, &where); /* Make sure we completely overlap with someone */ if (ss == NULL) { @@ -226,12 +216,11 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) sm->sm_space -= size; } -boolean_t -space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) +space_seg_t * +space_map_find(space_map_t *sm, uint64_t start, uint64_t size, + avl_index_t *wherep) { - avl_index_t where; space_seg_t ssearch, *ss; - uint64_t end = start + size; ASSERT(MUTEX_HELD(sm->sm_lock)); VERIFY(size != 0); @@ -239,10 +228,20 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); + ssearch.ss_end = start + size; + ss = avl_find(&sm->sm_root, &ssearch, wherep); + + if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size) + return (ss); + return (NULL); +} + +boolean_t +space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; - return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); + return (space_map_find(sm, start, size, &where) != 0); } void diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 7c820af4f..b3e537f45 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -659,6 +659,8 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; + ASSERT(!dsl_pool_config_held(dp)); + 
mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) @@ -682,6 +684,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; + ASSERT(!dsl_pool_config_held(dp)); + mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) @@ -747,42 +751,43 @@ txg_list_empty(txg_list_t *tl, uint64_t txg) } /* - * Add an entry to the list. - * Returns 0 if it's a new entry, 1 if it's already there. + * Add an entry to the list (unless it's already on the list). + * Returns B_TRUE if it was actually added. */ -int +boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - int already_on_list; + boolean_t add; mutex_enter(&tl->tl_lock); - already_on_list = tn->tn_member[t]; - if (!already_on_list) { + add = (tn->tn_member[t] == 0); + if (add) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); - return (already_on_list); + return (add); } /* - * Add an entry to the end of the list (walks list to find end). - * Returns 0 if it's a new entry, 1 if it's already there. + * Add an entry to the end of the list, unless it's already on the list. + * (walks list to find end) + * Returns B_TRUE if it was actually added. 
*/ -int +boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - int already_on_list; + boolean_t add; mutex_enter(&tl->tl_lock); - already_on_list = tn->tn_member[t]; - if (!already_on_list) { + add = (tn->tn_member[t] == 0); + if (add) { txg_node_t **tp; for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) @@ -794,7 +799,7 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) } mutex_exit(&tl->tl_lock); - return (already_on_list); + return (add); } /* @@ -845,13 +850,13 @@ txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) return (NULL); } -int +boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - return (tn->tn_member[t]); + return (tn->tn_member[t] != 0); } /* diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index a03e1c694..3cf0089ec 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -80,6 +80,7 @@ #include <sys/zfs_vnops.h> #include <sys/stat.h> #include <sys/dmu.h> +#include <sys/dsl_destroy.h> #include <sys/dsl_deleg.h> #include <sys/mount.h> #include <sys/zpl.h> @@ -488,13 +489,13 @@ zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name) */ /*ARGSUSED*/ int -zfsctl_snapdir_rename(struct inode *sdip, char *sname, - struct inode *tdip, char *tname, cred_t *cr, int flags) +zfsctl_snapdir_rename(struct inode *sdip, char *snm, + struct inode *tdip, char *tnm, cred_t *cr, int flags) { zfs_sb_t *zsb = ITOZSB(sdip); zfs_snapentry_t search, *sep; avl_index_t where; - char *to, *from, *real; + char *to, *from, *real, *fsname; int error; ZFS_ENTER(zsb); @@ -502,23 +503,26 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, to = kmem_alloc(MAXNAMELEN, KM_SLEEP); from = kmem_alloc(MAXNAMELEN, KM_SLEEP); real = kmem_alloc(MAXNAMELEN, KM_SLEEP); + fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); if 
(zsb->z_case == ZFS_CASE_INSENSITIVE) { - error = dmu_snapshot_realname(zsb->z_os, sname, real, + error = dmu_snapshot_realname(zsb->z_os, snm, real, MAXNAMELEN, NULL); if (error == 0) { - sname = real; + snm = real; } else if (error != ENOTSUP) { goto out; } } - error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from); - if (!error) - error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to); - if (!error) + dmu_objset_name(zsb->z_os, fsname); + + error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from); + if (error == 0) + error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to); + if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); - if (error) + if (error != 0) goto out; /* @@ -532,21 +536,21 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, /* * No-op when names are identical. */ - if (strcmp(sname, tname) == 0) { + if (strcmp(snm, tnm) == 0) { error = 0; goto out; } mutex_enter(&zsb->z_ctldir_lock); - error = dmu_objset_rename(from, to, B_FALSE); + error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (error) goto out_unlock; - search.se_name = (char *)sname; + search.se_name = (char *)snm; sep = avl_find(&zsb->z_ctldir_snaps, &search, &where); if (sep) - zfsctl_rename_snap(zsb, sep, tname); + zfsctl_rename_snap(zsb, sep, tnm); out_unlock: mutex_exit(&zsb->z_ctldir_lock); @@ -554,6 +558,7 @@ out: kmem_free(from, MAXNAMELEN); kmem_free(to, MAXNAMELEN); kmem_free(real, MAXNAMELEN); + kmem_free(fsname, MAXNAMELEN); ZFS_EXIT(zsb); @@ -588,14 +593,14 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) } error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname); - if (!error) + if (error == 0) error = zfs_secpolicy_destroy_perms(snapname, cr); - if (error) + if (error != 0) goto out; error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE); if ((error == 0) || (error == ENOENT)) - error = dmu_objset_destroy(snapname, B_FALSE); + error = dsl_destroy_snapshot(snapname, B_FALSE); out: 
kmem_free(snapname, MAXNAMELEN); kmem_free(real, MAXNAMELEN); @@ -628,12 +633,12 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, dmu_objset_name(zsb->z_os, dsname); error = zfs_secpolicy_snapshot_perms(dsname, cr); - if (error) + if (error != 0) goto out; if (error == 0) { error = dmu_objset_snapshot_one(dsname, dirname); - if (error) + if (error != 0) goto out; error = zfsctl_snapdir_lookup(dip, dirname, ipp, diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index e64d6a1f0..acc54e5a7 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -159,6 +159,7 @@ #include <sys/dsl_deleg.h> #include <sys/dmu_objset.h> #include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/sunldi.h> @@ -175,9 +176,11 @@ #include <sys/zvol.h> #include <sys/dsl_scan.h> #include <sharefs/share.h> -#include <sys/dmu_objset.h> #include <sys/fm/util.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_userhold.h> #include <sys/zfeature.h> #include <linux/miscdevice.h> @@ -242,11 +245,7 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); -static int zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature); -static int zfs_prop_activate_feature_check(void *arg1, void *arg2, - dmu_tx_t *tx); -static void zfs_prop_activate_feature_sync(void *arg1, void *arg2, - dmu_tx_t *tx); +static int zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature); static void history_str_free(char *buf) @@ -430,49 +429,48 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + if 
(dsl_prop_get_int_ds(ds, "zoned", &zoned)) return (ENOENT); - } - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int -zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) { int error; - dsl_dataset_t *ds; - - error = dsl_dataset_hold(name, FTAG, &ds); - if (error != 0) - return (error); error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); - if (error) + if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } - - dsl_dataset_rele(ds, FTAG); return (error); } static int -zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, - const char *perm, cred_t *cr) +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; + dsl_dataset_t *ds; + dsl_pool_t *dp; - error = zfs_dozonecheck_ds(name, ds, cr); - if (error == 0) { - error = secpolicy_zfs(cr); - if (error) - error = dsl_deleg_access_impl(ds, perm, cr); + error = dsl_pool_hold(name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); } + + error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -495,7 +493,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) /* First get the existing dataset label. 
*/ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error) + if (error != 0) return (EPERM); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) @@ -545,7 +543,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); - if (error) + if (error != 0) return (EPERM); dmu_objset_disown(os, setsl_tag); @@ -638,7 +636,7 @@ zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) int error; error = zfs_dozonecheck(zc->zc_name, cr); - if (error) + if (error != 0) return (error); /* @@ -660,7 +658,6 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - spa_t *spa; dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; @@ -673,23 +670,22 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (EINVAL); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error) + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - if (error) + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); + } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -820,12 +816,21 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (EINVAL); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { + dsl_pool_t *dp; dsl_dataset_t *ds; + error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp); + if (error != 0) + break; nextpair = nvlist_next_nvpair(snaps, pair); - error = 
dsl_dataset_hold(nvpair_name(pair), FTAG, &ds); - if (error == 0) { + error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds); + if (error == 0) dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + if (error == 0) { + error = zfs_secpolicy_destroy_perms(nvpair_name(pair), + cr); } else if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider @@ -837,11 +842,7 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) */ fnvlist_remove_nvpair(snaps, pair); error = 0; - continue; - } else { - break; } - error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error != 0) break; } @@ -889,41 +890,47 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[MAXNAMELEN]; - objset_t *clone; + dsl_pool_t *dp; + dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); - if (error) + if (error != 0) + return (error); + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - error = dmu_objset_hold(zc->zc_name, FTAG, &clone); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { - dsl_dataset_t *pclone = NULL; + char parentname[MAXNAMELEN]; + dsl_dataset_t *origin = NULL; dsl_dir_t *dd; - dd = clone->os_dsl_dataset->ds_dir; + dd = clone->ds_dir; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &pclone); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (error) { - dmu_objset_rele(clone, FTAG); + dd->dd_phys->dd_origin_obj, FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(clone, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } - error = zfs_secpolicy_write_perms(zc->zc_name, + error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); - dsl_dataset_name(pclone, parentname); - dmu_objset_rele(clone, 
FTAG); - dsl_dataset_rele(pclone, FTAG); - if (error == 0) - error = zfs_secpolicy_write_perms(parentname, + dsl_dataset_name(origin, parentname); + if (error == 0) { + error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); + } + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(origin, FTAG); } + dsl_pool_rele(dp, FTAG); return (error); } @@ -1132,16 +1139,47 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_HOLD, cr)); + nvpair_t *pair; + nvlist_t *holds; + int error; + + error = nvlist_lookup_nvlist(innvl, "holds", &holds); + if (error != 0) + return (EINVAL); + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + char fsname[MAXNAMELEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_HOLD, cr); + if (error != 0) + return (error); + } + return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RELEASE, cr)); + nvpair_t *pair; + int error; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(innvl, pair)) { + char fsname[MAXNAMELEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_RELEASE, cr); + if (error != 0) + return (error); + } + return (0); } /* @@ -1162,11 +1200,11 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); - if (!error) + if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); - if (!error) + if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); - if 
(!error) + if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); return (error); } @@ -1276,7 +1314,7 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) int error; error = dmu_objset_hold(dsname, FTAG, &os); - if (error) + if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); @@ -1379,7 +1417,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); - if (error) + if (error != 0) goto pool_props_bad; } @@ -1652,12 +1690,7 @@ zfs_ioc_pool_reguid(zfs_cmd_t *zc) static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { - int error; - - if ((error = dsl_dsobj_to_dsname(zc->zc_name,zc->zc_obj,zc->zc_value))) - return (error); - - return (0); + return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* @@ -1974,15 +2007,14 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { - objset_t *os = NULL; + objset_t *os; int error; - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) - return (error); - - error = zfs_ioc_objset_stats_impl(zc, os); - - dmu_objset_rele(os, FTAG); + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, os); + dmu_objset_rele(os, FTAG); + } return (error); } @@ -2003,30 +2035,23 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { - objset_t *os = NULL; - int error; + int error = 0; nvlist_t *nv; - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) - return (error); - /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. 
*/ - if (!dsl_prop_get_hasrecvd(os)) { - dmu_objset_rele(os, FTAG); + if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (ENOTSUP); - } if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_received(os, &nv)) == 0) { + (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } - dmu_objset_rele(os, FTAG); return (error); } @@ -2141,20 +2166,6 @@ top: (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); - /* - * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 - * but is not declared void because its called by dmu_objset_find(). - */ - if (zc->zc_cookie == 0) { - uint64_t cookie = 0; - int len = sizeof (zc->zc_name) - (p - zc->zc_name); - - while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { - if (!dataset_name_hidden(zc->zc_name)) - (void) dmu_objset_prefetch(zc->zc_name, NULL); - } - } - do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, @@ -2197,14 +2208,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *os; int error; -top: - if (zc->zc_cookie == 0 && !zc->zc_simple) - (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) { return (error == ENOENT ? ESRCH : error); + } /* * A dataset name of maximum length cannot have any snapshots, @@ -2224,24 +2231,8 @@ top: dsl_dataset_t *ds; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; - /* - * Since we probably don't have a hold on this snapshot, - * it's possible that the objsetid could have been destroyed - * and reused for a new objset. It's OK if this happens during - * a zfs send operation, since the new createtxg will be - * beyond the range we're interested in. 
- */ - rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) { - if (error == ENOENT) { - /* Racing with destroy, get the next one. */ - *strchr(zc->zc_name, '@') = '\0'; - dmu_objset_rele(os, FTAG); - goto top; - } - } else { + if (error == 0) { objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); @@ -2255,7 +2246,7 @@ top: dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ - if (error) + if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } @@ -2345,13 +2336,13 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: - err = dsl_dataset_set_quota(dsname, source, intval); + err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: - err = dsl_dataset_set_reservation(dsname, source, intval); + err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); @@ -2386,19 +2377,16 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, zfeature_info_t *feature = &spa_feature_table[SPA_FEATURE_LZ4_COMPRESS]; spa_t *spa; - dsl_pool_t *dp; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); - dp = spa->spa_dsl_pool; - /* * Setting the LZ4 compression algorithm activates * the feature. 
*/ if (!spa_feature_is_active(spa, feature)) { - if ((err = zfs_prop_activate_feature(dp, + if ((err = zfs_prop_activate_feature(spa, feature)) != 0) { spa_close(spa, FTAG); return (err); @@ -2557,12 +2545,12 @@ retry: if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); - err = dsl_prop_set(dsname, propname, source, 1, - strlen(strval) + 1, strval); + err = dsl_prop_set_string(dsname, propname, + source, strval); } else { intval = fnvpair_value_uint64(propval); - err = dsl_prop_set(dsname, propname, source, 8, - 1, &intval); + err = dsl_prop_set_int(dsname, propname, source, + intval); } if (err != 0) { @@ -2628,7 +2616,7 @@ props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) } static int -clear_received_props(objset_t *os, const char *fs, nvlist_t *props, +clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; @@ -2640,8 +2628,8 @@ clear_received_props(objset_t *os, const char *fs, nvlist_t *props, * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | - (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); - err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); + (dsl_prop_get_hasrecvd(dsname) ? 
ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); @@ -2673,22 +2661,19 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) if (received) { nvlist_t *origprops; - objset_t *os; - - if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { - if (dsl_prop_get_received(os, &origprops) == 0) { - (void) clear_received_props(os, - zc->zc_name, origprops, nvl); - nvlist_free(origprops); - } - dsl_prop_set_hasrecvd(os); - dmu_objset_rele(os, FTAG); + if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { + (void) clear_received_props(zc->zc_name, + origprops, nvl); + nvlist_free(origprops); } + + error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); - error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); + if (error == 0) + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); @@ -2771,7 +2756,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) } /* property name has been validated by zfs_secpolicy_inherit_prop() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); + return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int @@ -2907,7 +2892,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) */ error = secpolicy_zfs(CRED()); - if (error) { + if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); @@ -3214,7 +3199,7 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) - (void) dmu_objset_destroy(fsname, B_FALSE); + (void) dsl_destroy_head(fsname); } return (error); } @@ -3234,7 +3219,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) int error = 0; nvlist_t *nvprops = NULL; char *origin_name; - dsl_dataset_t *origin; if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) return (EINVAL); @@ 
-3246,14 +3230,8 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (EINVAL); - - error = dsl_dataset_hold(origin_name, FTAG, &origin); - if (error) - return (error); - - error = dmu_objset_clone(fsname, origin, 0); - dsl_dataset_rele(origin, FTAG); - if (error) + error = dmu_objset_clone(fsname, origin_name); + if (error != 0) return (error); /* @@ -3263,7 +3241,7 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) - (void) dmu_objset_destroy(fsname, B_FALSE); + (void) dsl_destroy_head(fsname); } return (error); } @@ -3275,7 +3253,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * } * * outnvl: snapshot -> error code (int32) - * */ static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3325,7 +3302,7 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) } } - error = dmu_objset_snapshot(snaps, props, outnvl); + error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } @@ -3371,43 +3348,71 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) } /* - * inputs: - * name dataset name, or when 'arg == NULL' the full snapshot name - * arg short snapshot name (i.e. part after the '@') + * The dp_config_rwlock must not be held when calling this, because the + * unmount may need to write out data. + * + * This function is best-effort. Callers must deal gracefully if it + * remains mounted (or is remounted after this call). 
*/ -/* ARGSUSED */ -int -zfs_unmount_snap(const char *name, void *arg) +void +zfs_unmount_snap(const char *snapname) { zfs_sb_t *zsb = NULL; char *dsname; - char *snapname; char *fullname; char *ptr; - int error; - if ((ptr = strchr(name, '@')) == NULL) - return (0); + if ((ptr = strchr(snapname, '@')) == NULL) + return; - dsname = strdup(name); - dsname[ptr - name] = '\0'; + dsname = strdup(snapname); + dsname[ptr - snapname] = '\0'; snapname = strdup(ptr + 1); fullname = kmem_asprintf("%s@%s", dsname, snapname); - error = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE); - if (error == 0) { - error = zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); + if (zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE) == 0) { + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zsb->z_os))); + (void) zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); zfs_sb_rele(zsb, FTAG); - - /* Allow ENOENT for consistency with upstream */ - if (error == ENOENT) - error = 0; } strfree(dsname); - strfree(snapname); strfree(fullname); - return (error); + return; +} + +/* ARGSUSED */ +static int +zfs_unmount_snap_cb(const char *snapname, void *arg) +{ + zfs_unmount_snap(snapname); + return (0); +} + +/* + * When a clone is destroyed, its origin may also need to be destroyed, + * in which case it must be unmounted. This routine will do that unmount + * if necessary. 
+ */ +void +zfs_destroy_unmount_origin(const char *fsname) +{ + int error; + objset_t *os; + dsl_dataset_t *ds; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) + return; + ds = dmu_objset_ds(os); + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { + char originname[MAXNAMELEN]; + dsl_dataset_name(ds->ds_prev, originname); + dmu_objset_rele(os, FTAG); + zfs_unmount_snap(originname); + } else { + dmu_objset_rele(os, FTAG); + } } /* @@ -3442,15 +3447,11 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) (name[poollen] != '/' && name[poollen] != '@')) return (EXDEV); - /* - * Ignore failures to unmount; dmu_snapshots_destroy_nvl() - * will deal with this gracefully (by filling in outnvl). - */ - (void) zfs_unmount_snap(name, NULL); + zfs_unmount_snap(name); (void) zvol_remove_minor(name); } - return (dmu_snapshots_destroy_nvl(snaps, defer, outnvl)); + return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* @@ -3465,13 +3466,13 @@ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } + if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) + zfs_unmount_snap(zc->zc_name); - err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); + if (strchr(zc->zc_name, '@')) + err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); + else + err = dsl_destroy_head(zc->zc_name); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) (void) zvol_remove_minor(zc->zc_name); return (err); @@ -3486,79 +3487,35 @@ zfs_ioc_destroy(zfs_cmd_t *zc) static int zfs_ioc_rollback(zfs_cmd_t *zc) { - dsl_dataset_t *ds, *clone; - int error; zfs_sb_t *zsb; - char *clone_name; - - error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); - if (error) - return (error); - - /* must not be a snapshot */ - if (dsl_dataset_is_snapshot(ds)) { - dsl_dataset_rele(ds, FTAG); - 
return (EINVAL); - } - - /* must have a most recent snapshot */ - if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); - } - - /* - * Create clone of most recent snapshot. - */ - clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); - error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); - if (error) - goto out; - - error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); - if (error) - goto out; + int error; - /* - * Do clone swap. - */ if (get_zfs_sb(zc->zc_name, &zsb) == 0) { error = zfs_suspend_fs(zsb); if (error == 0) { int resume_err; - if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { - error = dsl_dataset_clone_swap(clone, ds, - B_TRUE); - dsl_dataset_disown(ds, FTAG); - ds = NULL; - } else { - error = EBUSY; - } + error = dsl_dataset_rollback(zc->zc_name); resume_err = zfs_resume_fs(zsb, zc->zc_name); error = error ? error : resume_err; } deactivate_super(zsb->z_sb); } else { - if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { - error = dsl_dataset_clone_swap(clone, ds, B_TRUE); - dsl_dataset_disown(ds, FTAG); - ds = NULL; - } else { - error = EBUSY; - } + error = dsl_dataset_rollback(zc->zc_name); } + return (error); +} - /* - * Destroy clone (which also closes it). 
- */ - (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); +static int +recursive_unmount(const char *fsname, void *arg) +{ + const char *snapname = arg; + char *fullname; -out: - strfree(clone_name); - if (ds) - dsl_dataset_rele(ds, FTAG); - return (error); + fullname = kmem_asprintf("%s@%s", fsname, snapname); + zfs_unmount_snap(fullname); + strfree(fullname); + return (0); } /* @@ -3573,6 +3530,7 @@ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; + char *at; int err; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; @@ -3580,25 +3538,29 @@ zfs_ioc_rename(zfs_cmd_t *zc) strchr(zc->zc_value, '%')) return (EINVAL); - /* - * Unmount snapshot unless we're doing a recursive rename, - * in which case the dataset code figures out which snapshots - * to unmount. - */ - if (!recursive && strchr(zc->zc_name, '@') != NULL && - zc->zc_objset_type == DMU_OST_ZFS) { - err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } - - err = dmu_objset_rename(zc->zc_name, zc->zc_value, recursive); - if ((err == 0) && (zc->zc_objset_type == DMU_OST_ZVOL)) { - (void) zvol_remove_minor(zc->zc_name); - (void) zvol_create_minor(zc->zc_value); + at = strchr(zc->zc_name, '@'); + if (at != NULL) { + /* snaps must be in same fs */ + if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) + return (EXDEV); + *at = '\0'; + if (zc->zc_objset_type == DMU_OST_ZFS) { + int error = dmu_objset_find(zc->zc_name, + recursive_unmount, at + 1, + recursive ? 
DS_FIND_CHILDREN : 0); + if (error != 0) + return (error); + } + return (dsl_dataset_rename_snapshot(zc->zc_name, + at + 1, strchr(zc->zc_value, '@') + 1, recursive)); + } else { + err = dsl_dir_rename(zc->zc_name, zc->zc_value); + if (!err && zc->zc_objset_type == DMU_OST_ZVOL) { + (void) zvol_remove_minor(zc->zc_name); + (void) zvol_create_minor(zc->zc_value); + } + return (err); } - - return (err); } static int @@ -3744,35 +3706,14 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } /* - * Activates a feature on a pool in response to a property setting. This - * creates a new sync task which modifies the pool to reflect the feature - * as being active. - */ -static int -zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature) -{ - int err; - - /* EBUSY here indicates that the feature is already active */ - err = dsl_sync_task_do(dp, zfs_prop_activate_feature_check, - zfs_prop_activate_feature_sync, dp->dp_spa, feature, 2); - - if (err != 0 && err != EBUSY) - return (err); - else - return (0); -} - -/* * Checks for a race condition to make sure we don't increment a feature flag * multiple times. */ -/*ARGSUSED*/ static int -zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) +zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - zfeature_info_t *feature = arg2; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; if (!spa_feature_is_active(spa, feature)) return (0); @@ -3785,15 +3726,36 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) * zfs_prop_activate_feature. */ static void -zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx) +zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - zfeature_info_t *feature = arg2; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; spa_feature_incr(spa, feature, tx); } /* + * Activates a feature on a pool in response to a property setting. 
This + * creates a new sync task which modifies the pool to reflect the feature + * as being active. + */ +static int +zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature) +{ + int err; + + /* EBUSY here indicates that the feature is already active */ + err = dsl_sync_task(spa_name(spa), + zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, + feature, 2); + + if (err != 0 && err != EBUSY) + return (err); + else + return (0); +} + +/* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. @@ -3947,7 +3909,6 @@ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; - objset_t *os; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; @@ -3957,7 +3918,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ - objset_t *origin = NULL; + char *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; boolean_t first_recvd_props = B_FALSE; @@ -3985,18 +3946,31 @@ zfs_ioc_recv(zfs_cmd_t *zc) VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { - if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && - !dsl_prop_get_hasrecvd(os)) { + if (zc->zc_string[0]) + origin = zc->zc_string; + + error = dmu_recv_begin(tofs, tosnap, + &zc->zc_begin_record, force, origin, &drc); + if (error != 0) + goto out; + + /* + * Set properties before we receive the stream so that they are applied + * to the new data. Note that we must call dmu_recv_stream() if + * dmu_recv_begin() succeeds. 
+ */ + if (props != NULL && !drc.drc_newfs) { + if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= + SPA_VERSION_RECVD_PROPS && + !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; - } /* * If new received properties are supplied, they are to * completely replace the existing received properties, so stash * away the existing ones. */ - if (dsl_prop_get_received(os, &origprops) == 0) { + if (dsl_prop_get_received(tofs, &origprops) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't @@ -4008,53 +3982,25 @@ zfs_ioc_recv(zfs_cmd_t *zc) */ if (!first_recvd_props) props_reduce(props, origprops); - if (zfs_check_clearable(tofs, origprops, - &errlist) != 0) + if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); - } - dmu_objset_rele(os, FTAG); - } - - if (zc->zc_string[0]) { - error = dmu_objset_hold(zc->zc_string, FTAG, &origin); - if (error) - goto out; - } - - error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, - &zc->zc_begin_record, force, origin, &drc); - if (origin) - dmu_objset_rele(origin, FTAG); - if (error) - goto out; - - /* - * Set properties before we receive the stream so that they are applied - * to the new data. Note that we must call dmu_recv_stream() if - * dmu_recv_begin() succeeds. - */ - if (props) { - if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { - if (drc.drc_newfs) { - if (spa_version(os->os_spa) >= - SPA_VERSION_RECVD_PROPS) - first_recvd_props = B_TRUE; - } else if (origprops != NULL) { - if (clear_received_props(os, tofs, origprops, - first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } else { + if (clear_received_props(tofs, origprops, + first_recvd_props ? 
NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } - dsl_prop_set_hasrecvd(os); - } else if (!drc.drc_newfs) { + } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } + } + + if (props != NULL) { + props_error = dsl_prop_set_hasrecvd(tofs); - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - props, errors); + if (props_error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, errors); + } } if (zc->zc_nvlist_dst_size != 0 && @@ -4106,22 +4052,16 @@ zfs_ioc_recv(zfs_cmd_t *zc) /* * On error, restore the original props. */ - if (error && props) { - if (dmu_objset_hold(tofs, FTAG, &os) == 0) { - if (clear_received_props(os, tofs, props, NULL) != 0) { - /* - * We failed to clear the received properties. - * Since we may have left a $recvd value on the - * system, we can't clear the $hasrecvd flag. - */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } else if (first_recvd_props) { - dsl_prop_unset_hasrecvd(os); - } - dmu_objset_rele(os, FTAG); - } else if (!drc.drc_newfs) { - /* We failed to clear the received properties. */ + if (error != 0 && props != NULL && !drc.drc_newfs) { + if (clear_received_props(tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. 
+ */ zc->zc_obj |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { @@ -4173,100 +4113,75 @@ out: static int zfs_ioc_send(zfs_cmd_t *zc) { - objset_t *fromsnap = NULL; - objset_t *tosnap; int error; offset_t off; - dsl_dataset_t *ds; - dsl_dataset_t *dsfrom = NULL; - spa_t *spa; - dsl_pool_t *dp; boolean_t estimate = (zc->zc_guid != 0); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error) - return (error); + if (zc->zc_obj != 0) { + dsl_pool_t *dp; + dsl_dataset_t *tosnap; - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - if (error) - return (error); - - error = dmu_objset_from_ds(ds, &tosnap); - if (error) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (zc->zc_fromobj != 0) { - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); - rw_exit(&dp->dp_config_rwlock); - if (error) { - dsl_dataset_rele(ds, FTAG); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - } - error = dmu_objset_from_ds(dsfrom, &fromsnap); - if (error) { - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); + + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); } + + if (dsl_dir_is_clone(tosnap->ds_dir)) + zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj; + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); } - if (zc->zc_obj) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; + if (estimate) { + dsl_pool_t *dp; + dsl_dataset_t *tosnap; + dsl_dataset_t *fromsnap = NULL; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); - if (fromsnap != NULL) { - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); - return (EINVAL); + error = 
dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); } - if (dsl_dir_is_clone(ds->ds_dir)) { - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &dsfrom); - rw_exit(&dp->dp_config_rwlock); - if (error) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - error = dmu_objset_from_ds(dsfrom, &fromsnap); - if (error) { - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); + if (zc->zc_fromobj != 0) { + error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, + FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } } - } - if (estimate) { error = dmu_send_estimate(tosnap, fromsnap, &zc->zc_objset_type); + + if (fromsnap != NULL) + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); } else { file_t *fp = getf(zc->zc_cookie); - if (fp == NULL) { - dsl_dataset_rele(ds, FTAG); - if (dsfrom) - dsl_dataset_rele(dsfrom, FTAG); + if (fp == NULL) return (EBADF); - } off = fp->f_offset; - error = dmu_send(tosnap, fromsnap, - zc->zc_cookie, fp->f_vnode, &off); + error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, + zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); } - if (dsfrom) - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); return (error); } @@ -4281,13 +4196,21 @@ zfs_ioc_send(zfs_cmd_t *zc) static int zfs_ioc_send_progress(zfs_cmd_t *zc) { + dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendarg_t *dsp = NULL; int error; - if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0) + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + 
mutex_enter(&ds->ds_sendstream_lock); /* @@ -4311,6 +4234,7 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -4417,7 +4341,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) } } - if (error) + if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); @@ -4455,7 +4379,7 @@ zfs_ioc_pool_reopen(zfs_cmd_t *zc) int error; error = spa_open(zc->zc_name, &spa, FTAG); - if (error) + if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); @@ -4495,7 +4419,7 @@ zfs_ioc_promote(zfs_cmd_t *zc) if (cp) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, - zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); + zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } @@ -4521,7 +4445,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) return (EINVAL); error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); - if (error) + if (error != 0) return (error); error = zfs_userspace_one(zsb, @@ -4554,7 +4478,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) return (ENOMEM); error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); - if (error) + if (error != 0) return (error); buf = vmem_alloc(bufsize, KM_SLEEP); @@ -4604,7 +4528,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); @@ -4639,7 +4563,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, @@ -4662,25 +4586,26 @@ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; + char *hold_name; int error; + minor_t minor; - snap_name = kmem_asprintf("%s@%s-%016llx", zc->zc_name, zc->zc_value, - (u_longlong_t)ddi_get_lbolt64()); - - if (strlen(snap_name) >= MAXPATHLEN) { - 
strfree(snap_name); - return (E2BIG); - } - - error = dmu_objset_snapshot_tmp(snap_name, "%temp", zc->zc_cleanup_fd); - if (error != 0) { - strfree(snap_name); + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error != 0) return (error); - } - (void) strcpy(zc->zc_value, strchr(snap_name, '@') + 1); + snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + hold_name = kmem_asprintf("%%%s", zc->zc_value); + + error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + hold_name); + if (error == 0) + (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); - return (0); + strfree(hold_name); + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + return (error); } /* @@ -4695,39 +4620,22 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) static int zfs_ioc_diff(zfs_cmd_t *zc) { - objset_t *fromsnap; - objset_t *tosnap; file_t *fp; offset_t off; int error; - error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); - if (error) - return (error); - - error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); - if (error) { - dmu_objset_rele(tosnap, FTAG); - return (error); - } - fp = getf(zc->zc_cookie); - if (fp == NULL) { - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); + if (fp == NULL) return (EBADF); - } off = fp->f_offset; - error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); + error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); return (error); } @@ -4799,13 +4707,13 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zsb, tx); dmu_tx_commit(tx); } - if (error) { + if (error != 0) { mutex_exit(&zsb->z_lock); VN_RELE(vp); ZFS_EXIT(zsb); @@ -4886,124 +4794,82 @@ 
zfs_ioc_smb_acl(zfs_cmd_t *zc) } /* - * inputs: - * zc_name name of filesystem - * zc_value short name of snap - * zc_string user-supplied tag for this hold - * zc_cookie recursive flag - * zc_temphold set if hold is temporary - * zc_cleanup_fd cleanup-on-exit file descriptor for calling process - * zc_sendobj if non-zero, the objid for zc_name@zc_value - * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg + * innvl: { + * "holds" -> { snapname -> holdname (string), ... } + * (optional) "cleanup_fd" -> fd (int32) + * } * - * outputs: none + * outnvl: { + * snapname -> error value (int32) + * ... + * } */ +/* ARGSUSED */ static int -zfs_ioc_hold(zfs_cmd_t *zc) +zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { - boolean_t recursive = zc->zc_cookie; - spa_t *spa; - dsl_pool_t *dp; - dsl_dataset_t *ds; + nvlist_t *holds; + int cleanup_fd = -1; int error; minor_t minor = 0; - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - - if (zc->zc_sendobj == 0) { - return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, - zc->zc_string, recursive, zc->zc_temphold, - zc->zc_cleanup_fd)); - } - - if (recursive) + error = nvlist_lookup_nvlist(args, "holds", &holds); + if (error != 0) return (EINVAL); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error) - return (error); - - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - if (error) - return (error); - - /* - * Until we have a hold on this snapshot, it's possible that - * zc_sendobj could've been destroyed and reused as part - * of a later txg. Make sure we're looking at the right object. 
- */ - if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { - dsl_dataset_rele(ds, FTAG); - return (ENOENT); - } - - if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { - error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); - if (error) { - dsl_dataset_rele(ds, FTAG); + if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error != 0) return (error); - } - } - - error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, - zc->zc_temphold); - if (minor != 0) { - if (error == 0) { - dsl_register_onexit_hold_cleanup(ds, zc->zc_string, - minor); - } - zfs_onexit_fd_rele(zc->zc_cleanup_fd); } - dsl_dataset_rele(ds, FTAG); + error = dsl_dataset_user_hold(holds, minor, errlist); + if (minor != 0) + zfs_onexit_fd_rele(cleanup_fd); return (error); } /* - * inputs: - * zc_name name of dataset from which we're releasing a user hold - * zc_value short name of snap - * zc_string user-supplied tag for this hold - * zc_cookie recursive flag + * innvl is not used. * - * outputs: none + * outnvl: { + * holdname -> time added (uint64 seconds since epoch) + * ... + * } */ +/* ARGSUSED */ static int -zfs_ioc_release(zfs_cmd_t *zc) +zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { - boolean_t recursive = zc->zc_cookie; - - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - - return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, - zc->zc_string, recursive)); + return (dsl_dataset_get_holds(snapname, outnvl)); } /* - * inputs: - * zc_name name of filesystem + * innvl: { + * snapname -> { holdname, ... } + * ... + * } * - * outputs: - * zc_nvlist_src{_size} nvlist of snapshot holds + * outnvl: { + * snapname -> error value (int32) + * ... 
+ * } */ +/* ARGSUSED */ static int -zfs_ioc_get_holds(zfs_cmd_t *zc) +zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { - nvlist_t *nvp; - int error; + nvpair_t *pair; - if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { - error = put_nvlist(zc, nvp); - nvlist_free(nvp); - } + /* + * The release may cause the snapshot to be destroyed; make sure it + * is not mounted. + */ + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) + zfs_unmount_snap(nvpair_name(pair)); - return (error); + return (dsl_dataset_user_release(holds, errlist)); } /* @@ -5044,7 +4910,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc) break; error = zfs_zevent_wait(ze); - if (error) + if (error != 0) break; } while (1); @@ -5082,14 +4948,21 @@ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; + dsl_pool_t *dp; dsl_dataset_t *new, *old; - error = dsl_dataset_hold(zc->zc_name, FTAG, &new); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); - error = dsl_dataset_hold(zc->zc_value, FTAG, &old); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -5097,6 +4970,7 @@ zfs_ioc_space_written(zfs_cmd_t *zc) &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -5115,6 +4989,7 @@ static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; + dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; @@ -5122,18 +4997,26 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) return (EINVAL); - error = 
dsl_dataset_hold(lastsnap, FTAG, &new); + error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); - error = dsl_dataset_hold(firstsnap, FTAG, &old); + + error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); @@ -5152,49 +5035,28 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { - objset_t *fromsnap = NULL; - objset_t *tosnap; int error; offset_t off; - char *fromname; + char *fromname = NULL; int fd; + file_t *fp; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) return (EINVAL); - error = dmu_objset_hold(snapname, FTAG, &tosnap); - if (error) - return (error); + (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); - error = nvlist_lookup_string(innvl, "fromsnap", &fromname); - if (error == 0) { - error = dmu_objset_hold(fromname, FTAG, &fromsnap); - if (error) { - dmu_objset_rele(tosnap, FTAG); - return (error); - } - } - - { - file_t *fp = getf(fd); - if (fp == NULL) { - dmu_objset_rele(tosnap, FTAG); - if (fromsnap != NULL) - dmu_objset_rele(fromsnap, FTAG); + if ((fp = getf(fd)) == NULL) return (EBADF); - } off = fp->f_offset; - error = dmu_send(tosnap, fromsnap, fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; - } + releasef(fd); - if (fromsnap != NULL) - dmu_objset_rele(fromsnap, FTAG); - 
dmu_objset_rele(tosnap, FTAG); return (error); } @@ -5213,21 +5075,29 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { - objset_t *fromsnap = NULL; - objset_t *tosnap; + dsl_pool_t *dp; + dsl_dataset_t *fromsnap = NULL; + dsl_dataset_t *tosnap; int error; char *fromname; uint64_t space; - error = dmu_objset_hold(snapname, FTAG, &tosnap); - if (error) + error = dsl_pool_hold(snapname, FTAG, &dp); + if (error != 0) return (error); + error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = nvlist_lookup_string(innvl, "fromsnap", &fromname); if (error == 0) { - error = dmu_objset_hold(fromname, FTAG, &fromsnap); - if (error) { - dmu_objset_rele(tosnap, FTAG); + error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } } @@ -5236,8 +5106,9 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) fnvlist_add_uint64(outnvl, "space", space); if (fromsnap != NULL) - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -5382,6 +5253,17 @@ zfs_ioctl_init(void) zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("hold", ZFS_IOC_HOLD, + zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("release", ZFS_IOC_RELEASE, + zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, + zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, 
B_FALSE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -5459,8 +5341,6 @@ zfs_ioctl_init(void) zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); - zfs_ioctl_register_dataset_read(ZFS_IOC_GET_HOLDS, - zfs_ioc_get_holds); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, @@ -5503,10 +5383,6 @@ zfs_ioctl_init(void) zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_HOLD, zfs_ioc_hold, - zfs_secpolicy_hold); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RELEASE, zfs_ioc_release, - zfs_secpolicy_release); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, @@ -5866,7 +5742,7 @@ zfs_attach(void) offsetof(zfsdev_state_t, zs_next)); error = misc_register(&zfs_misc); - if (error) { + if (error != 0) { printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); return (error); } @@ -5880,7 +5756,7 @@ zfs_detach(void) int error; error = misc_deregister(&zfs_misc); - if (error) + if (error != 0) printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error); mutex_destroy(&zfsdev_state_lock); diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 8fee441b1..eeac0391c 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -248,28 +248,31 @@ zfs_register_callbacks(zfs_sb_t *zsb) * overboard... */ ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, - "atime", atime_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); error = error ? 
error : dsl_prop_register(ds, - "xattr", xattr_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "recordsize", blksz_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "readonly", readonly_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "devices", devices_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "setuid", setuid_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "exec", exec_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "snapdir", snapdir_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "aclinherit", acl_inherit_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zsb); error = error ? error : dsl_prop_register(ds, - "vscan", vscan_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, - "nbmand", nbmand_changed_cb, zsb); + zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; @@ -284,18 +287,28 @@ unregister: * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. 
*/ - (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, - zsb); - (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb); - (void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), + atime_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), + xattr_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + blksz_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), + readonly_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), + devices_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), + setuid_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), + exec_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), + snapdir_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), + acl_inherit_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), + vscan_changed_cb, zsb); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), + nbmand_changed_cb, zsb); return (error); } @@ -305,8 +318,6 @@ static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { - int error = 0; - /* * Is it a 
valid type of object to track? */ @@ -363,7 +374,7 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, *groupp = BSWAP_64(*groupp); } } - return (error); + return (0); } static void @@ -726,7 +737,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zsb->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rrw_init(&zsb->z_teardown_lock); + rrw_init(&zsb->z_teardown_lock, B_FALSE); rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) @@ -1138,7 +1149,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) if (dsl_dataset_is_dirty(dmu_objset_ds(zsb->z_os)) && !zfs_is_readonly(zsb)) txg_wait_synced(dmu_objset_pool(zsb->z_os), 0); - (void) dmu_objset_evict_dbufs(zsb->z_os); + dmu_objset_evict_dbufs(zsb->z_os); return (0); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index c1796937b..d59c92c09 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -257,7 +257,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + VERIFY(arc_buf_remove_ref(abuf, &abuf)); } return (error); @@ -356,7 +356,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); - if (error) + if (error != 0) break; for (lrp = lrbuf; lrp < end; lrp += reclen) { @@ -492,7 +492,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) if (dsl_dataset_is_snapshot(ds)) panic("dirtying snapshot!"); - if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) { + if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); } @@ -658,8 +658,8 @@ zil_claim(const char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_hold(osname, FTAG, &os); - if (error) { + 
error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error != 0) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -672,7 +672,7 @@ zil_claim(const char *osname, void *txarg) zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); BP_ZERO(&zh->zh_log); dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_objset_rele(os, FTAG); + dmu_objset_disown(os, FTAG); return (0); } @@ -697,7 +697,7 @@ zil_claim(const char *osname, void *txarg) } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_rele(os, FTAG); + dmu_objset_disown(os, FTAG); return (0); } @@ -717,7 +717,7 @@ zil_check_log_chain(const char *osname, void *tx) ASSERT(tx == NULL); error = dmu_objset_hold(osname, FTAG, &os); - if (error) { + if (error != 0) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -1014,7 +1014,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) BP_ZERO(bp); use_slog = USE_SLOG(zilog); - error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog)); + error = zio_alloc_zil(spa, txg, bp, zil_blksz, + USE_SLOG(zilog)); if (use_slog) { ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); @@ -1025,7 +1026,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused); } - if (!error) { + if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; @@ -1145,7 +1146,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } - if (error) { + if (error != 0) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); return (lwb); @@ -1807,6 +1808,9 @@ zil_free(zilog_t *zilog) zilog->zl_stop_sync = 1; + ASSERT0(zilog->zl_suspend); + ASSERT0(zilog->zl_suspending); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); @@ -1905,32 +1909,100 @@ zil_close(zilog_t *zilog) 
mutex_exit(&zilog->zl_lock); } +static char *suspend_tag = "zil suspending"; + /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. - * We suspend the log briefly when taking a snapshot so that the snapshot - * contains all the data it's supposed to, and has an empty intent log. + * On old version pools, we suspend the log briefly when taking a + * snapshot so that it will have an empty intent log. + * + * Long holds are not really intended to be used the way we do here -- + * held for such a short time. A concurrent caller of dsl_dataset_long_held() + * could fail. Therefore we take pains to only put a long hold if it is + * actually necessary. Fortunately, it will only be necessary if the + * objset is currently mounted (or the ZVOL equivalent). In that case it + * will already have a long hold, so we are not really making things any worse. + * + * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or + * zvol_state_t), and use their mechanism to prevent their hold from being + * dropped (e.g. VFS_HOLD()). However, that would be even more pain for + * very little gain. + * + * if cookiep == NULL, this does both the suspend & resume. + * Otherwise, it returns with the dataset "long held", and the cookie + * should be passed into zil_resume(). */ int -zil_suspend(zilog_t *zilog) +zil_suspend(const char *osname, void **cookiep) { - const zil_header_t *zh = zilog->zl_header; + objset_t *os; + zilog_t *zilog; + const zil_header_t *zh; + int error; + + error = dmu_objset_hold(osname, suspend_tag, &os); + if (error != 0) + return (error); + zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); + zh = zilog->zl_header; + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); return (EBUSY); } - if (zilog->zl_suspend++ != 0) { + + /* + * Don't put a long hold in the cases where we can avoid it. 
This + * is when there is no cookie so we are doing a suspend & resume + * (i.e. called from zil_vdev_offline()), and there's nothing to do + * for the suspend because it's already suspended, or there's no ZIL. + */ + if (cookiep == NULL && !zilog->zl_suspending && + (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (0); + } + + dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); + dsl_pool_rele(dmu_objset_pool(os), suspend_tag); + + zilog->zl_suspend++; + + if (zilog->zl_suspend > 1) { /* - * Someone else already began a suspend. + * Someone else is already suspending it. * Just wait for them to finish. */ + while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); + + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; + return (0); + } + + /* + * If there is no pointer to an on-disk block, this ZIL must not + * be active (e.g. filesystem not mounted), so there's nothing + * to clean up. 
+ */ + if (BP_IS_HOLE(&zh->zh_log)) { + ASSERT(cookiep != NULL); /* fast path already handled */ + + *cookiep = os; + mutex_exit(&zilog->zl_lock); return (0); } + zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); @@ -1943,16 +2015,25 @@ zil_suspend(zilog_t *zilog) cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; return (0); } void -zil_resume(zilog_t *zilog) +zil_resume(void *cookie) { + objset_t *os = cookie; + zilog_t *zilog = dmu_objset_zil(os); + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); + dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { @@ -2025,7 +2106,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); - if (error) + if (error != 0) return (zil_replay_error(zilog, lr, error)); } @@ -2046,7 +2127,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * is updated if we are in replay mode. 
*/ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); - if (error) { + if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with @@ -2056,7 +2137,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); - if (error) + if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); @@ -2128,21 +2209,12 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx) int zil_vdev_offline(const char *osname, void *arg) { - objset_t *os; - zilog_t *zilog; int error; - error = dmu_objset_hold(osname, FTAG, &os); - if (error) - return (error); - - zilog = dmu_objset_zil(os); - if (zil_suspend(zilog) != 0) - error = EEXIST; - else - zil_resume(zilog); - dmu_objset_rele(os, FTAG); - return (error); + error = zil_suspend(osname, NULL); + if (error != 0) + return (EEXIST); + return (0); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ccefaf8ac..0e2b463ac 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -767,6 +767,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies) void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { + metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } @@ -785,6 +786,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, arc_freed(spa, bp); + metaslab_check_free(spa, bp); + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -2060,7 +2063,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = EEXIST; - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + VERIFY(arc_buf_remove_ref(abuf, &abuf)); } 
ddt_enter(ddt); @@ -2656,8 +2659,9 @@ zio_vdev_io_assess(zio_t *zio) * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && - vd != NULL && !vd->vdev_ops->vdev_op_leaf) + vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index b51615637..f52d8bbc1 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -315,6 +315,13 @@ zvol_set_volsize(const char *name, uint64_t volsize) uint64_t readonly; int error; + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); + if (error != 0) + return (error); + if (readonly) + return (EROFS); + mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); @@ -1459,8 +1466,7 @@ zvol_remove_minor(const char *name) } static int -zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, - const char *dsname, void *arg) +zvol_create_minors_cb(const char *dsname, void *arg) { if (strchr(dsname, '/') == NULL) return 0; @@ -1474,7 +1480,7 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, * for all available pools. */ int -zvol_create_minors(const char *pool) +zvol_create_minors(char *pool) { spa_t *spa = NULL; int error = 0; @@ -1484,13 +1490,12 @@ zvol_create_minors(const char *pool) mutex_enter(&zvol_state_lock); if (pool) { - error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb, + error = dmu_objset_find(pool, zvol_create_minors_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); } else { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - error = dmu_objset_find_spa(NULL, - spa_name(spa), zvol_create_minors_cb, NULL, + error = dmu_objset_find(spa_name(spa), zvol_create_minors_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error) break; |