diff options
author | Paul Dagnelie <[email protected]> | 2019-06-19 09:48:13 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2019-06-19 09:48:12 -0700 |
commit | 30af21b02569ac192f52ce6e6511015f8a8d5729 (patch) | |
tree | e5f1091c2d3a6e511bbd2414782e490c18e0f59c /module/zfs/dsl_bookmark.c | |
parent | c1b5801bb5af0055e5f3d263beaa07026103e212 (diff) |
Implement Redacted Send/Receive
Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to not
transmit sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.
Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream. When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark. This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.
The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.
The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.
Reviewed-by: Dan Kimmel <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Prashanth Sreenivasa <[email protected]>
Reviewed-by: John Kennedy <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Reviewed-by: Chris Williamson <[email protected]>
Reviewed-by: Pavel Zhakarov <[email protected]>
Reviewed-by: Sebastien Roy <[email protected]>
Reviewed-by: Prakash Surya <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
Closes #7958
Diffstat (limited to 'module/zfs/dsl_bookmark.c')
-rw-r--r-- | module/zfs/dsl_bookmark.c | 1253 |
1 files changed, 1138 insertions, 115 deletions
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index a32198402..4da17488c 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -23,6 +23,7 @@ #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_destroy.h> #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> #include <sys/arc.h> @@ -31,6 +32,7 @@ #include <sys/spa.h> #include <sys/dsl_bookmark.h> #include <zfs_namecheck.h> +#include <sys/dmu_send.h> static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, @@ -54,13 +56,15 @@ dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, /* * Returns ESRCH if bookmark is not found. + * Note, we need to use the ZAP rather than the AVL to look up bookmarks + * by name, because only the ZAP honors the casesensitivity setting. */ -static int -dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, +int +dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, zfs_bookmark_phys_t *bmark_phys) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; int err; @@ -77,15 +81,16 @@ dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, bzero(bmark_phys, sizeof (*bmark_phys)); err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), - sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, - NULL, 0, NULL); + sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, + NULL); return (err == ENOENT ? ESRCH : err); } /* * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark - * does not represents an earlier point in later_ds's timeline. + * does not represents an earlier point in later_ds's timeline. However, + * bmp will still be filled in if we return EXDEV. * * Returns ENOENT if the dataset containing the bookmark does not exist. * Returns ESRCH if the dataset exists but the bookmark was not found in it. @@ -102,7 +107,7 @@ dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, if (error != 0) return (error); - error = dsl_dataset_bmark_lookup(ds, shortname, bmp); + error = dsl_bookmark_lookup_impl(ds, shortname, bmp); if (error == 0 && later_ds != NULL) { if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) error = SET_ERROR(EXDEV); @@ -111,6 +116,15 @@ dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, return (error); } +typedef struct dsl_bookmark_create_redacted_arg { + const char *dbcra_bmark; + const char *dbcra_snap; + redaction_list_t **dbcra_rl; + uint64_t dbcra_numsnaps; + uint64_t *dbcra_snaps; + void *dbcra_tag; +} dsl_bookmark_create_redacted_arg_t; + typedef struct dsl_bookmark_create_arg { nvlist_t *dbca_bmarks; nvlist_t *dbca_errors; @@ -124,7 +138,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, dsl_dataset_t *bmark_fs; char *shortname; int error; - zfs_bookmark_phys_t bmark_phys; + zfs_bookmark_phys_t bmark_phys = { 0 }; if (!snapds->ds_is_snapshot) return (SET_ERROR(EINVAL)); @@ -139,7 +153,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, return (SET_ERROR(EINVAL)); } - error = dsl_dataset_bmark_lookup(bmark_fs, shortname, + error = dsl_bookmark_lookup_impl(bmark_fs, shortname, &bmark_phys); dsl_dataset_rele(bmark_fs, FTAG); if (error == 0) @@ -182,77 +196,199 @@ dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) return (rv); } +static dsl_bookmark_node_t * +dsl_bookmark_node_alloc(char *shortname) +{ + dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP); + dbn->dbn_name = spa_strdup(shortname); + dbn->dbn_dirty = B_FALSE; + mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL); + return (dbn); +} + +/* + * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot. + */ static void -dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) +{ + spa_t *spa = dsl_dataset_get_spa(snap); + objset_t *mos = spa_get_dsl(spa)->dp_meta_objset; + dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); + zbm->zbm_guid = dsp->ds_guid; + zbm->zbm_creation_txg = dsp->ds_creation_txg; + zbm->zbm_creation_time = dsp->ds_creation_time; + zbm->zbm_redaction_obj = 0; + + /* + * If the dataset is encrypted create a larger bookmark to + * accommodate the IVset guid. The IVset guid was added + * after the encryption feature to prevent a problem with + * raw sends. If we encounter an encrypted dataset without + * an IVset guid we fall back to a normal bookmark. + */ + if (snap->ds_dir->dd_crypto_obj != 0 && + spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { + (void) zap_lookup(mos, snap->ds_object, + DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, + &zbm->zbm_ivset_guid); + } + + if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) { + zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN; + zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; + + dsl_dataset_t *nextds; + VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool, + dsp->ds_next_snap_obj, FTAG, &nextds)); + dsl_deadlist_space(&nextds->ds_deadlist, + &zbm->zbm_referenced_freed_before_next_snap, + &zbm->zbm_compressed_freed_before_next_snap, + &zbm->zbm_uncompressed_freed_before_next_snap); + dsl_dataset_rele(nextds, FTAG); + } else { + bzero(&zbm->zbm_flags, + sizeof (zfs_bookmark_phys_t) - + offsetof(zfs_bookmark_phys_t, zbm_flags)); + } +} + +void +dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, + dmu_tx_t *tx) { - dsl_bookmark_create_arg_t *dbca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); + if (hds->ds_bookmarks_obj == 0) { + hds->ds_bookmarks_obj = zap_create_norm(mos, + U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, + tx); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + + dsl_dataset_zapify(hds, tx); + VERIFY0(zap_add(mos, hds->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (hds->ds_bookmarks_obj), 1, + &hds->ds_bookmarks_obj, tx)); + } - for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds, *bmark_fs; - zfs_bookmark_phys_t bmark_phys = { 0 }; - char *shortname; - uint32_t bmark_len = BOOKMARK_PHYS_SIZE_V1; + avl_add(&hds->ds_bookmarks, dbn); - VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds)); - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &bmark_fs, FTAG, &shortname)); - if (bmark_fs->ds_bookmarks == 0) { - bmark_fs->ds_bookmarks = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - - dsl_dataset_zapify(bmark_fs, tx); - VERIFY0(zap_add(mos, bmark_fs->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (bmark_fs->ds_bookmarks), 1, - &bmark_fs->ds_bookmarks, tx)); - } + /* + * To maintain backwards compatibility with software that doesn't + * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest + * possible bookmark size. + */ + uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1; + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) && + (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags & + ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) { + bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx); + } - bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; - bmark_phys.zbm_creation_txg = - dsl_dataset_phys(snapds)->ds_creation_txg; - bmark_phys.zbm_creation_time = - dsl_dataset_phys(snapds)->ds_creation_time; + __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 }; + ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, + &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); - /* - * If the dataset is encrypted create a larger bookmark to - * accommodate the IVset guid. The IVset guid was added - * after the encryption feature to prevent a problem with - * raw sends. If we encounter an encrypted dataset without - * an IVset guid we fall back to a normal bookmark. - */ - if (snapds->ds_dir->dd_crypto_obj != 0 && - spa_feature_is_enabled(dp->dp_spa, - SPA_FEATURE_BOOKMARK_V2)) { - int err = zap_lookup(mos, snapds->ds_object, - DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, - &bmark_phys.zbm_ivset_guid); - if (err == 0) { - bmark_len = BOOKMARK_PHYS_SIZE_V2; - spa_feature_incr(dp->dp_spa, - SPA_FEATURE_BOOKMARK_V2, tx); - } + VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, + sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t), + &dbn->dbn_phys, tx)); +} + +/* + * If redaction_list is non-null, we create a redacted bookmark and redaction + * list, and store the object number of the redaction list in redact_obj. + */ +static void +dsl_bookmark_create_sync_impl(const char *bookmark, const char *snapshot, + dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag, + redaction_list_t **redaction_list) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *snapds, *bmark_fs; + char *shortname; + boolean_t bookmark_redacted; + uint64_t *dsredactsnaps; + uint64_t dsnumsnaps; + + VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds)); + VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG, + &shortname)); + + dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname); + dsl_bookmark_set_phys(&dbn->dbn_phys, snapds); + + bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds, + SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); + if (redaction_list != NULL || bookmark_redacted) { + redaction_list_t *local_rl; + if (bookmark_redacted) { + redact_snaps = dsredactsnaps; + num_redact_snaps = dsnumsnaps; + } + dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t), tx); + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + + VERIFY0(dsl_redaction_list_hold_obj(dp, + dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); + dsl_redaction_list_long_hold(dp, local_rl, tag); + + ASSERT3U((local_rl)->rl_dbuf->db_size, >=, + sizeof (redaction_list_phys_t) + num_redact_snaps * + sizeof (uint64_t)); + dmu_buf_will_dirty(local_rl->rl_dbuf, tx); + bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps, + sizeof (uint64_t) * num_redact_snaps); + local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; + if (bookmark_redacted) { + ASSERT3P(redaction_list, ==, NULL); + local_rl->rl_phys->rlp_last_blkid = UINT64_MAX; + local_rl->rl_phys->rlp_last_object = UINT64_MAX; + dsl_redaction_list_long_rele(local_rl, tag); + dsl_redaction_list_rele(local_rl, tag); + } else { + *redaction_list = local_rl; } + } + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } - VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, - shortname, sizeof (uint64_t), - bmark_len / sizeof (uint64_t), &bmark_phys, tx)); + dsl_bookmark_node_add(bmark_fs, dbn, tx); - spa_history_log_internal_ds(bmark_fs, "bookmark", tx, - "name=%s creation_txg=%llu target_snap=%llu", - shortname, - (longlong_t)bmark_phys.zbm_creation_txg, - (longlong_t)snapds->ds_object); + spa_history_log_internal_ds(bmark_fs, "bookmark", tx, + "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu", + shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg, + (longlong_t)snapds->ds_object, + (longlong_t)dbn->dbn_phys.zbm_redaction_obj); - dsl_dataset_rele(bmark_fs, FTAG); - dsl_dataset_rele(snapds, FTAG); + dsl_dataset_rele(bmark_fs, FTAG); + dsl_dataset_rele(snapds, FTAG); +} + +static void +dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + + ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa, + SPA_FEATURE_BOOKMARKS)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + dsl_bookmark_create_sync_impl(nvpair_name(pair), + fnvpair_value_string(pair), tx, 0, NULL, NULL, NULL); } } @@ -277,58 +413,270 @@ dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); } -int -dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +static int +dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) { - int err = 0; - zap_cursor_t zc; - zap_attribute_t attr; - dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *snapds; + int rv = 0; - uint64_t bmark_zapobj = ds->ds_bookmarks; - if (bmark_zapobj == 0) - return (0); + if (!spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + /* + * If the list of redact snaps will not fit in the bonus buffer with + * the furthest reached object and offset, fail. + */ + if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + return (SET_ERROR(E2BIG)); + + rv = dsl_dataset_hold(dp, dbcra->dbcra_snap, + FTAG, &snapds); + if (rv == 0) { + rv = dsl_bookmark_create_check_impl(snapds, dbcra->dbcra_bmark, + tx); + dsl_dataset_rele(snapds, FTAG); + } + return (rv); +} - for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - char *bmark_name = attr.za_name; - zfs_bookmark_phys_t bmark_phys = { 0 }; +static void +dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_bookmark_create_sync_impl(dbcra->dbcra_bmark, dbcra->dbcra_snap, tx, + dbcra->dbcra_numsnaps, dbcra->dbcra_snaps, dbcra->dbcra_tag, + dbcra->dbcra_rl); +} - err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); - ASSERT3U(err, !=, ENOENT); - if (err != 0) - break; +int +dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, + uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl) +{ + dsl_bookmark_create_redacted_arg_t dbcra; + + dbcra.dbcra_bmark = bookmark; + dbcra.dbcra_snap = snapshot; + dbcra.dbcra_rl = rl; + dbcra.dbcra_numsnaps = numsnaps; + dbcra.dbcra_snaps = snapguids; + dbcra.dbcra_tag = tag; + + return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check, + dsl_bookmark_create_redacted_sync, &dbcra, 5, + ZFS_SPACE_CHECK_NORMAL)); +} - nvlist_t *out_props = fnvlist_alloc(); - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_GUID))) { +/* + * Retrieve the list of properties given in the 'props' nvlist for a bookmark. + * If 'props' is NULL, retrieves all properties. + */ +static void +dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys, + nvlist_t *props, nvlist_t *out_props) +{ + ASSERT3P(dp, !=, NULL); + ASSERT3P(bmark_phys, !=, NULL); + ASSERT3P(out_props, !=, NULL); + ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock)); + + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_GUID, bmark_phys->zbm_guid); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATION))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATION, bmark_phys->zbm_creation_time); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid); + } + if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFERENCED))) { dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_GUID, bmark_phys.zbm_guid); + ZFS_PROP_REFERENCED, + bmark_phys->zbm_referenced_bytes_refd); } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) { dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); + ZFS_PROP_LOGICALREFERENCED, + bmark_phys->zbm_uncompressed_bytes_refd); } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATION))) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFRATIO))) { + uint64_t ratio = + bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 : + bmark_phys->zbm_uncompressed_bytes_refd * 100 / + bmark_phys->zbm_compressed_bytes_refd; dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); + ZFS_PROP_REFRATIO, ratio); } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_IVSET_GUID, bmark_phys.zbm_ivset_guid); + } + + if ((props == NULL || nvlist_exists(props, "redact_snaps") || + nvlist_exists(props, "redact_complete")) && + bmark_phys->zbm_redaction_obj != 0) { + redaction_list_t *rl; + int err = dsl_redaction_list_hold_obj(dp, + bmark_phys->zbm_redaction_obj, FTAG, &rl); + if (err == 0) { + if (nvlist_exists(props, "redact_snaps")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_uint64_array(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_snaps, + rl->rl_phys->rlp_num_snaps); + fnvlist_add_nvlist(out_props, "redact_snaps", + nvl); + nvlist_free(nvl); + } + if (nvlist_exists(props, "redact_complete")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_boolean_value(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_last_blkid == UINT64_MAX && + rl->rl_phys->rlp_last_object == UINT64_MAX); + fnvlist_add_nvlist(out_props, "redact_complete", + nvl); + nvlist_free(nvl); + } + dsl_redaction_list_rele(rl, FTAG); } + } +} + +int +dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; - fnvlist_add_nvlist(outnvl, bmark_name, out_props); + ASSERT(dsl_pool_config_held(dp)); + + if (dsl_dataset_is_snapshot(ds)) + return (SET_ERROR(EINVAL)); + + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + nvlist_t *out_props = fnvlist_alloc(); + + dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props); + + fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props); fnvlist_free(out_props); } + return (0); +} + +/* + * Comparison func for ds_bookmarks AVL tree. We sort the bookmarks by + * their TXG, then by their FBN-ness. The "FBN-ness" component ensures + * that all bookmarks at the same TXG that HAS_FBN are adjacent, which + * dsl_bookmark_destroy_sync_impl() depends on. Note that there may be + * multiple bookmarks at the same TXG (with the same FBN-ness). In this + * case we differentiate them by an arbitrary metric (in this case, + * their names). + */ +static int +dsl_bookmark_compare(const void *l, const void *r) +{ + const dsl_bookmark_node_t *ldbn = l; + const dsl_bookmark_node_t *rdbn = r; + + int64_t cmp = AVL_CMP(ldbn->dbn_phys.zbm_creation_txg, + rdbn->dbn_phys.zbm_creation_txg); + if (likely(cmp)) + return (cmp); + cmp = AVL_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), + (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + if (likely(cmp)) + return (cmp); + cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); + return (AVL_ISIGN(cmp)); +} + +/* + * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree. + */ +int +dsl_bookmark_init_ds(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + ASSERT(!ds->ds_is_snapshot); + + avl_create(&ds->ds_bookmarks, dsl_bookmark_compare, + sizeof (dsl_bookmark_node_t), + offsetof(dsl_bookmark_node_t, dbn_node)); + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, + sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj); + if (zaperr == ENOENT) + return (0); + if (zaperr != 0) + return (zaperr); + + if (ds->ds_bookmarks_obj == 0) + return (0); + + int err = 0; + zap_cursor_t zc; + zap_attribute_t attr; + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + (err = zap_cursor_retrieve(&zc, &attr)) == 0; + zap_cursor_advance(&zc)) { + dsl_bookmark_node_t *dbn = + dsl_bookmark_node_alloc(attr.za_name); + + err = dsl_bookmark_lookup_impl(ds, + dbn->dbn_name, &dbn->dbn_phys); + ASSERT3U(err, !=, ENOENT); + if (err != 0) { + kmem_free(dbn, sizeof (*dbn)); + break; + } + avl_add(&ds->ds_bookmarks, dbn); + } zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; return (err); } +void +dsl_bookmark_fini_ds(dsl_dataset_t *ds) +{ + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + if (ds->ds_is_snapshot) + return; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); +} + /* * Retrieve the bookmarks that exist in the specified dataset, and the * requested properties of each bookmark. @@ -359,27 +707,69 @@ dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) return (err); } +/* + * Retrieve all properties for a single bookmark in the given dataset. + */ +int +dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + zfs_bookmark_phys_t bmark_phys = { 0 }; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys); + if (err != 0) + goto out; + + dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props); +out: + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + typedef struct dsl_bookmark_destroy_arg { nvlist_t *dbda_bmarks; nvlist_t *dbda_success; nvlist_t *dbda_errors; } dsl_bookmark_destroy_arg_t; -static int -dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +static void +dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, + dmu_tx_t *tx) { - int err; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; uint64_t int_size, num_ints; + /* + * 'search' must be zeroed so that dbn_flags (which is used in + * dsl_bookmark_compare()) will be zeroed even if the on-disk + * (in ZAP) bookmark is shorter than offsetof(dbn_flags). + */ + dsl_bookmark_node_t search = { 0 }; + char realname[ZFS_MAX_DATASET_NAME_LEN]; + + /* + * Find the real name of this bookmark, which may be different + * from the given name if the dataset is case-insensitive. Then + * use the real name to find the node in the ds_bookmarks AVL tree. + */ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; - err = zap_length(mos, bmark_zapobj, name, &int_size, &num_ints); - if (err != 0) - return (err); + VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints)); ASSERT3U(int_size, ==, sizeof (uint64_t)); @@ -387,8 +777,70 @@ dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_BOOKMARK_V2, tx); } + VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t), + num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL)); + + search.dbn_name = realname; + dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL); + ASSERT(dbn != NULL); + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + /* + * If this bookmark HAS_FBN, and it is before the most + * recent snapshot, then its TXG is a key in the head's + * deadlist (and all clones' heads' deadlists). If this is + * the last thing keeping the key (i.e. there are no more + * bookmarks with HAS_FBN at this TXG, and there is no + * snapshot at this TXG), then remove the key. + * + * Note that this algorithm depends on ds_bookmarks being + * sorted such that all bookmarks at the same TXG with + * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks + * at the same TXG in between them). If this were not + * the case, we would need to examine *all* bookmarks + * at this TXG, rather than just the adjacent ones. + */ + + dsl_bookmark_node_t *dbn_prev = + AVL_PREV(&ds->ds_bookmarks, dbn); + dsl_bookmark_node_t *dbn_next = + AVL_NEXT(&ds->ds_bookmarks, dbn); + + boolean_t more_bookmarks_at_this_txg = + (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) || + (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) && + !more_bookmarks_at_this_txg && + dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_prev_snap_txg) { + dsl_dir_remove_clones_key(ds->ds_dir, + dbn->dbn_phys.zbm_creation_txg, tx); + dsl_deadlist_remove_key(&ds->ds_deadlist, + dbn->dbn_phys.zbm_creation_txg, tx); + } + + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } + + avl_remove(&ds->ds_bookmarks, dbn); + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); - return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); + VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); } static int @@ -419,7 +871,7 @@ dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) continue; } if (error == 0) { - error = dsl_dataset_bmark_lookup(ds, shortname, &bm); + error = dsl_bookmark_lookup_impl(ds, shortname, &bm); dsl_dataset_rele(ds, FTAG); if (error == ESRCH) { /* @@ -428,6 +880,20 @@ dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) */ continue; } + if (error == 0 && bm.zbm_redaction_obj != 0) { + redaction_list_t *rl = NULL; + error = dsl_redaction_list_hold_obj(tx->tx_pool, + bm.zbm_redaction_obj, FTAG, &rl); + if (error == ENOENT) { + error = 0; + } else if (error == 0 && + dsl_redaction_list_long_held(rl)) { + error = SET_ERROR(EBUSY); + } + if (rl != NULL) { + dsl_redaction_list_rele(rl, FTAG); + } + } } if (error == 0) { if (dmu_tx_is_syncing(tx)) { @@ -457,18 +923,17 @@ dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), &ds, FTAG, &shortname)); - VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); + dsl_bookmark_destroy_sync_impl(ds, shortname, tx); /* * If all of this dataset's bookmarks have been destroyed, * free the zap object and decrement the feature's use count. */ - VERIFY0(zap_count(mos, ds->ds_bookmarks, - &zap_cnt)); + VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt)); if (zap_cnt == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - ds->ds_bookmarks = 0; + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); + ds->ds_bookmarks_obj = 0; spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); VERIFY0(zap_remove(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, tx)); @@ -503,3 +968,561 @@ dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) fnvlist_free(dbda.dbda_success); return (rv); } + +/* Return B_TRUE if there are any long holds on this dataset. */ +boolean_t +dsl_redaction_list_long_held(redaction_list_t *rl) +{ + return (!zfs_refcount_is_zero(&rl->rl_longholds)); +} + +void +dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag) +{ + ASSERT(dsl_pool_config_held(dp)); + (void) zfs_refcount_add(&rl->rl_longholds, tag); +} + +void +dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag) +{ + (void) zfs_refcount_remove(&rl->rl_longholds, tag); +} + +/* ARGSUSED */ +static void +redaction_list_evict_sync(void *rlu) +{ + redaction_list_t *rl = rlu; + zfs_refcount_destroy(&rl->rl_longholds); + + kmem_free(rl, sizeof (redaction_list_t)); +} + +void +dsl_redaction_list_rele(redaction_list_t *rl, void *tag) +{ + dmu_buf_rele(rl->rl_dbuf, tag); +} + +int +dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, + redaction_list_t **rlp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + redaction_list_t *rl; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(mos, rlobj, tag, &dbuf); + if (err != 0) + return (err); + + rl = dmu_buf_get_user(dbuf); + if (rl == NULL) { + redaction_list_t *winner = NULL; + + rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); + rl->rl_dbuf = dbuf; + rl->rl_object = rlobj; + rl->rl_phys = dbuf->db_data; + rl->rl_mos = dp->dp_meta_objset; + zfs_refcount_create(&rl->rl_longholds); + dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, + &rl->rl_dbuf); + if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { + kmem_free(rl, sizeof (*rl)); + rl = winner; + } + } + *rlp = rl; + return (0); +} + +/* + * Snapshot ds is being destroyed. + * + * Adjust the "freed_before_next" of any bookmarks between this snap + * and the previous snapshot, because their "next snapshot" is changing. + * + * If there are any bookmarks with HAS_FBN at this snapshot, remove + * their HAS_SNAP flag (note: there can be at most one snapshot of + * each filesystem at a given txg), and return B_TRUE. In this case + * the caller can not remove the key in the deadlist at this TXG, because + * the HAS_FBN bookmarks require the key be there. + * + * Returns B_FALSE if there are no bookmarks with HAS_FBN at this + * snapshot's TXG. In this case the caller can remove the key in the + * deadlist at this TXG. + */ +boolean_t +dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + dsl_dataset_t *head, *next; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next)); + + /* + * Find the first bookmark that HAS_FBN at or after the + * previous snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at or after the previous + * snapshot, and before this (being deleted) snapshot. Adjust + * their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) + continue; + /* + * Increase our FBN by the amount of space that was live + * (referenced) at the time of this bookmark (i.e. + * birth <= zbm_creation_txg), and killed between this + * (being deleted) snapshot and the next snapshot (i.e. + * on the next snapshot's deadlist). (Space killed before + * this are already on our FBN.) + */ + uint64_t referenced, compressed, uncompressed; + dsl_deadlist_space_range(&next->ds_deadlist, + 0, dbn->dbn_phys.zbm_creation_txg, + &referenced, &compressed, &uncompressed); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + referenced; + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + compressed; + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + uncompressed; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } + dsl_dataset_rele(next, FTAG); + + /* + * There may be several bookmarks at this txg (the TXG of the + * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS + * flag on all of them, and return TRUE if there is at least 1 + * bookmark here with HAS_FBN (thus preventing the deadlist + * key from being removed). + */ + boolean_t rv = B_FALSE; + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + ASSERT(!(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS)); + continue; + } + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS); + dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + rv = B_TRUE; + } + dsl_dataset_rele(head, FTAG); + return (rv); +} + +/* + * A snapshot is being created of this (head) dataset. + * + * We don't keep keys in the deadlist for the most recent snapshot, or any + * bookmarks at or after it, because there can't be any blocks on the + * deadlist in this range. Now that the most recent snapshot is after + * all bookmarks, we need to add these keys. Note that the caller always + * adds a key at the previous snapshot, so we only add keys for bookmarks + * after that. + */ +void +dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t last_key_added = UINT64_MAX; + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg > + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg; + ASSERT3U(creation_txg, <=, last_key_added); + /* + * Note, there may be multiple bookmarks at this TXG, + * and we only want to add the key for this TXG once. + * The ds_bookmarks AVL is sorted by TXG, so we will visit + * these bookmarks in sequence. + */ + if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) && + creation_txg != last_key_added) { + dsl_deadlist_add_key(&ds->ds_deadlist, + creation_txg, tx); + last_key_added = creation_txg; + } + } +} + +/* + * The next snapshot of the origin dataset has changed, due to + * promote or clone swap. If there are any bookmarks at this dataset, + * we need to update their zbm_*_freed_before_next_snap to reflect this. + * The head dataset has the relevant bookmarks in ds_bookmarks. + */ +void +dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + /* + * Find the first bookmark that HAS_FBN at the origin snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(origin)->ds_creation_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at the origin txg. + * Adjust their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(origin)->ds_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + + /* + * Bookmark is at the origin, therefore its + * "next dataset" is changing, so we need + * to reset its FBN by recomputing it in + * dsl_bookmark_set_phys(). + */ + ASSERT3U(dbn->dbn_phys.zbm_guid, ==, + dsl_dataset_phys(origin)->ds_guid); + ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==, + dsl_dataset_phys(origin)->ds_referenced_bytes); + ASSERT(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS); + /* + * Save and restore the zbm_redaction_obj, which + * is zeroed by dsl_bookmark_set_phys(). + */ + uint64_t redaction_obj = + dbn->dbn_phys.zbm_redaction_obj; + dsl_bookmark_set_phys(&dbn->dbn_phys, origin); + dbn->dbn_phys.zbm_redaction_obj = redaction_obj; + + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } +} + +/* + * This block is no longer referenced by this (head) dataset. + * + * Adjust the FBN of any bookmarks that reference this block, whose "next" + * is the head dataset. + */ +/* ARGSUSED */ +void +dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) +{ + /* + * Iterate over bookmarks whose "next" is the head dataset. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + /* + * If the block was live (referenced) at the time of this + * bookmark, add its space to the bookmark's FBN. + */ + if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + mutex_enter(&dbn->dbn_lock); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp); + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + BP_GET_PSIZE(bp); + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + BP_GET_UCSIZE(bp); + /* + * Changing the ZAP object here would be too + * expensive. Also, we may be called from the zio + * interrupt thread, which can't block on i/o. + * Therefore, we mark this bookmark as dirty and + * modify the ZAP once per txg, in + * dsl_bookmark_sync_done(). + */ + dbn->dbn_dirty = B_TRUE; + mutex_exit(&dbn->dbn_lock); + } + } +} + +void +dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (dsl_dataset_is_snapshot(ds)) + return; + + /* + * We only dirty bookmarks that are at or after the most recent + * snapshot. We can't create snapshots between + * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we + * don't need to look at any bookmarks before ds_prev_snap_txg. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + if (dbn->dbn_dirty) { + /* + * We only dirty nodes with HAS_FBN, therefore + * we can always use the current bookmark struct size. + */ + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + VERIFY0(zap_update(dp->dp_meta_objset, + ds->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + dbn->dbn_dirty = B_FALSE; + } + } +#ifdef ZFS_DEBUG + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + ASSERT(!dbn->dbn_dirty); + } +#endif +} + +/* + * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks). + */ +uint64_t +dsl_bookmark_latest_txg(dsl_dataset_t *ds) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + if (dbn == NULL) + return (0); + return (dbn->dbn_phys.zbm_creation_txg); +} + +static inline unsigned int +redact_block_buf_num_entries(unsigned int size) +{ + return (size / sizeof (redact_block_phys_t)); +} + +/* + * This function calculates the offset of the last entry in the array of + * redact_block_phys_t. If we're reading the redaction list into buffers of + * size bufsize, then for all but the last buffer, the last valid entry in the + * array will be the last entry in the array. However, for the last buffer, any + * amount of it may be filled. Thus, we check to see if we're looking at the + * last buffer in the redaction list, and if so, we return the total number of + * entries modulo the number of entries per buffer. Otherwise, we return the + * number of entries per buffer minus one. + */ +static inline unsigned int +last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid) +{ + if (bufid == (rl->rl_phys->rlp_num_entries - 1) / + redact_block_buf_num_entries(bufsize)) { + return ((rl->rl_phys->rlp_num_entries - 1) % + redact_block_buf_num_entries(bufsize)); + } + return (redact_block_buf_num_entries(bufsize) - 1); +} + +/* + * Compare the redact_block_phys_t to the bookmark. If the last block in the + * redact_block_phys_t is before the bookmark, return -1. If the first block in + * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the + * bookmark is inside the range of the redact_block_phys_t, and we return 0. + */ +static int +redact_block_zb_compare(redact_block_phys_t *first, + zbookmark_phys_t *second) +{ + /* + * If the block_phys is for a previous object, or the last block in the + * block_phys is strictly before the block in the bookmark, the + * block_phys is earlier. + */ + if (first->rbp_object < second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid + (redact_block_get_count(first) - 1) < + second->zb_blkid)) { + return (-1); + } + + /* + * If the bookmark is for a previous object, or the block in the + * bookmark is strictly before the first block in the block_phys, the + * bookmark is earlier. + */ + if (first->rbp_object > second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid > second->zb_blkid)) { + return (1); + } + + return (0); +} + +/* + * Traverse the redaction list in the provided object, and call the callback for + * each entry we find. Don't call the callback for any records before resume. + */ +int +dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, + rl_traverse_callback_t cb, void *arg) +{ + objset_t *mos = rl->rl_mos; + redact_block_phys_t *buf; + unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; + int err = 0; + + if (rl->rl_phys->rlp_last_object != UINT64_MAX || + rl->rl_phys->rlp_last_blkid != UINT64_MAX) { + /* + * When we finish a send, we update the last object and offset + * to UINT64_MAX. If a send fails partway through, the last + * object and offset will have some other value, indicating how + * far the send got. The redaction list must be complete before + * it can be traversed, so return EINVAL if the last object and + * blkid are not set to UINT64_MAX. + */ + return (SET_ERROR(EINVAL)); + } + + /* + * Binary search for the point to resume from. The goal is to minimize + * the number of disk reads we have to perform. + */ + buf = zio_data_buf_alloc(bufsize); + uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) / + redact_block_buf_num_entries(bufsize); + uint64_t minbufid = 0; + while (resume != NULL && maxbufid - minbufid >= 1) { + ASSERT3U(maxbufid, >, minbufid); + uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2); + err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize, + buf, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + int cmp0 = redact_block_zb_compare(&buf[0], resume); + int cmpn = redact_block_zb_compare( + &buf[last_entry(rl, bufsize, maxbufid)], resume); + + /* + * If the first block is before or equal to the resume point, + * and the last one is equal or after, then the resume point is + * in this buf, and we should start here. + */ + if (cmp0 <= 0 && cmpn >= 0) + break; + + if (cmp0 > 0) + maxbufid = midbufid - 1; + else if (cmpn < 0) + minbufid = midbufid + 1; + else + panic("No progress in binary search for resume point"); + } + + for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize); + err == 0 && curidx < rl->rl_phys->rlp_num_entries; + curidx++) { + /* + * We read in the redaction list one block at a time. Once we + * finish with all the entries in a given block, we read in a + * new one. The predictive prefetcher will take care of any + * prefetching, and this code shouldn't be the bottleneck, so we + * don't need to do manual prefetching. + */ + if (curidx % redact_block_buf_num_entries(bufsize) == 0) { + err = dmu_read(mos, rl->rl_object, curidx * + sizeof (*buf), bufsize, buf, + DMU_READ_PREFETCH); + if (err != 0) + break; + } + redact_block_phys_t *rb = &buf[curidx % + redact_block_buf_num_entries(bufsize)]; + /* + * If resume is non-null, we should either not send the data, or + * null out resume so we don't have to keep doing these + * comparisons. + */ + if (resume != NULL) { + if (redact_block_zb_compare(rb, resume) < 0) { + continue; + } else { + /* + * If the place to resume is in the middle of + * the range described by this + * redact_block_phys, then modify the + * redact_block_phys in memory so we generate + * the right records. + */ + if (resume->zb_object == rb->rbp_object && + resume->zb_blkid > rb->rbp_blkid) { + uint64_t diff = resume->zb_blkid - + rb->rbp_blkid; + rb->rbp_blkid = resume->zb_blkid; + redact_block_set_count(rb, + redact_block_get_count(rb) - diff); + } + resume = NULL; + } + } + + if (cb(rb, arg) != 0) + break; + } + + zio_data_buf_free(buf, bufsize); + return (err); +} |