diff options
author | Rob Norris <[email protected]> | 2024-10-03 13:47:11 +1000 |
---|---|---|
committer | GitHub <[email protected]> | 2024-10-02 20:47:11 -0700 |
commit | 224393a3211b12c2cbac90a1d4dc730ceee1bbd0 (patch) | |
tree | b0a0f84956ce1f10a6a7b55fe29c0c79d1f5f06b /module | |
parent | 412105977c5cb1dcdd9a0b742ceaf04c75da24d0 (diff) |
feature: large_microzap
In a4b21eadec we added the zap_micro_max_size tuneable to raise the size
at which "micro" (single-block) ZAPs are upgraded to "fat" (multi-block)
ZAPs. Before this, a microZAP was limited to 128KiB, which was the old
largest block size. The side effect of raising the max size past 128KiB
is that the microZAP must then be stored in a large block, which requires
the large_blocks feature.
Unfortunately, this means that a backup stream created without the
--large-block (-L) flag to zfs send would split the microZAP block into
smaller blocks and send those, as is normal behaviour for large blocks.
This would be received correctly, but since microZAPs are limited to the
first block in the object by definition, the entries in the later blocks
would be inaccessible. For directory ZAPs, this gives the appearance of
files being lost.
This commit adds a feature flag, large_microzap, that must be enabled
for microZAPs to grow beyond 128KiB, and which will be activated the
first time that occurs. This feature is later checked when generating
the stream and if active, the send operation will abort unless
--large-block has also been requested.
Changing the limit still requires zap_micro_max_size to be changed. The
state of the large_microzap feature flag effectively sets the upper bound
for this tuneable; that is, if the feature is disabled, the tuneable will
be clamped to 128KiB.
A stream flag is also added to ensure that the receiver also activates
its own feature flag upon receiving the stream. This is not strictly
necessary to _use_ the received microZAP, since it doesn't care how
large its block is, but it is required to send the microZAP object on,
otherwise the original problem occurs again.
Because it's difficult to reliably distinguish a microZAP from a fatZAP
from outside the ZAP code, and because it seems unlikely that most
users are affected (a fairly niche tuneable combined with what should be
an uncommon use of send), and for the sake of expediency, this change
activates the feature the first time a microZAP grows to use a large
block, and the feature is never deactivated after that. This can be
improved in the future.
This commit changes nothing for existing pools that already have large
microZAPs. The feature will not be retroactively applied, but will be
activated the next time a microZAP grows past the limit.
Don't use the large_blocks feature for the enable/disable tests. The
large_microzap feature depends on large_blocks, so large_blocks gets
enabled as a dependency, breaking the test. Instead use the "longname"
feature, which has the exact same feature characteristics.
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Allan Jude <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Alexander Motin <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Closes #16593
Diffstat (limited to 'module')
-rw-r--r-- | module/zcommon/zfeature_common.c | 15 | ||||
-rw-r--r-- | module/zfs/dmu_recv.c | 23 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 13 | ||||
-rw-r--r-- | module/zfs/dmu_tx.c | 4 | ||||
-rw-r--r-- | module/zfs/zap_micro.c | 55 |
5 files changed, 103 insertions, 7 deletions
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 881deb5bf..96f0086d7 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -25,7 +25,7 @@ * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ @@ -772,6 +772,19 @@ zpool_feature_init(void) longname_deps, sfeatures); } + { + static const spa_feature_t large_microzap_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_LARGE_BLOCKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_MICROZAP, + "com.klarasystems:large_microzap", "large_microzap", + "Support for microzaps larger than 128KB.", + ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, + ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 4877eb7e6..b1cd981ce 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -25,7 +25,7 @@ * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K <[email protected]>. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019 Datto Inc. * Copyright (c) 2022 Axcient. 
@@ -593,6 +593,9 @@ recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) + return (SET_ERROR(ENOTSUP)); /* * Receiving redacted streams requires that redacted datasets are @@ -994,6 +997,24 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) numredactsnaps, tx); } + if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) { + /* + * The source has seen a large microzap at least once in its + * life, so we activate the feature here to match. It's not + * strictly necessary since a large microzap is usable without + * the feature active, but if that object is sent on from here, + * we need this info to know to add the stream feature. + * + * There may be no large microzap in the incoming stream, or + * ever again, but this is a very niche feature and its very + * difficult to spot a large microzap in the stream, so its + * not worth the effort of trying harder to activate the + * feature at first use. + */ + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP, + (void *)B_TRUE, tx); + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index c7d3a5cb6..a174972e9 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -26,7 +26,7 @@ * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. 
* Copyright (c) 2019, Allan Jude */ @@ -2015,6 +2015,17 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LONGNAME)) { *featureflags |= DMU_BACKUP_FEATURE_LONGNAME; } + + if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_MICROZAP)) { + /* + * We must never split a large microzap block, so we can only + * send large microzaps if LARGE_BLOCKS is already enabled. + */ + if (!(*featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_MICROZAP)); + *featureflags |= DMU_BACKUP_FEATURE_LARGE_MICROZAP; + } + return (0); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 2c2a6c764..3fdcebdff 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. */ #include <sys/dmu.h> @@ -575,7 +576,6 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; - extern int zap_micro_max_size; ASSERT(tx->tx_txg == 0); @@ -591,7 +591,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) * - 2 grown ptrtbl blocks */ (void) zfs_refcount_add_many(&txh->txh_space_towrite, - zap_micro_max_size, FTAG); + zap_get_micro_max_size(tx->tx_pool->dp_spa), FTAG); if (dn == NULL) return; diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index a428a040a..12938022e 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -24,6 +24,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. 
*/ #include <sys/zio.h> @@ -36,12 +37,37 @@ #include <sys/btree.h> #include <sys/arc.h> #include <sys/dmu_objset.h> +#include <sys/spa_impl.h> #ifdef _KERNEL #include <sys/sunddi.h> #endif -int zap_micro_max_size = MZAP_MAX_BLKSZ; +/* + * The maximum size (in bytes) of a microzap before it is converted to a + * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE). + * + * By definition, a microzap must fit into a single block, so this has + * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default. + * Setting this higher requires both the large_blocks feature (to even create + * blocks that large) and the large_microzap feature (to enable the stream + * machinery to understand not to try to split a microzap block). + * + * If large_microzap is enabled, this value will be clamped to + * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. + */ +static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; + +uint64_t +zap_get_micro_max_size(spa_t *spa) +{ + uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); + if (maxsz <= SPA_OLD_MAXBLOCKSIZE) + return (maxsz); + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) + return (MIN(maxsz, spa_maxblocksize(spa))); + return (SPA_OLD_MAXBLOCKSIZE); +} static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags); @@ -638,7 +664,7 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > zap_micro_max_size) { + if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; @@ -650,6 +676,31 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); 
zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + + if (newsz > SPA_OLD_MAXBLOCKSIZE) { + dsl_dataset_t *ds = dmu_objset_ds(os); + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_MICROZAP)) { + /* + * A microzap just grew beyond the old limit + * for the first time, so we have to ensure the + * feature flag is activated. + * zap_get_micro_max_size() won't let us get + * here if the feature is not enabled, so we + * don't need any other checks beforehand. + * + * Since we're in open context, we can't + * activate the feature directly, so we instead + * flag it on the dataset for next sync. + */ + dsl_dataset_dirty(ds, tx); + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation + [SPA_FEATURE_LARGE_MICROZAP] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + } } *zapp = zap; |