diff options
author | Christopher Siden <[email protected]> | 2012-12-13 15:24:15 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2013-01-08 10:35:35 -0800 |
commit | 9ae529ec5dbdc828ff8326beae58062971d74b2e (patch) | |
tree | d65c2d8913391cd03a3e8e06ad77721c5e9cdadc | |
parent | 15313c5e1866e81e2f4a30d2c50b43b5435e547a (diff) |
Illumos #2619 and #2747
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Richard Lowe <[email protected]>
Reviewed by: Dan Kruchinin <[email protected]>
Approved by: Eric Schrock <[email protected]>
References:
illumos/illumos-gate@53089ab7c84db6fb76c16ca50076c147cda11757
illumos/illumos-gate@ad135b5d644628e791c3188a6ecbd9c257961ef8
illumos changeset: 13700:2889e2596bd6
https://www.illumos.org/issues/2619
https://www.illumos.org/issues/2747
NOTE: The grub-specific changes were not ported. Those changes
must be made separately in the Linux grub packages.
Ported-by: Brian Behlendorf <[email protected]>
67 files changed, 4262 insertions, 462 deletions
diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 478da2616..afdba3440 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,2 +1,2 @@ -SUBDIRS = zfs zpool zdb zinject zstreamdump ztest zpios mount_zfs +SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios mount_zfs SUBDIRS += zpool_layout zvol_id zpool_id vdev_id diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ce6318ea1..de4ac510e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <stdio.h> @@ -54,6 +56,7 @@ #include <sys/zfs_fuid.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/zfeature.h> #undef ZFS_MAXNAMELEN #include <libzfs.h> @@ -62,7 +65,8 @@ #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ - dmu_ot[(idx)].ot_name : "UNKNOWN") + dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ + dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? 
(idx) : DMU_OT_NUMTYPES) #ifndef lint @@ -1099,7 +1103,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; - zdb_nicenum(ds->ds_used_bytes, used); + zdb_nicenum(ds->ds_referenced_bytes, used); zdb_nicenum(ds->ds_compressed_bytes, compressed); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); zdb_nicenum(ds->ds_unique_bytes, unique); @@ -1143,6 +1147,44 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) /* ARGSUSED */ static int +dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (bp->blk_birth != 0) { + sprintf_blkptr(blkbuf, bp); + (void) printf("\t%s\n", blkbuf); + } + return (0); +} + +static void +dump_bptree(objset_t *os, uint64_t obj, char *name) +{ + char bytes[32]; + bptree_phys_t *bt; + dmu_buf_t *db; + + if (dump_opt['d'] < 3) + return; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + zdb_nicenum(bt->bt_bytes, bytes); + (void) printf("\n %s: %llu datasets, %s\n", + name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); + dmu_buf_rele(db, FTAG); + + if (dump_opt['d'] < 5) + return; + + (void) printf("\n"); + + (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); +} + +/* ARGSUSED */ +static int dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; @@ -1888,11 +1930,13 @@ typedef struct zdb_blkstats { */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) -#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static char *zdb_ot_extname[] = { "deferred free", "dedup ditto", + "other", "Total", }; @@ -1974,9 +2018,10 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, type = BP_GET_TYPE(bp); - zdb_count_block(zcb, zilog, bp, type); + zdb_count_block(zcb, zilog, bp, 
+ (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); - is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); + is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { int ioerr; @@ -2207,6 +2252,12 @@ dump_block_stats(spa_t *spa) count_block_cb, &zcb, NULL); (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, + &zcb, NULL)); + } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; @@ -2383,7 +2434,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || - BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); @@ -2491,7 +2542,14 @@ dump_zpool(spa_t *spa) dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees"); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, - "Pool frees"); + "Pool snapshot frees"); + } + + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + dump_bptree(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, + "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } diff --git a/cmd/zhack/.gitignore b/cmd/zhack/.gitignore new file mode 100644 index 000000000..763a18898 --- /dev/null +++ b/cmd/zhack/.gitignore @@ -0,0 +1 @@ +/zhack diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am new file mode 100644 index 000000000..47da2453e --- /dev/null +++ b/cmd/zhack/Makefile.am @@ -0,0 +1,18 @@ +include $(top_srcdir)/config/Rules.am + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +sbin_PROGRAMS = zhack + 
+zhack_SOURCES = \ + $(top_srcdir)/cmd/zhack/zhack.c + +zhack_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ + $(top_builddir)/lib/libzfs/libzfs.la + +zhack_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c new file mode 100644 index 000000000..b2cf815ca --- /dev/null +++ b/cmd/zhack/zhack.c @@ -0,0 +1,533 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * zhack is a debugging tool that can write changes to ZFS pool using libzpool + * for testing purposes. Altering pools with zhack is unsupported and may + * result in corrupted pools. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/zfs_znode.h> +#include <sys/dsl_synctask.h> +#include <sys/vdev.h> +#include <sys/fs/zfs.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/zfeature.h> +#undef ZFS_MAXNAMELEN +#include <libzfs.h> + +extern boolean_t zfeature_checks_disable; + +const char cmdname[] = "zhack"; +libzfs_handle_t *g_zfs; +static importargs_t g_importargs; +static char *g_pool; +static boolean_t g_readonly; + +static void +usage(void) +{ + (void) fprintf(stderr, + "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n" + "where <subcommand> <args> is one of the following:\n" + "\n", cmdname); + + (void) fprintf(stderr, + " feature stat <pool>\n" + " print information about enabled features\n" + " feature enable [-d desc] <pool> <feature>\n" + " add a new enabled feature to the pool\n" + " -d <desc> sets the feature's description\n" + " feature ref [-md] <pool> <feature>\n" + " change the refcount on the given feature\n" + " -d decrease instead of increase the refcount\n" + " -m add the feature to the label if increasing refcount\n" + "\n" + " <feature> : should be a feature guid\n"); + exit(1); +} + + +static void +fatal(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + (void) fprintf(stderr, "%s: ", cmdname); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); + (void) fprintf(stderr, "\n"); + + exit(1); +} + +/* ARGSUSED */ +static int +space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) +{ + /* + * Is it a valid type of object to track? 
+ */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (ENOENT); + (void) fprintf(stderr, "modifying object that needs user accounting"); + abort(); + /* NOTREACHED */ +} + +/* + * Target is the dataset whose pool we want to open. + */ +static void +import_pool(const char *target, boolean_t readonly) +{ + nvlist_t *config; + nvlist_t *pools; + int error; + char *sepp; + spa_t *spa; + nvpair_t *elem; + nvlist_t *props; + const char *name; + + kernel_init(readonly ? FREAD : (FREAD | FWRITE)); + g_zfs = libzfs_init(); + ASSERT(g_zfs != NULL); + + dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); + + g_readonly = readonly; + + /* + * If we only want readonly access, it's OK if we find + * a potentially-active (ie, imported into the kernel) pool from the + * default cachefile. + */ + if (readonly && spa_open(target, &spa, FTAG) == 0) { + spa_close(spa, FTAG); + return; + } + + g_importargs.unique = B_TRUE; + g_importargs.can_be_active = readonly; + g_pool = strdup(target); + if ((sepp = strpbrk(g_pool, "/@")) != NULL) + *sepp = '\0'; + g_importargs.poolname = g_pool; + pools = zpool_search_import(g_zfs, &g_importargs); + + if (pools == NULL || nvlist_next_nvpair(pools, NULL) == NULL) { + if (!g_importargs.can_be_active) { + g_importargs.can_be_active = B_TRUE; + if (zpool_search_import(g_zfs, &g_importargs) != NULL || + spa_open(target, &spa, FTAG) == 0) { + fatal("cannot import '%s': pool is active; run " + "\"zpool export %s\" first\n", + g_pool, g_pool); + } + } + + fatal("cannot import '%s': no such pool available\n", g_pool); + } + + elem = nvlist_next_nvpair(pools, NULL); + name = nvpair_name(elem); + VERIFY(nvpair_value_nvlist(elem, &config) == 0); + + props = NULL; + if (readonly) { + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); + } + + zfeature_checks_disable = B_TRUE; + error = spa_import(name, config, props, ZFS_IMPORT_NORMAL); + 
zfeature_checks_disable = B_FALSE; + if (error == EEXIST) + error = 0; + + if (error) + fatal("can't import '%s': %s", name, strerror(error)); +} + +static void +zhack_spa_open(const char *target, boolean_t readonly, void *tag, spa_t **spa) +{ + int err; + + import_pool(target, readonly); + + zfeature_checks_disable = B_TRUE; + err = spa_open(target, spa, tag); + zfeature_checks_disable = B_FALSE; + + if (err != 0) + fatal("cannot open '%s': %s", target, strerror(err)); + if (spa_version(*spa) < SPA_VERSION_FEATURES) { + fatal("'%s' has version %d, features not enabled", target, + (int)spa_version(*spa)); + } +} + +static void +dump_obj(objset_t *os, uint64_t obj, const char *name) +{ + zap_cursor_t zc; + zap_attribute_t za; + + (void) printf("%s_obj:\n", name); + + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (za.za_integer_length == 8) { + ASSERT(za.za_num_integers == 1); + (void) printf("\t%s = %llu\n", + za.za_name, (u_longlong_t)za.za_first_integer); + } else { + ASSERT(za.za_integer_length == 1); + char val[1024]; + VERIFY(zap_lookup(os, obj, za.za_name, + 1, sizeof (val), val) == 0); + (void) printf("\t%s = %s\n", za.za_name, val); + } + } + zap_cursor_fini(&zc); +} + +static void +dump_mos(spa_t *spa) +{ + nvlist_t *nv = spa->spa_label_features; + nvpair_t *pair; + + (void) printf("label config:\n"); + for (pair = nvlist_next_nvpair(nv, NULL); + pair != NULL; + pair = nvlist_next_nvpair(nv, pair)) { + (void) printf("\t%s\n", nvpair_name(pair)); + } +} + +static void +zhack_do_feature_stat(int argc, char **argv) +{ + spa_t *spa; + objset_t *os; + char *target; + + argc--; + argv++; + + if (argc < 1) { + (void) fprintf(stderr, "error: missing pool name\n"); + usage(); + } + target = argv[0]; + + zhack_spa_open(target, B_TRUE, FTAG, &spa); + os = spa->spa_meta_objset; + + dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); + dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); + dump_obj(os, 
spa->spa_feat_desc_obj, "descriptions"); + dump_mos(spa); + + spa_close(spa, FTAG); +} + +static void +feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + zfeature_info_t *feature = arg2; + + spa_feature_enable(spa, feature, tx); +} + +static void +zhack_do_feature_enable(int argc, char **argv) +{ + char c; + char *desc, *target; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + zfeature_info_t *nodeps[] = { NULL }; + + /* + * Features are not added to the pool's label until their refcounts + * are incremented, so fi_mos can just be left as false for now. + */ + desc = NULL; + feature.fi_uname = "zhack"; + feature.fi_mos = B_FALSE; + feature.fi_can_readonly = B_FALSE; + feature.fi_depends = nodeps; + + optind = 1; + while ((c = getopt(argc, argv, "rmd:")) != -1) { + switch (c) { + case 'r': + feature.fi_can_readonly = B_TRUE; + break; + case 'd': + desc = strdup(optarg); + break; + default: + usage(); + break; + } + } + + if (desc == NULL) + desc = strdup("zhack injected"); + feature.fi_desc = desc; + + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, "error: missing feature or pool name\n"); + usage(); + } + target = argv[0]; + feature.fi_guid = argv[1]; + + if (!zfeature_is_valid_guid(feature.fi_guid)) + fatal("invalid feature guid: %s", feature.fi_guid); + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + mos = spa->spa_meta_objset; + + if (0 == zfeature_lookup_guid(feature.fi_guid, NULL)) + fatal("'%s' is a real feature, will not enable"); + if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) + fatal("feature already enabled: %s", feature.fi_guid); + + VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, + feature_enable_sync, spa, &feature, 5)); + + spa_close(spa, FTAG); + + free(desc); +} + +static void +feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + zfeature_info_t *feature = arg2; + + spa_feature_incr(spa, feature, tx); +} + 
+static void +feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + zfeature_info_t *feature = arg2; + + spa_feature_decr(spa, feature, tx); +} + +static void +zhack_do_feature_ref(int argc, char **argv) +{ + char c; + char *target; + boolean_t decr = B_FALSE; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + zfeature_info_t *nodeps[] = { NULL }; + + /* + * fi_desc does not matter here because it was written to disk + * when the feature was enabled, but we need to properly set the + * feature for read or write based on the information we read off + * disk later. + */ + feature.fi_uname = "zhack"; + feature.fi_mos = B_FALSE; + feature.fi_desc = NULL; + feature.fi_depends = nodeps; + + optind = 1; + while ((c = getopt(argc, argv, "md")) != -1) { + switch (c) { + case 'm': + feature.fi_mos = B_TRUE; + break; + case 'd': + decr = B_TRUE; + break; + default: + usage(); + break; + } + } + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, "error: missing feature or pool name\n"); + usage(); + } + target = argv[0]; + feature.fi_guid = argv[1]; + + if (!zfeature_is_valid_guid(feature.fi_guid)) + fatal("invalid feature guid: %s", feature.fi_guid); + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + mos = spa->spa_meta_objset; + + if (0 == zfeature_lookup_guid(feature.fi_guid, NULL)) + fatal("'%s' is a real feature, will not change refcount"); + + if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, + feature.fi_guid)) { + feature.fi_can_readonly = B_FALSE; + } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, + feature.fi_guid)) { + feature.fi_can_readonly = B_TRUE; + } else { + fatal("feature is not enabled: %s", feature.fi_guid); + } + + if (decr && !spa_feature_is_active(spa, &feature)) + fatal("feature refcount already 0: %s", feature.fi_guid); + + VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, + decr ? 
feature_decr_sync : feature_incr_sync, spa, &feature, 5)); + + spa_close(spa, FTAG); +} + +static int +zhack_do_feature(int argc, char **argv) +{ + char *subcommand; + + argc--; + argv++; + if (argc == 0) { + (void) fprintf(stderr, + "error: no feature operation specified\n"); + usage(); + } + + subcommand = argv[0]; + if (strcmp(subcommand, "stat") == 0) { + zhack_do_feature_stat(argc, argv); + } else if (strcmp(subcommand, "enable") == 0) { + zhack_do_feature_enable(argc, argv); + } else if (strcmp(subcommand, "ref") == 0) { + zhack_do_feature_ref(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + return (0); +} + +#define MAX_NUM_PATHS 1024 + +int +main(int argc, char **argv) +{ + extern void zfs_prop_init(void); + + char *path[MAX_NUM_PATHS]; + const char *subcommand; + int rv = 0; + char c; + + g_importargs.path = path; + + dprintf_setup(&argc, argv); + zfs_prop_init(); + + while ((c = getopt(argc, argv, "c:d:")) != -1) { + switch (c) { + case 'c': + g_importargs.cachefile = optarg; + break; + case 'd': + assert(g_importargs.paths < MAX_NUM_PATHS); + g_importargs.path[g_importargs.paths++] = optarg; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + optind = 1; + + if (argc == 0) { + (void) fprintf(stderr, "error: no command specified\n"); + usage(); + } + + subcommand = argv[0]; + + if (strcmp(subcommand, "feature") == 0) { + rv = zhack_do_feature(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_TRUE) != 0) { + fatal("pool export failed; " + "changes may not be committed to disk\n"); + } + + libzfs_fini(g_zfs); + kernel_fini(); + + return (rv); +} diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index cb0535a98..d0c0a923f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -54,6 +54,7 @@ #include 
"zpool_util.h" #include "zfs_comutil.h" +#include "zfeature_common.h" #include "statcommon.h" @@ -208,7 +209,7 @@ get_usage(zpool_help_t idx) { case HELP_CLEAR: return (gettext("\tclear [-nF] <pool> [device]\n")); case HELP_CREATE: - return (gettext("\tcreate [-fn] [-o property=value] ... \n" + return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] <pool> <vdev> ...\n")); case HELP_DESTROY: @@ -341,6 +342,12 @@ usage(boolean_t requested) /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); + + (void) fprintf(fp, "\t%-15s ", "feature@..."); + (void) fprintf(fp, "YES disabled | enabled | active\n"); + + (void) fprintf(fp, gettext("\nThe feature@ properties must be " + "appended with a feature name.\nSee zpool-features(5).\n")); } /* @@ -407,12 +414,16 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, proplist = *props; if (poolprop) { - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && + !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } - normnm = zpool_prop_to_name(prop); + if (zpool_prop_feature(propname)) + normnm = propname; + else + normnm = zpool_prop_to_name(prop); } else { if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { normnm = zfs_prop_to_name(fprop); @@ -601,7 +612,7 @@ zpool_do_remove(int argc, char **argv) } /* - * zpool create [-fn] [-o property=value] ... + * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] <pool> <dev> ... * @@ -610,8 +621,10 @@ zpool_do_remove(int argc, char **argv) * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's - * '/<pool>' + * '/<pool>' * -o Set property=value. 
+ * -d Don't automatically enable all supported pool features + * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The @@ -624,6 +637,7 @@ zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; + boolean_t enable_all_pool_feat = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; @@ -635,7 +649,7 @@ zpool_do_create(int argc, char **argv) char *propval; /* check options */ - while ((c = getopt(argc, argv, ":fnR:m:o:O:")) != -1) { + while ((c = getopt(argc, argv, ":fndR:m:o:O:")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -643,6 +657,9 @@ zpool_do_create(int argc, char **argv) case 'n': dryrun = B_TRUE; break; + case 'd': + enable_all_pool_feat = B_FALSE; + break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( @@ -670,6 +687,21 @@ zpool_do_create(int argc, char **argv) if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; + + /* + * If the user is creating a pool that doesn't support + * feature flags, don't enable any features. + */ + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { + char *end; + u_longlong_t ver; + + ver = strtoull(propval, &end, 10); + if (*end == '\0' && + ver < SPA_VERSION_FEATURES) { + enable_all_pool_feat = B_FALSE; + } + } break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { @@ -735,7 +767,6 @@ zpool_do_create(int argc, char **argv) goto errout; } - if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); @@ -817,6 +848,27 @@ zpool_do_create(int argc, char **argv) /* * Hand off to libzfs. 
*/ + if (enable_all_pool_feat) { + int i; + for (i = 0; i < SPA_FEATURES; i++) { + char propname[MAXPATHLEN]; + zfeature_info_t *feat = &spa_feature_table[i]; + + (void) snprintf(propname, sizeof (propname), + "feature@%s", feat->fi_uname); + + /* + * Skip feature if user specified it manually + * on the command line. + */ + if (nvlist_exists(props, propname)) + continue; + + if (add_prop_list(propname, ZFS_FEATURE_ENABLED, + &props, B_TRUE) != 0) + goto errout; + } + } if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, poolname, @@ -1148,6 +1200,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (void) printf(gettext("newer version")); break; + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &cb.cb_guid) == 0); @@ -1265,6 +1321,10 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) (void) printf(gettext("newer version")); break; + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; @@ -1431,6 +1491,20 @@ show_import(nvlist_t *config) "incompatible version.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("status: The pool uses the following " + "feature(s) not supported on this sytem:\n")); + zpool_print_unsup_feat(config); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("status: The pool can only be accessed " + "in read-only mode on this system. 
It\n\tcannot be " + "accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + break; + case ZPOOL_STATUS_HOSTID_MISMATCH: (void) printf(gettext(" status: The pool was last accessed by " "another system.\n")); @@ -1488,6 +1562,20 @@ show_import(nvlist_t *config) "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("action: The pool cannot be " + "imported. Access the pool on a system that " + "supports\n\tthe required feature(s), or recreate " + "the pool from backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("action: The pool cannot be " + "imported in read-write mode. Import the pool " + "with\n" + "\t\"-o readonly=on\", access the pool on a system " + "that supports the\n\trequired feature(s), or " + "recreate the pool from backup.\n")); + break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: @@ -1563,9 +1651,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ZPOOL_CONFIG_POOL_STATE, &state) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " - "is formatted using a newer ZFS version\n"), name); + "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (state != POOL_STATE_EXPORTED && !(flags & ZFS_IMPORT_ANY_HOST)) { @@ -2556,15 +2644,13 @@ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { - if (pl->pl_prop == ZPROP_INVAL) - continue; - width = pl->pl_width; if (first && cb->cb_verbose) { /* @@ -2579,8 
+2665,18 @@ print_header(list_cbdata_t *cb) else first = B_FALSE; - header = zpool_prop_column_name(pl->pl_prop); - right_justify = zpool_prop_align_right(pl->pl_prop); + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zpool_prop_column_name(pl->pl_prop); + right_justify = zpool_prop_align_right(pl->pl_prop); + } else { + int i; + + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); @@ -2639,6 +2735,11 @@ print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); + } else if ((zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop)) && + zpool_prop_get_feature(zhp, pl->pl_user_prop, property, + sizeof (property)) == 0) { + propstr = property; } else { propstr = "-"; } @@ -3958,6 +4059,31 @@ status_callback(zpool_handle_t *zhp, void *data) "backup.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("status: The pool cannot be accessed on " + "this system because it uses the\n\tfollowing feature(s) " + "not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + (void) printf(gettext("action: Access the pool from a system " + "that supports the required feature(s),\n\tor restore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("status: The pool can only be accessed " + "in read-only mode on this system. It\n\tcannot be " + "accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + (void) printf(gettext("action: The pool cannot be accessed in " + "read-write mode. 
Import the pool with\n" + "\t\"-o readonly=on\", access the pool from a system that " + "supports the\n\trequired feature(s), or restore the " + "pool from backup.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_R: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors.\n\tSufficient " @@ -4182,7 +4308,8 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (!cbp->cb_newer && version < SPA_VERSION) { + if (!cbp->cb_newer && SPA_VERSION_IS_SUPPORTED(version) && + version != SPA_VERSION) { if (!cbp->cb_all) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " @@ -4205,13 +4332,14 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) "'%s'\n\n"), zpool_get_name(zhp)); } } - } else if (cbp->cb_newer && version > SPA_VERSION) { + } else if (cbp->cb_newer && !SPA_VERSION_IS_SUPPORTED(version)) { assert(!cbp->cb_all); if (cbp->cb_first) { (void) printf(gettext("The following pools are " - "formatted using a newer software version and\n" - "cannot be accessed on the current system.\n\n")); + "formatted using an unsupported software version " + "and\ncannot be accessed on the current " + "system.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; @@ -4295,8 +4423,8 @@ zpool_do_upgrade(int argc, char **argv) break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); - if (*end != '\0' || cb.cb_version > SPA_VERSION || - cb.cb_version < SPA_VERSION_1) { + if (*end != '\0' || + !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); @@ -4341,8 +4469,8 @@ zpool_do_upgrade(int argc, char **argv) } } - (void) printf(gettext("This system is currently running " - "ZFS pool version %llu.\n\n"), SPA_VERSION); + (void) printf(gettext("This system supports ZFS pool feature " + "flags.\n\n")); cb.cb_first = B_TRUE; if 
(showversions) { (void) printf(gettext("The following versions are " @@ -4923,13 +5051,26 @@ get_callback(zpool_handle_t *zhp, void *data) pl == cbp->cb_proplist) continue; - if (zpool_get_prop(zhp, pl->pl_prop, - value, sizeof (value), &srctype) != 0) - continue; + if (pl->pl_prop == ZPROP_INVAL && + (zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop))) { + srctype = ZPROP_SRC_LOCAL; + + if (zpool_prop_get_feature(zhp, pl->pl_user_prop, + value, sizeof (value)) == 0) { + zprop_print_one_property(zpool_get_name(zhp), + cbp, pl->pl_user_prop, value, srctype, + NULL, NULL); + } + } else { + if (zpool_get_prop(zhp, pl->pl_prop, value, + sizeof (value), &srctype) != 0) + continue; - zprop_print_one_property(zpool_get_name(zhp), cbp, - zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, - NULL); + zprop_print_one_property(zpool_get_name(zhp), cbp, + zpool_prop_to_name(pl->pl_prop), value, srctype, + NULL, NULL); + } } return (0); } @@ -4941,8 +5082,11 @@ zpool_do_get(int argc, char **argv) zprop_list_t fake_name = { 0 }; int ret; - if (argc < 3) + if (argc < 2) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); usage(B_FALSE); + } cb.cb_first = B_TRUE; cb.cb_sources = ZPROP_SRC_ALL; @@ -4952,7 +5096,7 @@ zpool_do_get(int argc, char **argv) cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; - if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, + if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 7e941b5cb..cc2db31e4 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -109,6 +109,7 @@ #include <sys/dsl_scan.h> #include <sys/zio_checksum.h> #include <sys/refcount.h> +#include <sys/zfeature.h> #include <stdio.h> #include <stdio_ext.h> #include <stdlib.h> @@ -5771,10 +5772,9 @@ make_random_props(void) { nvlist_t *props; - if (ztest_random(2) == 0) - return (NULL); - VERIFY(nvlist_alloc(&props, 
NV_UNIQUE_NAME, 0) == 0); + if (ztest_random(2) == 0) + return (props); VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); return (props); @@ -5789,6 +5789,7 @@ ztest_init(ztest_shared_t *zs) { spa_t *spa; nvlist_t *nvroot, *props; + int i; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&ztest_name_lock, NULL, RW_DEFAULT, NULL); @@ -5805,6 +5806,13 @@ ztest_init(ztest_shared_t *zs) nvroot = make_vdev_root(NULL, NULL, ztest_opts.zo_vdev_size, 0, 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); + for (i = 0; i < SPA_FEATURES; i++) { + char *buf; + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", + spa_feature_table[i].fi_uname)); + VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); + free(buf); + } VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); nvlist_free(nvroot); diff --git a/configure.ac b/configure.ac index 2b79f5ff2..fc38f2c55 100644 --- a/configure.ac +++ b/configure.ac @@ -89,6 +89,7 @@ AC_CONFIG_FILES([ lib/libshare/Makefile cmd/Makefile cmd/zdb/Makefile + cmd/zhack/Makefile cmd/zfs/Makefile cmd/zinject/Makefile cmd/zpool/Makefile diff --git a/include/Makefile.am b/include/Makefile.am index 8f9c8d729..8325a8ec0 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -1,6 +1,7 @@ SUBDIRS = linux sys COMMON_H = \ + $(top_srcdir)/include/zfeature_common.h \ $(top_srcdir)/include/zfs_comutil.h \ $(top_srcdir)/include/zfs_deleg.h \ $(top_srcdir)/include/zfs_fletcher.h \ diff --git a/include/libzfs.h b/include/libzfs.h index e59350c9d..83ac34394 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -301,6 +301,15 @@ typedef enum { ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* + * If the pool has unsupported features but can still be opened in + * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the + * pool has unsupported features but cannot be opened at all, its + * status is ZPOOL_STATUS_UNSUP_FEAT_READ. 
+ */ + ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ + ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ + + /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. @@ -332,6 +341,7 @@ extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); * Statistics and configuration functions. */ extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); +extern nvlist_t *zpool_get_features(zpool_handle_t *); extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); @@ -344,6 +354,7 @@ extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); +extern void zpool_print_unsup_feat(nvlist_t *config); /* * Search for pools to import @@ -435,6 +446,8 @@ extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, + char *buf, size_t len); extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, uint64_t *usedp); extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); @@ -462,10 +475,19 @@ extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" +#define ZFS_FEATURE_DISABLED "disabled" +#define ZFS_FEATURE_ENABLED "enabled" +#define ZFS_FEATURE_ACTIVE "active" + +#define ZFS_UNSUPPORTED_INACTIVE "inactive" +#define ZFS_UNSUPPORTED_READONLY "readonly" + /* * zpool property management */ extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); +extern int 
zpool_prop_get_feature(zpool_handle_t *, const char *, char *, + size_t); extern const char *zpool_prop_default_string(zpool_prop_t); extern uint64_t zpool_prop_default_numeric(zpool_prop_t); extern const char *zpool_prop_column_name(zpool_prop_t); diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 651e68b0c..a0cc9d1d6 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -6,6 +6,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/avl_impl.h \ $(top_srcdir)/include/sys/bplist.h \ $(top_srcdir)/include/sys/bpobj.h \ + $(top_srcdir)/include/sys/bptree.h \ $(top_srcdir)/include/sys/dbuf.h \ $(top_srcdir)/include/sys/ddt.h \ $(top_srcdir)/include/sys/dmu.h \ @@ -53,6 +54,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/zap.h \ $(top_srcdir)/include/sys/zap_impl.h \ $(top_srcdir)/include/sys/zap_leaf.h \ + $(top_srcdir)/include/sys/zfeature.h \ $(top_srcdir)/include/sys/zfs_acl.h \ $(top_srcdir)/include/sys/zfs_context.h \ $(top_srcdir)/include/sys/zfs_ctldir.h \ diff --git a/include/sys/bptree.h b/include/sys/bptree.h new file mode 100644 index 000000000..971507211 --- /dev/null +++ b/include/sys/bptree.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BPTREE_H +#define _SYS_BPTREE_H + +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bptree_phys { + uint64_t bt_begin; + uint64_t bt_end; + uint64_t bt_bytes; + uint64_t bt_comp; + uint64_t bt_uncomp; +} bptree_phys_t; + +typedef struct bptree_entry_phys { + blkptr_t be_bp; + uint64_t be_birth_txg; /* only delete blocks born after this txg */ + zbookmark_t be_zb; /* holds traversal resume point if needed */ +} bptree_entry_phys_t; + +typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); +int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); + +void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); + +int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, + bptree_itor_t func, void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPTREE_H */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index fe317c835..ce3169731 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ @@ -71,6 +71,53 @@ typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; +typedef enum dmu_object_byteswap { + DMU_BSWAP_UINT8, + DMU_BSWAP_UINT16, + DMU_BSWAP_UINT32, + DMU_BSWAP_UINT64, + DMU_BSWAP_ZAP, + DMU_BSWAP_DNODE, + DMU_BSWAP_OBJSET, + DMU_BSWAP_ZNODE, + DMU_BSWAP_OLDACL, + DMU_BSWAP_ACL, + /* + * Allocating a new byteswap type number makes the on-disk format + * incompatible with any other format that uses the same number. + * + * Data can usually be structured to work with one of the + * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. + */ + DMU_BSWAP_NUMFUNCS +} dmu_object_byteswap_t; + +#define DMU_OT_NEWTYPE 0x80 +#define DMU_OT_METADATA 0x40 +#define DMU_OT_BYTESWAP_MASK 0x3f + +/* + * Defines a uint8_t object type. Object types specify if the data + * in the object is metadata (boolean) and how to byteswap the data + * (dmu_object_byteswap_t). + */ +#define DMU_OT(byteswap, metadata) \ + (DMU_OT_NEWTYPE | \ + ((metadata) ? DMU_OT_METADATA : 0) | \ + ((byteswap) & DMU_OT_BYTESWAP_MASK)) + +#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ + (ot) < DMU_OT_NUMTYPES) + +#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_METADATA) : \ + dmu_ot[(int)(ot)].ot_metadata) + +#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) : \ + dmu_ot[(int)(ot)].ot_byteswap) + typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ @@ -135,7 +182,35 @@ typedef enum dmu_object_type { DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ - DMU_OT_NUMTYPES + /* + * Do not allocate new object types here. Doing so makes the on-disk + * format incompatible with any other format that uses the same object + * type number. 
+ * + * When creating an object which does not have one of the above types + * use the DMU_OTN_* type with the correct byteswap and metadata + * values. + * + * The DMU_OTN_* types do not have entries in the dmu_ot table, + * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead + * of indexing into dmu_ot directly (this works for both DMU_OT_* types + * and DMU_OTN_* types). + */ + DMU_OT_NUMTYPES, + + /* + * Names for valid types declared with DMU_OT(). + */ + DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), + DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), + DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), + DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), + DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), + DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), + DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), + DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), + DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), + DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum dmu_objset_type { @@ -215,6 +290,9 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" +#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" +#define DMU_POOL_FEATURES_FOR_READ "features_for_read" +#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" @@ -230,6 +308,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" +#define DMU_POOL_BPTREE_OBJ "bptree_obj" /* * Allocate an object from this objset.
The range of object numbers @@ -490,7 +569,7 @@ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, /* * Free up the data blocks for a defined range of a file. If size is - * zero, the range from offset to end-of-file is freed. + * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); @@ -564,12 +643,18 @@ typedef struct dmu_object_info { typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { - arc_byteswap_func_t *ot_byteswap; + dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; +typedef struct dmu_object_byteswap_info { + arc_byteswap_func_t *ob_func; + char *ob_name; +} dmu_object_byteswap_info_t; + extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; +extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h index 5b326cd99..3cbf42f56 100644 --- a/include/sys/dmu_traverse.h +++ b/include/sys/dmu_traverse.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DMU_TRAVERSE_H @@ -54,6 +55,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg); int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 38ce3c567..547951cd0 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -86,7 +86,12 @@ typedef struct dsl_dataset_phys { uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ - uint64_t ds_used_bytes; + /* + * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes + * include all blocks referenced by this dataset, including those + * shared with any other datasets. + */ + uint64_t ds_referenced_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; uint64_t ds_unique_bytes; /* only relevant to snapshots */ diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 40e96101d..16fb98669 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_POOL_H @@ -34,6 +35,7 @@ #include <sys/ddt.h> #include <sys/arc.h> #include <sys/bpobj.h> +#include <sys/bptree.h> #ifdef __cplusplus extern "C" { @@ -48,7 +50,8 @@ struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_TOTAL DMU_OT_NUMTYPES +#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ +#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) typedef struct zfs_blkstat { uint64_t zb_count; @@ -93,6 +96,7 @@ typedef struct dsl_pool { uint64_t dp_write_limit; uint64_t dp_tmp_userrefs_obj; bpobj_t dp_free_bpobj; + uint64_t dp_bptree_obj; struct dsl_scan *dp_scan; @@ -121,7 +125,8 @@ typedef struct dsl_pool { zfs_all_blkstats_t *dp_blkstats; } dsl_pool_t; -int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_open(dsl_pool_t *dp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index c79666e67..5691f4d14 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_SCAN_H @@ -79,6 +80,9 @@ typedef struct dsl_scan { uint64_t scn_sync_start_time; zio_t *scn_zio_root; + /* for freeing blocks */ + boolean_t scn_is_bptree; + /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f72c74fc9..61596f7d7 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -170,6 +170,7 @@ typedef enum { ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, + ZPOOL_PROP_FREEING, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -244,6 +245,8 @@ const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); +boolean_t zpool_prop_feature(const char *); +boolean_t zpool_prop_unsupported(const char *); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); @@ -356,6 +359,7 @@ typedef enum { #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL +#define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk @@ -363,8 +367,8 @@ typedef enum { * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_28 -#define SPA_VERSION_STRING "28" +#define SPA_VERSION SPA_VERSION_5000 +#define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. 
@@ -415,6 +419,12 @@ typedef enum { #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 +#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 +#define SPA_VERSION_FEATURES SPA_VERSION_5000 + +#define SPA_VERSION_IS_SUPPORTED(v) \ + (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ + ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -512,6 +522,11 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ +#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ +#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -590,6 +605,7 @@ typedef enum vdev_aux { VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index 30ff4e066..cc399fd16 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -20,12 +20,14 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. 
All rights reserved. */ #ifndef _SYS_NVPAIR_H #define _SYS_NVPAIR_H #include <sys/types.h> +#include <sys/time.h> #include <sys/errno.h> #include <sys/va_list.h> @@ -274,6 +276,73 @@ int nvpair_value_hrtime(nvpair_t *, hrtime_t *); int nvpair_value_double(nvpair_t *, double *); #endif +nvlist_t *fnvlist_alloc(void); +void fnvlist_free(nvlist_t *); +size_t fnvlist_size(nvlist_t *); +char *fnvlist_pack(nvlist_t *, size_t *); +void fnvlist_pack_free(char *, size_t); +nvlist_t *fnvlist_unpack(char *, size_t); +nvlist_t *fnvlist_dup(nvlist_t *); +void fnvlist_merge(nvlist_t *, nvlist_t *); + +void fnvlist_add_boolean(nvlist_t *, const char *); +void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); +void fnvlist_add_int8(nvlist_t *, const char *, int8_t); +void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); +void fnvlist_add_int16(nvlist_t *, const char *, int16_t); +void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); +void fnvlist_add_int32(nvlist_t *, const char *, int32_t); +void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); +void fnvlist_add_int64(nvlist_t *, const char *, int64_t); +void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); +void fnvlist_add_string(nvlist_t *, const char *, const char *); +void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); +void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +void 
fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); +void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); + +void fnvlist_remove(nvlist_t *, const char *); +void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); + +nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); +uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); +int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); +int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); +int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); +int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); +uint8_t fnvlist_lookup_uint8(nvlist_t *nvl, const char *name); +uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); +uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); +uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); +char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); +nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); + +boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); +uchar_t fnvpair_value_byte(nvpair_t *nvp); +int8_t fnvpair_value_int8(nvpair_t *nvp); +int16_t fnvpair_value_int16(nvpair_t *nvp); +int32_t fnvpair_value_int32(nvpair_t *nvp); +int64_t fnvpair_value_int64(nvpair_t *nvp); +uint8_t fnvpair_value_uint8(nvpair_t *nvp); +uint16_t fnvpair_value_uint16(nvpair_t *nvp); +uint32_t fnvpair_value_uint32(nvpair_t *nvp); +uint64_t fnvpair_value_uint64(nvpair_t *nvp); +char *fnvpair_value_string(nvpair_t *nvp); +nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); + #ifdef __cplusplus } #endif 
diff --git a/include/sys/spa.h b/include/sys/spa.h index 28bb4e1de..821172297 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -94,7 +94,7 @@ struct dsl_pool; /* * Size of block to hold the configuration data (a packed nvlist) */ -#define SPA_CONFIG_BLOCKSIZE (1 << 14) +#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. @@ -262,7 +262,7 @@ typedef struct blkptr { DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ @@ -404,8 +404,8 @@ typedef struct blkptr { #include <sys/dmu.h> #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA); + (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? 
\ + ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, @@ -416,8 +416,8 @@ typedef enum spa_import_type { extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); -extern int spa_get_stats(const char *pool, nvlist_t **config, - char *altroot, size_t buflen); +extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, + size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); extern int spa_import_rootpool(char *devpath, char *devid); @@ -574,6 +574,7 @@ extern void spa_claim_notify(zio_t *zio); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern boolean_t spa_is_initializing(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); @@ -605,6 +606,8 @@ extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); /* Miscellaneous support routines */ +extern void spa_activate_mos_feature(spa_t *spa, const char *feature); +extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern int spa_rename(const char *oldname, const char *newname); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index d367486a0..85a825d08 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. 
All rights reserved. */ @@ -127,6 +127,7 @@ struct spa { uint64_t spa_import_flags; /* import specific flags */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; + boolean_t spa_is_initializing; /* true while opening pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ uint64_t spa_first_txg; /* first txg after spa_open() */ @@ -144,6 +145,7 @@ struct spa { list_t spa_state_dirty_list; /* vdevs with dirty state */ spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ @@ -220,7 +222,10 @@ struct spa { boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ - uint64_t spa_prev_software_version; + uint64_t spa_prev_software_version; /* See ub_software_version */ + uint64_t spa_feat_for_write_obj; /* required to write to pool */ + uint64_t spa_feat_for_read_obj; /* required to read from pool */ + uint64_t spa_feat_desc_obj; /* Feature descriptions */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 005578398..51eb855ee 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
@@ -140,8 +141,8 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd); -extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); +extern nvlist_t *vdev_label_read_config(vdev_t *vd, int label); +extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 0b532dcdd..4133f2cf3 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -210,7 +210,7 @@ struct vdev { * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is - * larger in userland, the offsets for the rest fields would be + * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ @@ -265,6 +265,7 @@ typedef struct vdev_label { #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 +#define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 diff --git a/include/sys/zap.h b/include/sys/zap.h index 6237f8bf5..4d7b31559 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_ZAP_H @@ -132,6 +133,8 @@ uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -300,10 +303,6 @@ int zap_add_int_key(objset_t *os, uint64_t obj, int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); -/* - * They name is a stringified version of key; increment its value by - * delta. Zero values will be zap_remove()-ed. - */ int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx); diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h new file mode 100644 index 000000000..9ff1c93df --- /dev/null +++ b/include/sys/zfeature.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZFEATURE_H +#define _SYS_ZFEATURE_H + +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include "zfeature_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, + uint64_t desc_obj, nvlist_t *unsup_feat); + +struct spa; +extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); +extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *); +extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFEATURE_H */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 289238c36..052797928 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -24,6 +24,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _ZIO_H @@ -278,6 +279,14 @@ typedef struct zbookmark { #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) +#define ZB_IS_ZERO(zb) \ + ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ + (zb)->zb_level == 0 && (zb)->zb_blkid == 0) +#define ZB_IS_ROOT(zb) \ + ((zb)->zb_object == ZB_ROOT_OBJECT && \ + (zb)->zb_level == ZB_ROOT_LEVEL && \ + (zb)->zb_blkid == ZB_ROOT_BLKID) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; @@ -295,6 +304,7 @@ typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ +struct dnode_phys; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -567,6 +577,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); +/* zbookmark functions */ +boolean_t zbookmark_is_before(const struct dnode_phys *dnp, + const zbookmark_t *zb1, const zbookmark_t *zb2); + #ifdef __cplusplus } #endif diff --git a/include/zfeature_common.h b/include/zfeature_common.h new file mode 100644 index 000000000..27f8f00a0 --- /dev/null +++ b/include/zfeature_common.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _ZFEATURE_COMMON_H +#define _ZFEATURE_COMMON_H + +#include <sys/fs/zfs.h> +#include <sys/inttypes.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfeature_info; + +typedef struct zfeature_info { + const char *fi_uname; /* User-facing feature name */ + const char *fi_guid; /* On-disk feature identifier */ + const char *fi_desc; /* Feature description */ + boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */ + boolean_t fi_mos; /* Is the feature necessary to read the MOS? */ + struct zfeature_info **fi_depends; /* array; null terminated */ +} zfeature_info_t; + +typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg); + +#define ZFS_FEATURE_DEBUG + +typedef enum spa_feature { + SPA_FEATURE_ASYNC_DESTROY, + SPA_FEATURES +} spa_feature_t; + +extern zfeature_info_t spa_feature_table[SPA_FEATURES]; + +extern boolean_t zfeature_is_valid_guid(const char *); + +extern boolean_t zfeature_is_supported(const char *); +extern int zfeature_lookup_guid(const char *, zfeature_info_t **res); +extern int zfeature_lookup_name(const char *, zfeature_info_t **res); + +extern void zpool_feature_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFEATURE_COMMON_H */ diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am index 467f461a3..7a8f96a05 100644 --- a/lib/libnvpair/Makefile.am +++ b/lib/libnvpair/Makefile.am @@ -12,7 +12,8 @@ libnvpair_la_SOURCES = \ $(top_srcdir)/lib/libnvpair/libnvpair.c \ $(top_srcdir)/lib/libnvpair/nvpair_alloc_system.c \ $(top_srcdir)/module/nvpair/nvpair_alloc_fixed.c \ - $(top_srcdir)/module/nvpair/nvpair.c + $(top_srcdir)/module/nvpair/nvpair.c \ + 
$(top_srcdir)/module/nvpair/fnvpair.c libnvpair_la_LIBADD = \ $(top_builddir)/lib/libuutil/libuutil.la diff --git a/lib/libnvpair/libnvpair.c b/lib/libnvpair/libnvpair.c index 606a919a4..b852cb617 100644 --- a/lib/libnvpair/libnvpair.c +++ b/lib/libnvpair/libnvpair.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <unistd.h> @@ -803,6 +804,10 @@ dump_nvlist(nvlist_t *list, int indent) while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + (void) printf("%*s%s\n", indent, "", nvpair_name(elem)); + break; + case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(elem, &bool_value); (void) printf("%*s%s: %s\n", indent, "", diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c index b36dee1e5..ee94fe106 100644 --- a/lib/libzfs/libzfs_config.c +++ b/lib/libzfs/libzfs_config.c @@ -18,12 +18,17 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* * The pool configuration repository is stored in /etc/zfs/zpool.cache as a * single packed nvlist. While it would be nice to just read in this * file from userland, this wouldn't work from a local zone. So we have to have @@ -218,6 +223,36 @@ zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig) } /* + * Retrieves a list of enabled features and their refcounts and caches it in + * the pool handle. 
+ */ +nvlist_t * +zpool_get_features(zpool_handle_t *zhp) +{ + nvlist_t *config, *features; + + config = zpool_get_config(zhp, NULL); + + if (config == NULL || !nvlist_exists(config, + ZPOOL_CONFIG_FEATURE_STATS)) { + int error; + boolean_t missing = B_FALSE; + + error = zpool_refresh_stats(zhp, &missing); + + if (error != 0 || missing) + return (NULL); + + config = zpool_get_config(zhp, NULL); + } + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + &features) == 0); + + return (features); +} + +/* * Refresh the vdev statistics associated with the given pool. This is used in * iostat to show configuration changes and determine the delta from the last * time the function was called. This function can fail, in case the pool has diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index e2a67c6a7..eca1dc36a 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -45,6 +45,7 @@ #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" +#include "zfeature_common.h" static int read_efi_label(nvlist_t *config, diskaddr_t *sb); @@ -273,6 +274,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: + case ZPOOL_PROP_FREEING: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_ASHIFT: (void) zfs_nicenum(intval, buf, len); @@ -299,6 +301,12 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, (void) strlcpy(buf, zpool_state_to_name(intval, vs->vs_aux), len); break; + case ZPOOL_PROP_VERSION: + if (intval >= SPA_VERSION_FEATURES) { + (void) snprintf(buf, len, "-"); + break; + } + /* FALLTHROUGH */ default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } @@ -403,10 +411,48 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); + prop = zpool_name_to_prop(propname); + if (prop == 
ZPROP_INVAL && zpool_prop_feature(propname)) { + int err; + zfeature_info_t *feature; + char *fname = strchr(propname, '@') + 1; + + err = zfeature_lookup_name(fname, &feature); + if (err != 0) { + ASSERT3U(err, ==, ENOENT); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid feature '%s'"), fname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set to " + "'enabled'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvlist_add_uint64(retprops, propname, 0) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + /* * Make sure this property is valid and applies to this type. 
*/ - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); @@ -431,7 +477,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, default: break; case ZPOOL_PROP_VERSION: - if (intval < version || intval > SPA_VERSION) { + if (intval < version || + !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %d is invalid."), propname, intval); @@ -673,10 +720,79 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; + nvlist_t *features = NULL; + nvpair_t *nvp; + zprop_list_t **last; + boolean_t firstexpand = (NULL == *plp); + int i; if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) return (-1); + last = plp; + while (*last != NULL) + last = &(*last)->pl_next; + + if ((*plp)->pl_all) + features = zpool_get_features(zhp); + + if ((*plp)->pl_all && firstexpand) { + for (i = 0; i < SPA_FEATURES; i++) { + zprop_list_t *entry = zfs_alloc(hdl, + sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", + spa_feature_table[i].fi_uname); + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + } + + /* add any unsupported features */ + for (nvp = nvlist_next_nvpair(features, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { + char *propname; + boolean_t found; + zprop_list_t *entry; + + if (zfeature_is_supported(nvpair_name(nvp))) + continue; + + propname = zfs_asprintf(hdl, "unsupported@%s", + nvpair_name(nvp)); + + /* + * Before adding the property to the list make sure that no + * other pool already added the same property. 
+ */ + found = B_FALSE; + entry = *plp; + while (entry != NULL) { + if (entry->pl_user_prop != NULL && + strcmp(propname, entry->pl_user_prop) == 0) { + found = B_TRUE; + break; + } + entry = entry->pl_next; + } + if (found) { + free(propname); + continue; + } + + entry = zfs_alloc(hdl, sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = propname; + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) @@ -693,6 +809,66 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) return (0); } +/* + * Get the state for the given feature on the given ZFS pool. + */ +int +zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, + size_t len) +{ + uint64_t refcount; + boolean_t found = B_FALSE; + nvlist_t *features = zpool_get_features(zhp); + boolean_t supported; + const char *feature = strchr(propname, '@') + 1; + + supported = zpool_prop_feature(propname); + ASSERT(supported || zpool_prop_unsupported(propname)); + + /* + * Convert from feature name to feature guid. This conversion is + * unnecessary for unsupported@... properties because they already + * use guids. 
+ */ + if (supported) { + int ret; + zfeature_info_t *fi; + + ret = zfeature_lookup_name(feature, &fi); + if (ret != 0) { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + feature = fi->fi_guid; + } + + if (nvlist_lookup_uint64(features, feature, &refcount) == 0) + found = B_TRUE; + + if (supported) { + if (!found) { + (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); + } else { + if (refcount == 0) + (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); + else + (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); + } + } else { + if (found) { + if (refcount == 0) { + (void) strlcpy(buf, ZFS_UNSUPPORTED_INACTIVE, len); + } else { + (void) strlcpy(buf, ZFS_UNSUPPORTED_READONLY, len); + } + } else { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + } + + return (0); +} /* * Don't start the slice at the default block of 34; many storage @@ -1291,8 +1467,10 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, if (!hdl->libzfs_printerr || config == NULL) return; - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0) + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; + } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; @@ -1349,6 +1527,7 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; @@ -1473,6 +1652,31 @@ print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, } } +void +zpool_print_unsup_feat(nvlist_t *config) +{ + nvlist_t *nvinfo, *unsup_feat; + nvpair_t *nvp; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == + 0); + verify(nvlist_lookup_nvlist(nvinfo, 
ZPOOL_CONFIG_UNSUP_FEAT, + &unsup_feat) == 0); + + for (nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(unsup_feat, nvp)) { + char *desc; + + verify(nvpair_type(nvp) == DATA_TYPE_STRING); + verify(nvpair_value_string(nvp, &desc) == 0); + + if (strlen(desc) > 0) + (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); + else + (void) printf("\t%s\n", nvpair_name(nvp)); + } +} + /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from @@ -1579,6 +1783,22 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, switch (error) { case ENOTSUP: + if (nv != NULL && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { + (void) printf(dgettext(TEXT_DOMAIN, "This " + "pool uses the following feature(s) not " + "supported by this system:\n")); + zpool_print_unsup_feat(nv); + if (nvlist_exists(nvinfo, + ZPOOL_CONFIG_CAN_RDONLY)) { + (void) printf(dgettext(TEXT_DOMAIN, + "All unsupported features are only " + "required for writing to the pool." + "\nThe pool can be imported using " + "'-o readonly=on'.\n")); + } + } /* * Unsupported version. */ diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index d56baf0bf..ef6a64133 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -215,6 +217,20 @@ check_status(nvlist_t *config, boolean_t isimport) return (ZPOOL_STATUS_VERSION_NEWER); /* + * Unsupported feature(s). 
+ */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_UNSUP_FEAT) { + nvlist_t *nvinfo; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + &nvinfo) == 0); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) + return (ZPOOL_STATUS_UNSUP_FEAT_WRITE); + return (ZPOOL_STATUS_UNSUP_FEAT_READ); + } + + /* * Check that the config is complete. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && @@ -301,7 +317,7 @@ check_status(nvlist_t *config, boolean_t isimport) /* * Outdated, but usable, version */ - if (version < SPA_VERSION) + if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) return (ZPOOL_STATUS_VERSION_OLDER); return (ZPOOL_STATUS_OK); diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 42700877a..a4e1255bb 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -18,9 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ /* @@ -47,6 +48,7 @@ #include "libzfs_impl.h" #include "zfs_prop.h" +#include "zfeature_common.h" int libzfs_errno(libzfs_handle_t *hdl) @@ -114,7 +116,8 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: - return (dgettext(TEXT_DOMAIN, "unsupported version")); + return (dgettext(TEXT_DOMAIN, "unsupported version or " + "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: @@ -709,6 +712,7 @@ libzfs_init(void) zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); libzfs_mnttab_init(hdl); return (hdl); @@ -1532,9 +1536,11 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, * this is a pool property or if this isn't a user-defined * dataset property, */ - if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - (!zfs_prop_user(propname) && !zfs_prop_userquota(propname) && - !zfs_prop_written(propname)))) { + if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && + !zpool_prop_feature(propname) && + !zpool_prop_unsupported(propname)) || + (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && + !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, @@ -1546,7 +1552,8 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, entry->pl_prop = prop; if (prop == ZPROP_INVAL) { - if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) { + if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == + NULL) { free(entry); return (-1); } diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index cbe3acd34..1285af325 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -23,6 +23,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/arc.c \ $(top_srcdir)/module/zfs/bplist.c \ $(top_srcdir)/module/zfs/bpobj.c \ + 
$(top_srcdir)/module/zfs/bptree.c \ $(top_srcdir)/module/zfs/dbuf.c \ $(top_srcdir)/module/zfs/ddt.c \ $(top_srcdir)/module/zfs/ddt_zap.c \ @@ -74,6 +75,8 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/zap.c \ $(top_srcdir)/module/zfs/zap_leaf.c \ $(top_srcdir)/module/zfs/zap_micro.c \ + $(top_srcdir)/module/zfs/zfeature.c \ + $(top_srcdir)/module/zfs/zfeature_common.c \ $(top_srcdir)/module/zfs/zfs_byteswap.c \ $(top_srcdir)/module/zfs/zfs_debug.c \ $(top_srcdir)/module/zfs/zfs_fm.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index c38efd0aa..0e10c8951 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -647,7 +647,9 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. */ - split = (len > 0 ? rand() % len : 0); + int sectors = len >> SPA_MINBLOCKSHIFT; + split = (sectors > 0 ? rand() % sectors : 0) << + SPA_MINBLOCKSHIFT; rc = pwrite64(vp->v_fd, addr, split, offset); if (rc != -1) { done = rc; diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 new file mode 100644 index 000000000..453e1ad2a --- /dev/null +++ b/man/man5/zpool-features.5 @@ -0,0 +1,172 @@ +'\" te +.\" Copyright (c) 2012 by Delphix. All rights reserved. +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.TH ZPOOL-FEATURES 5 "Mar 16, 2012" +.SH NAME +zpool\-features \- ZFS pool feature descriptions +.SH DESCRIPTION +.sp +.LP +ZFS pool on\-disk format versions are specified via "features" which replace +the old on\-disk format numbers (the last supported on\-disk format number is +28). To enable a feature on a pool use the \fBzpool\fR(8) command to set +the \fBfeature@\fR\fIfeature_name\fR property to \fBenabled\fR. +.sp +.LP +The pool format does not affect file system version compatibility or the ability +to send file systems between pools. +.sp +.LP +Since most features can be enabled independently of each other the on\-disk +format of the pool is specified by the set of all features marked as +\fBactive\fR on the pool. If the pool was created by another software version +this set may include unsupported features. +.SS "Identifying features" +.sp +.LP +Every feature has a guid of the form \fIcom.example:feature_name\fR. The reverse +DNS name ensures that the feature's guid is unique across all ZFS +implementations. When unsupported features are encountered on a pool they will +be identified by their guids. Refer to the documentation for the ZFS +implementation that created the pool for information about those features. +.sp +.LP +Each supported feature also has a short name. By convention a feature's short +name is the portion of its guid which follows the ':' (e.g. +\fIcom.example:feature_name\fR would have the short name \fIfeature_name\fR), +however a feature's short name may differ across ZFS implementations if +following the convention would result in name conflicts. 
+.SS "Feature states" +.sp +.LP +Features can be in one of three states: +.sp +.ne 2 +.na +\fB\fBactive\fR\fR +.ad +.RS 12n +This feature's on\-disk format changes are in effect on the pool. Support for +this feature is required to import the pool in read\-write mode. If this +feature is not read\-only compatible, support is also required to import the pool +in read\-only mode (see "Read\-only compatibility"). +.RE + +.sp +.ne 2 +.na +\fB\fBenabled\fR\fR +.ad +.RS 12n +An administrator has marked this feature as enabled on the pool, but the +feature's on\-disk format changes have not been made yet. The pool can still be +imported by software that does not support this feature, but changes may be made +to the on\-disk format at any time which will move the feature to the +\fBactive\fR state. Some features may support returning to the \fBenabled\fR +state after becoming \fBactive\fR. See feature\-specific documentation for +details. +.RE + +.sp +.ne 2 +.na +\fBdisabled\fR +.ad +.RS 12n +This feature's on\-disk format changes have not been made and will not be made +unless an administrator moves the feature to the \fBenabled\fR state. Features +cannot be disabled once they have been enabled. +.RE + +.sp +.LP +The state of supported features is exposed through pool properties of the form +\fIfeature@short_name\fR. +.SS "Read\-only compatibility" +.sp +.LP +Some features may make on\-disk format changes that do not interfere with other +software's ability to read from the pool. These features are referred to as +"read\-only compatible". If all unsupported features on a pool are read\-only +compatible, the pool can be imported in read\-only mode by setting the +\fBreadonly\fR property during import (see \fBzpool\fR(8) for details on +importing pools). 
+.SS "Unsupported features" +.sp +.LP +For each unsupported feature enabled on an imported pool a pool property +named \fIunsupported@feature_guid\fR will indicate why the import was allowed +despite the unsupported feature. Possible values for this property are: + +.sp +.ne 2 +.na +\fB\fBinactive\fR\fR +.ad +.RS 12n +The feature is in the \fBenabled\fR state and therefore the pool's on\-disk +format is still compatible with software that does not support this feature. +.RE + +.sp +.ne 2 +.na +\fB\fBreadonly\fR\fR +.ad +.RS 12n +The feature is read\-only compatible and the pool has been imported in +read\-only mode. +.RE + +.SS "Feature dependencies" +.sp +.LP +Some features depend on other features being enabled in order to function +properly. Enabling a feature will automatically enable any features it +depends on. +.SH FEATURES +.sp +.LP +The following features are supported on this system: +.sp +.ne 2 +.na +\fB\fBasync_destroy\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:async_destroy +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +Destroying a file system requires traversing all of its data in order to +return its used space to the pool. Without \fBasync_destroy\fR the file system +is not fully removed until all space has been reclaimed. If the destroy +operation is interrupted by a reboot or power outage the next attempt to open +the pool will need to complete the destroy operation synchronously. + +When \fBasync_destroy\fR is enabled the file system's data will be reclaimed +by a background process, allowing the destroy operation to complete without +traversing the entire file system. The background process is able to resume +interrupted destroys after the pool has been opened, eliminating the need +to finish interrupted destroys as part of the open operation. The amount +of space remaining to be reclaimed by the background process is available +through the \fBfreeing\fR property. 
+ +This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero. +.RE +.SH "SEE ALSO" +\fBzpool\fR(8) diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 35feda7c8..8c110511a 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -3,10 +3,19 @@ .\" Copyright 2011 Nexenta Systems, Inc. All rights reserved. .\" Copyright (c) 2012 by Delphix. All Rights Reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. -.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zpool 8 "2 August 2012" "ZFS pool 28, filesystem 5" "System Administration Commands" +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.TH zpool 8 "14 December 2012" "ZFS pool 28, filesystem 5" "System Administration Commands" .SH NAME zpool \- configures ZFS storage pools .SH SYNOPSIS @@ -32,7 +41,7 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] +\fBzpool create\fR [\fB-fnd\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ... .fi @@ -467,24 +476,34 @@ Percentage of pool space used. This property can also be referred to by its shor .ne 2 .mk .na -\fB\fBcomment\fR\fR +\fB\fBexpandsize\fR\fR .ad .RS 20n -.rt -A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted. An administrator can provide additional information about a pool using this property. +Amount of uninitialized space within the pool or device that can be used to +increase the total capacity of the pool. Uninitialized space consists of +any space on an EFI labeled vdev which has not been brought online +(i.e. zpool online -e). This space occurs when a LUN is dynamically expanded. .RE .sp .ne 2 -.mk .na -\fB\fBexpandsize\fR\fR +\fB\fBfree\fR\fR .ad .RS 20n -Amount of uninitialized space within the pool or device that can be used to -increase the total capacity of the pool. Uninitialized space consists of -any space on an EFI labeled vdev which has not been brought online -(i.e. zpool online -e). This space occurs when a LUN is dynamically expanded. +The amount of free space available in the pool. 
+.RE + +.sp +.ne 2 +.na +\fB\fBfreeing\fR\fR +.ad +.RS 20n +After a file system or snapshot is destroyed, the space it was using is +returned to the pool asynchronously. \fB\fBfreeing\fR\fR is the amount of +space remaining to be reclaimed. Over time \fB\fBfreeing\fR\fR will decrease +while \fB\fBfree\fR\fR increases. .RE .sp @@ -521,6 +540,16 @@ Total size of the storage pool. .sp .ne 2 +.na +\fB\fBunsupported@\fR\fIfeature_guid\fR\fR +.ad +.RS 20n +Information about unsupported features that are enabled on the pool. See +\fBzpool-features\fR(5) for details. +.RE + +.sp +.ne 2 .mk .na \fB\fBused\fR\fR @@ -532,7 +561,7 @@ Amount of storage space used within the pool. .sp .LP -These space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(8) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable. +The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(8) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. 
For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable. .sp .LP @@ -617,6 +646,17 @@ Multiple pools can share the same cache file. Because the kernel destroys and re .ne 2 .mk .na +\fB\fBcomment\fR=\fB\fItext\fR\fR +.ad +.sp .6 +.RS 4n +A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted. An administrator can provide additional information about a pool using this property. +.RE + +.sp +.ne 2 +.mk +.na \fB\fBdelegation\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 @@ -670,6 +710,18 @@ Prints out a message to the console and generates a system crash dump. .sp .ne 2 +.na +\fB\fBfeature@\fR\fIfeature_name\fR=\fBenabled\fR\fR +.ad +.RS 4n +The value of this property is the current state of \fIfeature_name\fR. The +only valid value when setting this property is \fBenabled\fR which moves +\fIfeature_name\fR to the enabled state. See \fBzpool-features\fR(5) for +details on feature states. +.RE + +.sp +.ne 2 .mk .na \fB\fBlistsnaps\fR=on | off\fR @@ -687,7 +739,7 @@ Controls whether information about snapshots associated with this pool is output .ad .sp .6 .RS 4n -The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the "\fBzpool upgrade\fR" command, though this property can be used when a specific version is needed for backwards compatibility. This property can be any number between 1 and the current version reported by "\fBzpool upgrade -v\fR". +The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the "\fBzpool upgrade\fR" command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. .RE .SS "Subcommands" @@ -801,7 +853,7 @@ Clears device errors in a pool. 
If no arguments are specified, all device errors .ne 2 .mk .na -\fB\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR +\fB\fBzpool create\fR [\fB-fnd\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR .ad .sp .6 .RS 4n @@ -813,6 +865,8 @@ The command also checks that the replication strategy for the pool is consistent .sp Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option. .sp +By default all supported features are enabled on the new pool unless the \fB-d\fR option is specified. +.sp .ne 2 .mk .na @@ -838,6 +892,16 @@ Displays the configuration that would be used without actually creating the pool .ne 2 .mk .na +\fB\fB-d\fR\fR +.ad +.sp .6 +.RS 4n +Do not enable any features on the new pool. Individual features can be enabled by setting their corresponding properties to \fBenabled\fR with the \fB-o\fR option. See \fBzpool-features\fR(5) for details about feature properties. +.RE + +.sp +.ne 2 +.na \fB\fB-o\fR \fIproperty=value\fR [\fB-o\fR \fIproperty=value\fR] ...\fR .ad .sp .6 @@ -1965,4 +2029,4 @@ Invalid command line options were specified. 
.SH SEE ALSO .sp .LP -\fBzfs\fR(8) +\fBzfs\fR(8), \fBzpool-features\fR(5) diff --git a/module/nvpair/Makefile.in b/module/nvpair/Makefile.in index b53381f6a..211fc726d 100644 --- a/module/nvpair/Makefile.in +++ b/module/nvpair/Makefile.in @@ -5,5 +5,6 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair.o +$(MODULE)-objs += @top_srcdir@/module/nvpair/fnvpair.o $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_spl.o $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_fixed.o diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c new file mode 100644 index 000000000..fa28afc75 --- /dev/null +++ b/module/nvpair/fnvpair.c @@ -0,0 +1,566 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/nvpair.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#ifndef _KERNEL +#include <stdlib.h> +#endif + +/* + * "Force" nvlist wrapper. + * + * These functions wrap the nvlist_* functions with assertions that assume + * the operation is successful. 
This allows the caller's code to be much + * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* + * functions, which can return the requested value (rather than filling in + * a pointer). + * + * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate + * with KM_SLEEP. + * + * More wrappers should be added as needed -- for example + * nvlist_lookup_*_array and nvpair_value_*_array. + */ + +nvlist_t * +fnvlist_alloc(void) +{ + nvlist_t *nvl; + VERIFY3U(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP), ==, 0); + return (nvl); +} + +void +fnvlist_free(nvlist_t *nvl) +{ + nvlist_free(nvl); +} + +size_t +fnvlist_size(nvlist_t *nvl) +{ + size_t size; + VERIFY3U(nvlist_size(nvl, &size, NV_ENCODE_NATIVE), ==, 0); + return (size); +} + +/* + * Returns allocated buffer of size *sizep. Caller must free the buffer with + * fnvlist_pack_free(). + */ +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + char *packed = 0; + VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + return (packed); +} + +/*ARGSUSED*/ +void +fnvlist_pack_free(char *pack, size_t size) +{ +#ifdef _KERNEL + kmem_free(pack, size); +#else + free(pack); +#endif +} + +nvlist_t * +fnvlist_unpack(char *buf, size_t buflen) +{ + nvlist_t *rv; + VERIFY3U(nvlist_unpack(buf, buflen, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_dup(nvlist_t *nvl) +{ + nvlist_t *rv; + VERIFY3U(nvlist_dup(nvl, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +void +fnvlist_merge(nvlist_t *dst, nvlist_t *src) +{ + VERIFY3U(nvlist_merge(dst, src, KM_SLEEP), ==, 0); +} + +void +fnvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_add_boolean(nvl, name), ==, 0); +} + +void +fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + VERIFY3U(nvlist_add_boolean_value(nvl, name, val), ==, 0); +} + +void +fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + VERIFY3U(nvlist_add_byte(nvl, name, val), ==, 0); 
/*
 * Adds the string val to nvl under name by calling nvlist_add_string().
 * The call is wrapped in VERIFY3U(), so a non-zero return trips the
 * assertion instead of being handed back to the caller; that is why this
 * wrapper returns void.
 */
void
fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
{
	VERIFY3U(nvlist_add_string(nvl, name, val), ==, 0);
}
uint8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, + uint16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, + uint32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, + uint64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *val, uint_t n) +{ + VERIFY3U(nvlist_add_string_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t **val, uint_t n) +{ + VERIFY3U(nvlist_add_nvlist_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_remove(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_remove_all(nvl, name), ==, 0); +} + +void +fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_remove_nvpair(nvl, pair), ==, 0); +} + +nvpair_t * +fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) +{ + nvpair_t *rv; + VERIFY3U(nvlist_lookup_nvpair(nvl, name, &rv), ==, 0); + return (rv); +} + +/* returns B_TRUE if the entry exists */ +boolean_t +fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_boolean(nvl, name) == 0); +} + +boolean_t 
/*
 * Looks up the string stored under name in nvl and returns it as the
 * function result instead of through an out-parameter.  The lookup is
 * wrapped in VERIFY3U(), so a non-zero return from nvlist_lookup_string()
 * (for example, no such entry) trips the assertion.
 *
 * NOTE(review): the returned pointer presumably refers to storage owned by
 * nvl rather than a copy -- confirm before freeing it or using it after
 * the nvlist is freed.
 */
char *
fnvlist_lookup_string(nvlist_t *nvl, const char *name)
{
	char *rv;
	VERIFY3U(nvlist_lookup_string(nvl, name, &rv), ==, 0);
	return (rv);
}
#if defined(_KERNEL) && defined(HAVE_SPL)

/*
 * Export every fnvlist_ and fnvpair_ wrapper defined in this file so that
 * other kernel modules can link against them.
 */
EXPORT_SYMBOL(fnvlist_alloc);
EXPORT_SYMBOL(fnvlist_free);
EXPORT_SYMBOL(fnvlist_size);
EXPORT_SYMBOL(fnvlist_pack);
/*
 * Buffers returned by fnvlist_pack() must be released with
 * fnvlist_pack_free(), so the two are exported together.
 */
EXPORT_SYMBOL(fnvlist_pack_free);
EXPORT_SYMBOL(fnvlist_unpack);
EXPORT_SYMBOL(fnvlist_dup);
EXPORT_SYMBOL(fnvlist_merge);

EXPORT_SYMBOL(fnvlist_add_nvpair);
EXPORT_SYMBOL(fnvlist_add_boolean);
EXPORT_SYMBOL(fnvlist_add_boolean_value);
EXPORT_SYMBOL(fnvlist_add_byte);
EXPORT_SYMBOL(fnvlist_add_int8);
EXPORT_SYMBOL(fnvlist_add_uint8);
EXPORT_SYMBOL(fnvlist_add_int16);
EXPORT_SYMBOL(fnvlist_add_uint16);
EXPORT_SYMBOL(fnvlist_add_int32);
EXPORT_SYMBOL(fnvlist_add_uint32);
EXPORT_SYMBOL(fnvlist_add_int64);
EXPORT_SYMBOL(fnvlist_add_uint64);
EXPORT_SYMBOL(fnvlist_add_string);
EXPORT_SYMBOL(fnvlist_add_nvlist);
EXPORT_SYMBOL(fnvlist_add_boolean_array);
EXPORT_SYMBOL(fnvlist_add_byte_array);
EXPORT_SYMBOL(fnvlist_add_int8_array);
EXPORT_SYMBOL(fnvlist_add_uint8_array);
EXPORT_SYMBOL(fnvlist_add_int16_array);
EXPORT_SYMBOL(fnvlist_add_uint16_array);
EXPORT_SYMBOL(fnvlist_add_int32_array);
EXPORT_SYMBOL(fnvlist_add_uint32_array);
EXPORT_SYMBOL(fnvlist_add_int64_array);
EXPORT_SYMBOL(fnvlist_add_uint64_array);
EXPORT_SYMBOL(fnvlist_add_string_array);
EXPORT_SYMBOL(fnvlist_add_nvlist_array);

EXPORT_SYMBOL(fnvlist_remove);
EXPORT_SYMBOL(fnvlist_remove_nvpair);

EXPORT_SYMBOL(fnvlist_lookup_nvpair);
EXPORT_SYMBOL(fnvlist_lookup_boolean);
EXPORT_SYMBOL(fnvlist_lookup_boolean_value);
EXPORT_SYMBOL(fnvlist_lookup_byte);
EXPORT_SYMBOL(fnvlist_lookup_int8);
EXPORT_SYMBOL(fnvlist_lookup_uint8);
EXPORT_SYMBOL(fnvlist_lookup_int16);
EXPORT_SYMBOL(fnvlist_lookup_uint16);
EXPORT_SYMBOL(fnvlist_lookup_int32);
EXPORT_SYMBOL(fnvlist_lookup_uint32);
EXPORT_SYMBOL(fnvlist_lookup_int64);
EXPORT_SYMBOL(fnvlist_lookup_uint64);
EXPORT_SYMBOL(fnvlist_lookup_string);
EXPORT_SYMBOL(fnvlist_lookup_nvlist);

EXPORT_SYMBOL(fnvpair_value_boolean_value);
EXPORT_SYMBOL(fnvpair_value_byte);
EXPORT_SYMBOL(fnvpair_value_int8);
EXPORT_SYMBOL(fnvpair_value_uint8);
EXPORT_SYMBOL(fnvpair_value_int16);
EXPORT_SYMBOL(fnvpair_value_uint16);
EXPORT_SYMBOL(fnvpair_value_int32);
EXPORT_SYMBOL(fnvpair_value_uint32);
EXPORT_SYMBOL(fnvpair_value_int64);
EXPORT_SYMBOL(fnvpair_value_uint64);
EXPORT_SYMBOL(fnvpair_value_string);
EXPORT_SYMBOL(fnvpair_value_nvlist);

#endif
303edcefc..1173fc0c9 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -79,6 +79,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "<size>", "SIZE"); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "FREE"); + zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "FREEING"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, @@ -170,6 +172,26 @@ zpool_prop_default_numeric(zpool_prop_t prop) return (zpool_prop_table[prop].pd_numdefault); } +/* + * Returns true if this is a valid feature@ property. + */ +boolean_t +zpool_prop_feature(const char *name) +{ + static const char *prefix = "feature@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* + * Returns true if this is a valid unsupported@ property. + */ +boolean_t +zpool_prop_unsupported(const char *name) +{ + static const char *prefix = "unsupported@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) @@ -223,6 +245,8 @@ EXPORT_SYMBOL(zpool_prop_to_name); EXPORT_SYMBOL(zpool_prop_default_string); EXPORT_SYMBOL(zpool_prop_default_numeric); EXPORT_SYMBOL(zpool_prop_readonly); +EXPORT_SYMBOL(zpool_prop_feature); +EXPORT_SYMBOL(zpool_prop_unsupported); EXPORT_SYMBOL(zpool_prop_index_to_string); EXPORT_SYMBOL(zpool_prop_string_to_index); #endif diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 98576d1d2..f1e32a19d 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o +$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o $(MODULE)-objs += 
@top_srcdir@/module/zfs/ddt.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o $(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o @@ -59,6 +60,8 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o $(MODULE)-objs += @top_srcdir@/module/zfs/zap.o $(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o $(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o +$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature.o +$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature_common.o $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6ec9f04b7..5d9b34fbf 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2729,9 +2729,11 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + dmu_ot_byteswap[bswap].ob_func; func(buf->b_data, hdr->b_size); } diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c new file mode 100644 index 000000000..8c5a7d40e --- /dev/null +++ b/module/zfs/bptree.c @@ -0,0 +1,224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/arc.h> +#include <sys/bptree.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/refcount.h> +#include <sys/spa.h> + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. + */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. 
/*
 * Frees the bptree object obj in objset os as part of transaction tx and
 * returns the result of dmu_object_free().
 *
 * The bptree is expected to be empty at this point: the ASSERT3U checks
 * below require that bt_begin has caught up with bt_end and that the
 * bytes/comp/uncomp counters in the bonus buffer are all zero.
 */
int
bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	bptree_phys_t *bt;

	/* The bptree_phys_t header lives in the object's bonus buffer. */
	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
	ASSERT3U(bt->bt_bytes, ==, 0);
	ASSERT3U(bt->bt_comp, ==, 0);
	ASSERT3U(bt->bt_uncomp, ==, 0);
	/* Drop the hold taken above before the object itself is freed. */
	dmu_buf_rele(db, FTAG);

	return (dmu_object_free(os, obj, tx));
}
+ */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte.be_birth_txg = birth_txg; + bte.be_bp = *bp; + bzero(&bte.be_zb, sizeof (bte.be_zb)); + dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (bp == NULL) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + + ASSERT(!free || i == ba.ba_phys->bt_begin); + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST, + bptree_visit_cb, &ba); + if (free) { + ASSERT(err == 0 || err == ERESTART); + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT3U(bte.be_zb.zb_level, 
==, 0); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + break; + } else { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } + } + + ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + ASSERT3U(ba.ba_phys->bt_bytes, ==, 0); + ASSERT3U(ba.ba_phys->bt_comp, ==, 0); + ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1f6fa9340..99f728f4b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -260,7 +261,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) boolean_t is_metadata; DB_DNODE_ENTER(db); - is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 6221157f8..ef868619a 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -1120,11 +1121,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object, tx) == 0); + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); } while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 00a7a07f4..5d3f70d4c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -46,60 +47,73 @@ #endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { byteswap_uint8_array, TRUE, "unallocated" }, - { zap_byteswap, TRUE, "object directory" }, - { byteswap_uint64_array, TRUE, "object array" }, - { byteswap_uint8_array, TRUE, "packed nvlist" }, - { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bpobj" }, - { byteswap_uint64_array, TRUE, "bpobj header" }, - { byteswap_uint64_array, TRUE, "SPA space map header" }, - { byteswap_uint64_array, TRUE, "SPA space map" }, - { byteswap_uint64_array, TRUE, "ZIL intent log" }, - { dnode_buf_byteswap, TRUE, "DMU dnode" }, - { dmu_objset_byteswap, TRUE, "DMU objset" }, - { byteswap_uint64_array, TRUE, "DSL directory" }, - { zap_byteswap, TRUE, "DSL directory child map"}, - { zap_byteswap, TRUE, "DSL dataset snap map" }, - { zap_byteswap, TRUE, "DSL props" }, - { byteswap_uint64_array, TRUE, "DSL dataset" }, - { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, - { 
byteswap_uint8_array, FALSE, "ZFS plain file" }, - { zap_byteswap, TRUE, "ZFS directory" }, - { zap_byteswap, TRUE, "ZFS master node" }, - { zap_byteswap, TRUE, "ZFS delete queue" }, - { byteswap_uint8_array, FALSE, "zvol object" }, - { zap_byteswap, TRUE, "zvol prop" }, - { byteswap_uint8_array, FALSE, "other uint8[]" }, - { byteswap_uint64_array, FALSE, "other uint64[]" }, - { zap_byteswap, TRUE, "other ZAP" }, - { zap_byteswap, TRUE, "persistent error log" }, - { byteswap_uint8_array, TRUE, "SPA history" }, - { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, - { zap_byteswap, TRUE, "DSL permissions" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, - { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, - { byteswap_uint8_array, TRUE, "FUID table" }, - { byteswap_uint64_array, TRUE, "FUID table size" }, - { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scan work queue" }, - { zap_byteswap, TRUE, "ZFS user/group used" }, - { zap_byteswap, TRUE, "ZFS user/group quota" }, - { zap_byteswap, TRUE, "snapshot refcount tags"}, - { zap_byteswap, TRUE, "DDT ZAP algorithm" }, - { zap_byteswap, TRUE, "DDT statistics" }, - { byteswap_uint8_array, TRUE, "System attributes" }, - { zap_byteswap, TRUE, "SA master node" }, - { zap_byteswap, TRUE, "SA attr registration" }, - { zap_byteswap, TRUE, "SA attr layouts" }, - { zap_byteswap, TRUE, "scan translations" }, - { byteswap_uint8_array, FALSE, "deduplicated block" }, - { zap_byteswap, TRUE, "DSL deadlist map" }, - { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, - { zap_byteswap, TRUE, "DSL dir clones" }, - { byteswap_uint64_array, TRUE, "bpobj subobj" }, + { DMU_BSWAP_UINT8, TRUE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj 
header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, + { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, + { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, 
"deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + { zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } }; int @@ -176,7 +190,7 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (type > DMU_OT_NUMTYPES) { + if (!DMU_OT_IS_VALID(type)) { error = EINVAL; } else if (dn->dn_bonus != db) { error = EINVAL; @@ -1695,7 +1709,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 949f4d773..97c23cb07 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1077,8 +1077,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) void *data = NULL; if (drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || @@ -1143,7 +1143,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); @@ -1186,7 +1188,7 @@ restore_write(struct restorearg *ra, objset_t *os, int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) + !DMU_OT_IS_VALID(drrw->drr_type)) return (EINVAL); data = restore_read(ra, drrw->drr_length); @@ -1205,8 +1207,11 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + if (ra->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); + } dmu_write(os, drrw->drr_object, drrw->drr_offset, 
drrw->drr_length, data, tx); dmu_tx_commit(tx); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 376f60f82..980c1aa2f 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -53,6 +54,7 @@ typedef struct traverse_data { uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; + zbookmark_t *td_resume; int td_flags; prefetch_data_t *td_pfd; blkptr_cb_t *td_func; @@ -128,6 +130,54 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh) zil_free(zilog); } +typedef enum resume_skip { + RESUME_SKIP_ALL, + RESUME_SKIP_NONE, + RESUME_SKIP_CHILDREN +} resume_skip_t; + +/* + * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and + * the block indicated by zb does not need to be visited at all. Returns + * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the + * resume point. This indicates that this block should be visited but not its + * children (since they must have been visited in a previous traversal). + * Otherwise returns RESUME_SKIP_NONE. + */ +static resume_skip_t +resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + /* + * If we already visited this bp & everything below, + * don't bother doing it again. + */ + if (zbookmark_is_before(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. 
+ */ + ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_pause(traverse_data_t *td, const zbookmark_t *zb) +{ + ASSERT(td->td_resume != NULL); + ASSERT3U(zb->zb_level, ==, 0); + bcopy(zb, td->td_resume, sizeof (*td->td_resume)); +} + static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) @@ -137,8 +187,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; + boolean_t pause = B_FALSE; + + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); + } - if (bp->blk_birth == 0) { + if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, td->td_arg); return (err); @@ -164,8 +226,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); - if (err) - return (err); + if (err == ERESTART) + pause = B_TRUE; /* handle pausing at a common point */ + if (err != 0) + goto post; } if (BP_GET_LEVEL(bp) > 0) { @@ -253,9 +317,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); +post: if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, td->td_arg); + if (err == ERESTART) + pause = B_TRUE; + } + + if (pause && td->td_resume != NULL) { + ASSERT3U(err, ==, ERESTART); + ASSERT(!hard); + traverse_pause(td, zb); } return (err != 0 ? err : lasterr); @@ -353,22 +426,27 @@ traverse_prefetch_thread(void *arg) * in syncing context). 
*/ static int -traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) { traverse_data_t *td; prefetch_data_t *pd; zbookmark_t *czb; int err; + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE); pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE); czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE); td->td_spa = spa; - td->td_objset = ds ? ds->ds_object : 0; + td->td_objset = objset; td->td_rootbp = rootbp; td->td_min_txg = txg_start; + td->td_resume = resume; td->td_func = func; td->td_arg = arg; td->td_pfd = pd; @@ -424,8 +502,17 @@ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); } /* @@ -442,8 +529,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), - txg_start, flags, func, arg); + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); if (err) return (err); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 81c6dfea2..47ec4c109 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ 
-20,9 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -693,7 +692,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { blkptr_t *bp; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 99ac62565..3a8a5e32e 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -194,7 +195,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; @@ -212,7 +213,7 @@ dnode_verify(dnode_t *dn) ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); @@ -278,8 +279,10 @@ dnode_byteswap(dnode_phys_t *dnp) */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; - ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); - dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + dmu_object_byteswap_t byteswap; + ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); + byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); + dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ @@ -407,7 +410,7 @@ 
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); @@ -496,11 +499,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); - ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT3U(dn->dn_maxblkid, ==, 0); @@ -568,7 +571,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index f2dda867d..58fa473cb 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -598,7 +600,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } if (dn->dn_next_bonustype[txgoff]) { - ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 21fdd081c..872d44afb 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -35,6 +35,7 @@ #include <sys/arc.h> #include <sys/zio.h> #include <sys/zap.h> +#include <sys/zfeature.h> #include <sys/unique.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> @@ -102,7 +103,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { /* * Account for the meta-objset space in its placeholder @@ -119,7 +120,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); - ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_referenced_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; @@ -215,8 +216,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } } mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); - ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); + 
ds->ds_phys->ds_referenced_bytes -= used; ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ds->ds_phys->ds_compressed_bytes -= compressed; ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); @@ -823,8 +824,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - origin->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = + origin->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = origin->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = @@ -938,7 +939,6 @@ dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { dsl_dataset_t *ds; - int err; err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); if (err == 0) { @@ -1088,19 +1088,23 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) goto out_free; /* - * remove the objects in open context, so that we won't - * have too much to do in syncing context. + * If async destruction is not enabled try to remove all objects + * while in the open context so that there is less work to do in + * the syncing context. */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); + if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, + ds->ds_phys->ds_prev_snap_txg)) { + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). 
+ */ + (void) dmu_free_object(os, obj); + } + if (err != ESRCH) + goto out_free; } - if (err != ESRCH) - goto out_free; /* * Only the ZIL knows how to free log blocks. @@ -1261,7 +1265,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; else mrs_used = 0; @@ -1269,7 +1273,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -1627,12 +1631,36 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, ds_next->ds_phys->ds_deadlist_obj); } +static int +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + int err; + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. 
+ */ + ka.ds = ds; + ka.tx = tx; + err = traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); + + return (err); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; - int err; + int err = 0; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; @@ -1773,7 +1801,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); - dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, @@ -1849,32 +1876,54 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } dsl_dataset_rele(ds_next, FTAG); } else { + zfeature_info_t *async_destroy = + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + /* * There's no next snapshot, so this is a head dataset. * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ - struct killarg ka; - dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. 
- */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - ASSERT3U(err, ==, 0); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == 0); + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { + err = old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. + */ + uint64_t used, comp, uncomp; + + ASSERT(err == 0 || err == EBUSY); + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { + spa_feature_incr(dp->dp_spa, async_destroy, tx); + dp->dp_bptree_obj = bptree_alloc( + dp->dp_meta_objset, tx); + VERIFY(zap_add(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx) == 0); + } + + used = ds->ds_dir->dd_phys->dd_used_bytes; + comp = ds->ds_dir->dd_phys->dd_compressed_bytes; + uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + ds->ds_phys->ds_unique_bytes == used); + + bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj, + &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + used, comp, uncomp, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { @@ -2065,7 +2114,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_flags = ds->ds_phys->ds_flags; @@ 
-2189,10 +2238,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; + /* + * Even though we hold the dp_config_rwlock, the dataset + * may fail to open, returning ENOENT. If there is a + * thread concurrently attempting to destroy this + * dataset, it will have the ds_rwlock held for + * RW_WRITER. Our call to dsl_dataset_hold_obj() -> + * dsl_dataset_hold_ref() will fail its + * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the + * dp_config_rwlock, and wait for the destroy progress + * and signal ds_exclusive_cv. If the destroy was + * successful, we will see that + * DSL_DATASET_IS_DESTROYED(), and return ENOENT. + */ if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone) != 0) { - goto fail; - } + za.za_first_integer, FTAG, &clone) != 0) + continue; dsl_dir_name(clone->ds_dir, buf); VERIFY(nvlist_add_boolean(val, buf) == 0); dsl_dataset_rele(clone, FTAG); @@ -2316,7 +2377,7 @@ dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - *refdbytesp = ds->ds_phys->ds_used_bytes; + *refdbytesp = ds->ds_phys->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; @@ -2652,7 +2713,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->used = origin_ds->ds_phys->ds_referenced_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; for (snap = list_head(&pa->shared_snaps); snap; @@ -2686,7 +2747,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * so we need to subtract out the clone origin's used space. 
*/ if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } @@ -3203,8 +3264,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsl_deadlist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); - dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - - (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - + (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + @@ -3233,8 +3294,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) } /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_used_bytes, - csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, + csa->cds->ds_phys->ds_referenced_bytes); SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, csa->cds->ds_phys->ds_compressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, @@ -3363,8 +3424,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, * on-disk is over quota and there are no pending changes (which * may free up space for us). 
*/ - if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { - if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || + ds->ds_phys->ds_referenced_bytes < ds->ds_quota) error = ERESTART; else error = EDQUOT; @@ -3393,7 +3455,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) if (psa->psa_effective_value == 0) return (0); - if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); @@ -4141,8 +4203,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_pool_t *dp = new->ds_dir->dd_pool; *usedp = 0; - *usedp += new->ds_phys->ds_used_bytes; - *usedp -= oldsnap->ds_phys->ds_used_bytes; + *usedp += new->ds_phys->ds_referenced_bytes; + *usedp -= oldsnap->ds_phys->ds_referenced_bytes; *compp = 0; *compp += new->ds_phys->ds_compressed_bytes; @@ -4158,9 +4220,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_dataset_t *snap; uint64_t used, comp, uncomp; - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { @@ -4189,7 +4255,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, * was not a snapshot of/before new. 
*/ snapobj = snap->ds_phys->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); + if (snap != new) + dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index a4d4e42da..294932c45 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -171,10 +171,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, - DMU_OT_NONE, 0, tx); - VERIFY(zap_update(mos, zapobj, - whokey, 8, 1, &jumpobj, tx) == 0); + jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, + zapobj, whokey, tx); } while ((permpair = nvlist_next_nvpair(perms, permpair))) { diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 7e0fba589..85c745e8a 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/dsl_pool.h> @@ -40,6 +40,8 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/bptree.h> +#include <sys/zfeature.h> int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -240,20 +242,30 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) } int -dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); + if (err != 0) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, - &dp->dp_meta_objset); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); @@ -269,7 +281,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; @@ -286,7 +298,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; } - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) @@ -300,6 +312,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dp->dp_meta_objset, obj)); } + if (spa_feature_is_active(dp->dp_spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) + goto out; + } + 
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); @@ -308,15 +329,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - err = dsl_scan_init(dp, txg); + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rw_exit(&dp->dp_config_rwlock); - if (err) - dsl_pool_close(dp); - else - *dpp = dp; - return (err); } @@ -611,7 +627,7 @@ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || - spa_get_dsl(dp->dp_spa) == NULL); + spa_is_initializing(dp->dp_spa)); } uint64_t @@ -932,11 +948,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); - dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, - DMU_OT_NONE, 0, tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, - sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index c2386dd67..297caa067 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/dsl_scan.h> @@ -44,6 +45,7 @@ #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -379,55 +381,6 @@ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, priority, zio_flags, arc_flags, zb)); } -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { @@ -459,7 +412,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. 
*/ @@ -614,13 +567,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, /* * We never skip over user/group accounting objects (obj<0) */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* @@ -823,22 +776,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) goto out; - if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { - /* - * For non-user-accounting blocks, we need to read the - * new bp (from a deleted snapshot, found in - * check_existing_xlation). If we used the old bp, - * pointers inside this block from before we resumed - * would be untranslated. - * - * For user-accounting blocks, we need to read the old - * bp, because we will apply the entire space delta to - * it (original untranslated -> translations from - * deleted snap -> now). 
- */ - *bp_toread = *bp; - } - if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx, &buf) != 0) goto out; @@ -1414,19 +1351,28 @@ out: kmem_free(zc, sizeof(zap_cursor_t)); } -static int -dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +static boolean_t +dsl_scan_free_should_pause(dsl_scan_t *scn) { - dsl_scan_t *scn = arg; uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || - spa_shutting_down(scn->scn_dp->dp_spa)) - return (ERESTART); + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_free_should_pause(scn)) + return (ERESTART); + } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); @@ -1451,6 +1397,10 @@ dsl_scan_active(dsl_scan_t *scn) if (scn->scn_phys.scn_state == DSS_SCANNING) return (B_TRUE); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + return (B_TRUE); + } if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); @@ -1497,14 +1447,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * traversing it. 
*/ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_cb, scn, tx); + dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + + if (err == 0 && spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, + scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (err != 0) + return; + + /* disable async destroy feature */ + spa_feature_decr(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); + ASSERT(!spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj txg %llu", + "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, @@ -1619,9 +1595,13 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; + zfs_blkstat_t *zb; + + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; + zb = &zab->zab_type[l][t]; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index d4b28cc90..240a683d6 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -446,10 +448,9 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { - sa->sa_layout_attr_obj = zap_create(os, - DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, - &sa->sa_layout_attr_obj, tx) == 0); + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), @@ -1583,10 +1584,9 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) } if (sa->sa_reg_attr_obj == 0) { - sa->sa_reg_attr_obj = zap_create(hdl->sa_os, - DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, - SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); + sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, + sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b610a0dae..244f10d47 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -63,6 +63,7 @@ #include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> #include <sys/dsl_scan.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/bootprops.h> @@ -114,6 +115,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { 
ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; +static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, @@ -169,6 +171,7 @@ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size; uint64_t alloc; uint64_t space; @@ -216,6 +219,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } + if (pool != NULL) { + dsl_dir_t *freedir = pool->dp_free_dir; + + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, + * when opening pools before this version freedir will be NULL. + */ + if (freedir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + freedir->dd_phys->dd_used_bytes, src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + } + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { @@ -357,25 +376,55 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; + boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - zpool_prop_t prop; - char *propname, *strval; uint64_t intval; - objset_t *os; - char *slash, *check; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); + + switch ((int)prop) { + case ZPROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = EINVAL; + break; + } + + /* + * Sanitize the input. 
+ */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = EINVAL; + break; + } + + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } - propname = nvpair_name(elem); + if (intval != 0) { + error = EINVAL; + break; + } - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) - return (EINVAL); + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = EINVAL; + break; + } + + has_feature = B_TRUE; + break; - switch (prop) { case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && - (intval < spa_version(spa) || intval > SPA_VERSION)) + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) error = EINVAL; break; @@ -412,6 +461,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = nvpair_value_string(elem, &strval); if (!error) { + objset_t *os; uint64_t compress; if (strval == NULL || strval[0] == '\0') { @@ -558,33 +608,58 @@ int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; - nvpair_t *elem; + nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; - zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - if ((prop = zpool_name_to_prop( - nvpair_name(elem))) == ZPROP_INVAL) - return (EINVAL); + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { + uint64_t ver; + + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. 
*/ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task_do(spa_get_dsl(spa), NULL, + spa_sync_version, spa, &ver, 6); + if (error) + return (error); + continue; + } + need_sync = B_TRUE; break; } - if (need_sync) + if (need_sync) { return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); - else - return (0); + spa, nvp, 6)); + } + + return (0); } /* @@ -1628,7 +1703,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; if (error) { - if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_add_64(&sle->sle_meta_count, 1); else @@ -1858,6 +1933,9 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, KM_PUSHPAGE) == 0); } + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); @@ -1891,12 +1969,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, { int error = 0; nvlist_t *nvroot = NULL; + nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; + boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1976,19 +2056,79 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Find the best uberblock. */ - vdev_uberblock_load(NULL, rvd, ub); + vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. 
*/ - if (ub->ub_txg == 0) + if (ub->ub_txg == 0) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } /* - * If the pool is newer than the code, we can't open it. + * If the pool has an unsupported version we can't open it. */ - if (ub->ub_version > SPA_VERSION) + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL || nvlist_lookup_nvlist(label, + ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + nvlist_free(label); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. 
+ */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + nvpair_t *nvp; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); + nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } /* * If the vdev guid sum doesn't match the uberblock, we have an @@ -2022,7 +2162,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; @@ -2030,6 +2170,84 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (spa_version(spa) >= SPA_VERSION_FEATURES) { + boolean_t missing_feat_read = B_FALSE; + nvlist_t *unsup_feat; + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, + &spa->spa_feat_for_read_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, + &spa->spa_feat_for_write_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, + &spa->spa_feat_desc_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 
EIO)); + } + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_read = B_TRUE; + + if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_write = B_TRUE; + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + } + + nvlist_free(unsup_feat); + + if (!missing_feat_read) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + + /* + * If the state is SPA_LOAD_TRYIMPORT, our objective is + * twofold: to determine whether the pool is available for + * import in read-write mode and (if it is not) whether the + * pool is available for import in read-only mode. If the pool + * is available for import in read-write mode, it is displayed + * as available in userland; if it is not available for import + * in read-only mode, it is displayed as unavailable in + * userland. If the pool is available for import in read-only + * mode but not read-write mode, it is displayed as unavailable + * in userland with a special note that the pool is actually + * available for open in read-only mode. + * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. 
+ */ + if (missing_feat_read || (missing_feat_write && + spa_writeable(spa))) { + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + } + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; @@ -2247,7 +2465,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_free(nvconfig); /* - * Now that we've validate the config, check the state of the + * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ @@ -2260,6 +2478,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } } + if (missing_feat_write) { + ASSERT(state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + } + /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. @@ -2370,10 +2599,18 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } +/* + * If spa_load() fails this function will try loading prior txg's. If + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this + * function will not rewind the pool and will return the same error as + * spa_load(). 
+ */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { + nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; @@ -2402,9 +2639,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, return (load_error); } - /* Price of rolling back is discarding txgs, including log */ - if (state == SPA_LOAD_RECOVER) + if (state == SPA_LOAD_RECOVER) { + /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * If we aren't rolling back save the load info from our first + * import attempt so that we can restore it after attempting + * to rewind. + */ + loadinfo = spa->spa_load_info; + spa->spa_load_info = fnvlist_alloc(); + } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; @@ -2428,7 +2674,20 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); - return (state == SPA_LOAD_RECOVER ? 
rewind_error : load_error); + if (state == SPA_LOAD_RECOVER) { + ASSERT3P(loadinfo, ==, NULL); + return (rewind_error); + } else { + /* Store the rewind info as part of the initial load info */ + fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, + spa->spa_load_info); + + /* Restore the initial load info */ + fnvlist_free(spa->spa_load_info); + spa->spa_load_info = loadinfo; + + return (load_error); + } } /* @@ -2698,8 +2957,50 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } } +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features) == 0); + nvlist_free(features); +} + int -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +spa_get_stats(const char *name, nvlist_t **config, + char *altroot, size_t buflen) { int error; spa_t *spa; @@ -2734,6 +3035,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); + 
spa_add_feature_stats(spa, *config); } } @@ -2954,6 +3256,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; + boolean_t has_features; + nvpair_t *elem; int c; /* @@ -2980,10 +3284,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (error); } - if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), - &version) != 0) + has_features = B_FALSE; + for (elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) + has_features = B_TRUE; + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; - ASSERT(version <= SPA_VERSION); + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; @@ -3059,8 +3371,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_l2cache.sav_sync = B_TRUE; } + spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). 
@@ -3084,6 +3398,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { @@ -3276,7 +3593,7 @@ spa_import_rootpool(char *devpath, char *devid) } #endif if (config == NULL) { - cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (EIO); } @@ -3590,6 +3907,8 @@ spa_tryimport(nvlist_t *tryconfig) state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we @@ -5305,7 +5624,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) * information. This avoids the dbuf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ - bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_PUSHPAGE); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, @@ -5390,6 +5709,24 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +static void +spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + uint64_t version = *(uint64_t *)arg2; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(version <= SPA_VERSION); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Set zpool properties. 
*/ @@ -5399,32 +5736,39 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; nvlist_t *nvp = arg2; - nvpair_t *elem; - uint64_t intval; - char *strval; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; + nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem))) { - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + uint64_t intval; + char *strval, *fname; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + zfeature_info_t *feature; + + prop = zpool_name_to_prop(nvpair_name(elem)); + switch ((int)prop) { + case ZPROP_INVAL: + /* + * We checked this earlier in spa_prop_validate(). + */ + ASSERT(zpool_prop_feature(nvpair_name(elem))); + + fname = strchr(nvpair_name(elem), '@') + 1; + VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); + + spa_feature_enable(spa, feature, tx); + break; + case ZPOOL_PROP_VERSION: + VERIFY(nvpair_value_uint64(elem, &intval) == 0); /* - * Only set version for non-zpool-creation cases - * (set/import). spa_create() needs special care - * for version setting. + * The version is synced separately before other + * properties and should be correct by now. */ - if (tx->tx_txg != TXG_INITIAL) { - VERIFY(nvpair_value_uint64(elem, - &intval) == 0); - ASSERT(intval <= SPA_VERSION); - ASSERT(intval >= spa_version(spa)); - spa->spa_uberblock.ub_version = intval; - vdev_config_dirty(spa->spa_root_vdev); - } + ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: @@ -5461,14 +5805,10 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) * Set pool property values in the poolprops mos object. 
*/ if (spa->spa_pool_props_object == 0) { - VERIFY((spa->spa_pool_props_object = - zap_create(mos, DMU_OT_POOL_PROPS, - DMU_OT_NONE, 0, tx)) > 0); - - VERIFY(zap_update(mos, + spa->spa_pool_props_object = + zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - 8, 1, &spa->spa_pool_props_object, tx) - == 0); + tx); } /* normalize the property name */ @@ -5567,6 +5907,11 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && + spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + spa_feature_create_zap_objects(spa, tx); + } } /* diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index c86884148..09149e622 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -35,6 +35,7 @@ #include <sys/utsname.h> #include <sys/systeminfo.h> #include <sys/sunddi.h> +#include <sys/zfeature.h> #ifdef _KERNEL #include <sys/kobj.h> #include <sys/zone.h> @@ -408,6 +409,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + /* + * Store what's necessary for reading the MOS in the label. 
+ */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features) == 0); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 4a8e6adfd..440a6addb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -49,6 +49,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include "zfs_prop.h" +#include "zfeature_common.h" /* * SPA locking @@ -217,7 +218,7 @@ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * - * spa_rename() is also implemented within this file since is requires + * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. 
*/ @@ -479,8 +480,22 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); - if (config != NULL) + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } return (spa); } @@ -518,6 +533,7 @@ spa_remove(spa_t *spa) list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); @@ -1025,6 +1041,20 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * ========================================================================== */ +void +spa_activate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_add_boolean(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_remove_all(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Rename a spa_t. */ @@ -1175,12 +1205,22 @@ spa_generate_guid(spa_t *spa) void sprintf_blkptr(char *buf, const blkptr_t *bp) { - char *type = NULL; + char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { - type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? 
+ "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } @@ -1252,6 +1292,12 @@ spa_get_dsl(spa_t *spa) return (spa->spa_dsl_pool); } +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1532,6 +1578,7 @@ spa_init(int mode) vdev_cache_stat_init(); zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); spa_config_load(); l2arc_start(); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e374f6d78..9335ba511 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1349,7 +1349,8 @@ vdev_validate(vdev_t *vd, boolean_t strict) uint64_t aux_guid = 0; nvlist_t *nvl; - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == + NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); @@ -1993,14 +1994,14 @@ vdev_validate_aux(vdev_t *vd) if (!vdev_readable(vd)) return (0); - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > SPA_VERSION || + !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7ac23500f..352b630fa 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -121,6 +123,8 @@ * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * @@ -428,8 +432,13 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } +/* + * Returns the configuration from the label of the given vdev. If 'label' is + * VDEV_BEST_LABEL, each label of the vdev will be read until a valid + * configuration is found; otherwise, only the specified label will be read. + */ nvlist_t * -vdev_label_read_config(vdev_t *vd) +vdev_label_read_config(vdev_t *vd, int label) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; @@ -448,6 +457,8 @@ vdev_label_read_config(vdev_t *vd) retry: for (l = 0; l < VDEV_LABELS; l++) { + if (label >= 0 && label < VDEV_LABELS && label != l) + continue; zio = zio_root(spa, NULL, NULL, flags); @@ -497,7 +508,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd)) == NULL) + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -838,7 +849,7 @@ retry: * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two + * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. 
*/ @@ -858,47 +869,52 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) return (0); } +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ + int ubl_label; /* Label associated with the above */ +}; + static void vdev_uberblock_load_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = rio->io_private; + struct ubl_cbdata *cbp = rio->io_private; - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev and label in which this + * uberblock was found. We will use this information + * later to obtain the config nvlist associated with + * this uberblock. 
+ */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + cbp->ubl_label = vdev_label_number(vd->vdev_psize, + zio->io_offset); + } mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); } -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; int c, l, n; - if (vd == rvd) { - ASSERT(zio == NULL); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, ubbest, flags); - bzero(ubbest, sizeof (uberblock_t)); - } - - ASSERT(zio != NULL); - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (l = 0; l < VDEV_LABELS; l++) { @@ -911,11 +927,45 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) } } } +} - if (vd == rvd) { - (void) zio_wait(zio); - spa_config_exit(spa, SCL_ALL, FTAG); +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same label as the best uberblock. 
+ */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + int i; + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + if (cb.ubl_vd != NULL) { + for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) { + *config = vdev_label_read_config(cb.ubl_vd, i); + if (*config != NULL) + break; + } } + spa_config_exit(spa, SCL_ALL, FTAG); } /* diff --git a/module/zfs/zap.c b/module/zfs/zap.c index fac54eab0..59f62fa3a 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -946,6 +947,19 @@ fzap_prefetch(zap_name_t *zn) * Helper functions for consumers. */ +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx) == 0); + + return (new_obj); +} + int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index d5b97dac9..3d8cae0d3 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include <sys/zio.h> @@ -461,7 +461,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -585,7 +585,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c new file mode 100644 index 000000000..de9d165a9 --- /dev/null +++ b/module/zfs/zfeature.c @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/zfeature.h> +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include "zfeature_common.h" +#include <sys/spa_impl.h> + +/* + * ZFS Feature Flags + * ----------------- + * + * ZFS feature flags are used to provide fine-grained versioning to the ZFS + * on-disk format. Once enabled on a pool feature flags replace the old + * spa_version() number. + * + * Each new on-disk format change will be given a uniquely identifying string + * guid rather than a version number. This avoids the problem of different + * organizations creating new on-disk formats with the same version number. To + * keep feature guids unique they should consist of the reverse dns name of the + * organization which implemented the feature and a short name for the feature, + * separated by a colon (e.g. com.delphix:async_destroy). + * + * Reference Counts + * ---------------- + * + * Within each pool features can be in one of three states: disabled, enabled, + * or active. These states are differentiated by a reference count stored on + * disk for each feature: + * + * 1) If there is no reference count stored on disk the feature is disabled. + * 2) If the reference count is 0 a system administrator has enabled the + * feature, but the feature has not been used yet, so no on-disk + * format changes have been made. + * 3) If the reference count is greater than 0 the feature is active. + * The format changes required by the feature are currently on disk. + * Note that if the feature's format changes are reversed the feature + * may choose to set its reference count back to 0. + * + * Feature flags makes no differentiation between non-zero reference counts + * for an active feature (e.g. a reference count of 1 means the same thing as a + * reference count of 27834721), but feature implementations may choose to use + * the reference count to store meaningful information. 
For example, a new RAID + * implementation might set the reference count to the number of vdevs using + * it. If all those disks are removed from the pool the feature goes back to + * having a reference count of 0. + * + * It is the responsibility of the individual features to maintain a non-zero + * reference count as long as the feature's format changes are present on disk. + * + * Dependencies + * ------------ + * + * Each feature may depend on other features. The only effect of this + * relationship is that when a feature is enabled all of its dependencies are + * automatically enabled as well. Any future work to support disabling of + * features would need to ensure that features cannot be disabled if other + * enabled features depend on them. + * + * On-disk Format + * -------------- + * + * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES + * (5000). In order for this to work the pool is automatically upgraded to + * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk + * format changes will be in use. + * + * Information about features is stored in 3 ZAP objects in the pool's MOS. + * These objects are linked to by the following names in the pool directory + * object: + * + * 1) features_for_read: feature guid -> reference count + * Features needed to open the pool for reading. + * 2) features_for_write: feature guid -> reference count + * Features needed to open the pool for writing. + * 3) feature_descriptions: feature guid -> descriptive string + * A human readable string. + * + * All enabled features appear in either features_for_read or + * features_for_write, but not both. + * + * To open a pool in read-only mode only the features listed in + * features_for_read need to be supported. + * + * To open the pool in read-write mode features in both features_for_read and + * features_for_write need to be supported. 
+ * + * Some features may be required to read the ZAP objects containing feature + * information. To allow software to check for compatibility with these features + * before the pool is opened their names must be stored in the label in a + * new "features_for_read" entry (note that features that are only required + * to write to a pool never need to be stored in the label since the + * features_for_write ZAP object can be read before the pool is written to). + * To save space in the label features must be explicitly marked as needing to + * be written to the label. Also, reference counts are not stored in the label, + * instead any feature whose reference count drops to 0 is removed from the + * label. + * + * Adding New Features + * ------------------- + * + * Features must be registered in zpool_feature_init() function in + * zfeature_common.c using the zfeature_register() function. This function + * has arguments to specify if the feature should be stored in the + * features_for_read or features_for_write ZAP object and if it needs to be + * written to the label when active. + * + * Once a feature is registered it will appear as a "feature@<feature name>" + * property which can be set by an administrator. Feature implementors should + * use the spa_feature_is_enabled() and spa_feature_is_active() functions to + * query the state of a feature and the spa_feature_incr() and + * spa_feature_decr() functions to change an enabled feature's reference count. + * Reference counts may only be updated in the syncing context. + * + * Features may not perform enable-time initialization. Instead, any such + * initialization should occur when the feature is first used. This design + * enforces that on-disk changes be made only when features are used. Code + * should only check if a feature is enabled using spa_feature_is_enabled(), + * not by relying on any feature specific metadata existing. 
If a feature is + * enabled, but the feature's metadata is not on disk yet then it should be + * created as needed. + * + * As an example, consider the com.delphix:async_destroy feature. This feature + * relies on the existence of a bptree in the MOS that store blocks for + * asynchronous freeing. This bptree is not created when async_destroy is + * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is + * called to check if async_destroy is enabled. If it is and the bptree object + * does not exist yet, the bptree object is created as part of the dataset + * destroy and async_destroy's reference count is incremented to indicate it + * has made an on-disk format change. Later, after the destroyed dataset's + * blocks have all been asynchronously freed there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_ENABLE, + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the features active in the specified object are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. 
+ */ +boolean_t +feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, + nvlist_t *unsup_feat) +{ + boolean_t supported; + zap_cursor_t *zc; + zap_attribute_t *za; + char *buf; + + zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); + buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + supported = B_TRUE; + for (zap_cursor_init(zc, os, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + ASSERT(za->za_integer_length == sizeof (uint64_t) && + za->za_num_integers == 1); + + if (za->za_first_integer != 0 && + !zfeature_is_supported(za->za_name)) { + supported = B_FALSE; + + if (unsup_feat != NULL) { + char *desc = ""; + + if (zap_lookup(os, desc_obj, za->za_name, + 1, MAXPATHLEN, buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, + za->za_name, desc) == 0); + } + } + } + zap_cursor_fini(zc); + + kmem_free(buf, MAXPATHLEN); + kmem_free(za, sizeof(zap_attribute_t)); + kmem_free(zc, sizeof(zap_cursor_t)); + + return (supported); +} + +static int +feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, + zfeature_info_t *feature, uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + + err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1, + &refcount); + if (err != 0) { + if (err == ENOENT) + return (ENOTSUP); + else + return (err); + } + *res = refcount; + return (0); +} + +static int +feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, + uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action, + dmu_tx_t *tx) +{ + int error; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? 
write_obj : read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + error = zap_lookup(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount); + + /* + * If we can't ascertain the status of the specified feature, an I/O + * error occurred. + */ + if (error != 0 && error != ENOENT) + return (error); + + switch (action) { + case FEATURE_ACTION_ENABLE: + /* + * If the feature is already enabled, ignore the request. + */ + if (error == 0) + return (0); + refcount = 0; + break; + case FEATURE_ACTION_INCR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == UINT64_MAX) + return (EOVERFLOW); + refcount++; + break; + case FEATURE_ACTION_DECR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == 0) + return (EOVERFLOW); + refcount--; + break; + default: + ASSERT(0); + break; + } + + if (action == FEATURE_ACTION_ENABLE) { + int i; + + for (i = 0; feature->fi_depends[i] != NULL; i++) { + zfeature_info_t *dep = feature->fi_depends[i]; + + error = feature_do_action(os, read_obj, write_obj, + desc_obj, dep, FEATURE_ACTION_ENABLE, tx); + if (error != 0) + return (error); + } + } + + error = zap_update(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx); + if (error != 0) + return (error); + + if (action == FEATURE_ACTION_ENABLE) { + error = zap_update(os, desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx); + if (error != 0) + return (error); + } + + if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) { + spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid); + } + + if (action == FEATURE_ACTION_DECR && refcount == 0) { + spa_deactivate_mos_feature(dmu_objset_spa(os), + feature->fi_guid); + } + + return (0); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. 
+ */ + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && + tx->tx_txg == TXG_INITIAL)); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. + */ +void +spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx)); +} + +/* + * Increments the feature's refcount. The feature must already be enabled and + * the refcount must not overflow; otherwise feature_do_action() fails with + * ENOTSUP or EOVERFLOW and the VERIFY3U below trips (nothing is returned). + * This function must be called from syncing context. + */ +void +spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx)); +} + +/* + * Decrements the feature's refcount. The feature must already be enabled and + * the refcount must be nonzero; otherwise feature_do_action() fails with + * ENOTSUP or EOVERFLOW and the VERIFY3U below trips (nothing is returned). + * This function must be called from syncing context. 
+ */ +void +spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx)); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount = 0; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount = 0; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c new file mode 100644 index 000000000..33d15133e --- /dev/null +++ b/module/zfs/zfeature_common.c @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <errno.h> +#include <string.h> +#endif +#include <sys/debug.h> +#include <sys/fs/zfs.h> +#include <sys/inttypes.h> +#include <sys/types.h> +#include "zfeature_common.h" + +/* + * Set to disable all feature checks while opening pools, allowing pools with + * unsupported features to be opened. Set for testing only. + */ +boolean_t zfeature_checks_disable = B_FALSE; + +zfeature_info_t spa_feature_table[SPA_FEATURES]; + +/* + * Valid characters for feature guids. This list is mainly for aesthetic + * purposes and could be expanded in the future. Lowercase letters and digits + * are allowed everywhere; '.' is allowed only in the guid's reverse DNS + * portion (before the colon) and '_' only in its short name (after the + * colon). + */ +static int +valid_char(char c, boolean_t after_colon) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == (after_colon ? '_' : '.')); +} + +/* + * Every feature guid must contain exactly one colon which separates a reverse + * DNS organization name from the feature's "short" name (e.g. + * "com.company:feature_name").
+ */ +boolean_t +zfeature_is_valid_guid(const char *name) +{ + int i; + boolean_t has_colon = B_FALSE; + + i = 0; + while (name[i] != '\0') { + char c = name[i++]; + if (c == ':') { + if (has_colon) + return (B_FALSE); + has_colon = B_TRUE; + continue; + } + if (!valid_char(c, has_colon)) + return (B_FALSE); + } + + return (has_colon); +} + +boolean_t +zfeature_is_supported(const char *guid) +{ + if (zfeature_checks_disable) + return (B_TRUE); + + return (0 == zfeature_lookup_guid(guid, NULL)); +} + +int +zfeature_lookup_guid(const char *guid, zfeature_info_t **res) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(guid, feature->fi_guid) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +int +zfeature_lookup_name(const char *name, zfeature_info_t **res) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(name, feature->fi_uname) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +static void +zfeature_register(int fid, const char *guid, const char *name, const char *desc, + boolean_t readonly, boolean_t mos, zfeature_info_t **deps) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + static zfeature_info_t *nodeps[] = { NULL }; + + ASSERT(name != NULL); + ASSERT(desc != NULL); + ASSERT(!readonly || !mos); + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(zfeature_is_valid_guid(guid)); + + if (deps == NULL) + deps = nodeps; + + feature->fi_guid = guid; + feature->fi_uname = name; + feature->fi_desc = desc; + feature->fi_can_readonly = readonly; + feature->fi_mos = mos; + feature->fi_depends = deps; +} + +void +zpool_feature_init(void) +{ + zfeature_register(SPA_FEATURE_ASYNC_DESTROY, + "com.delphix:async_destroy", "async_destroy", + "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); +} diff --git a/module/zfs/zfs_ioctl.c 
b/module/zfs/zfs_ioctl.c index c609203ea..98b1dd794 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska @@ -1107,6 +1108,8 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) /* * Find a zfs_sb_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all inode ops from running. */ static int zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer) @@ -1170,7 +1173,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { error = EINVAL; goto pool_props_bad; } @@ -1297,6 +1300,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { @@ -1398,7 +1410,8 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (EINVAL); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index bfb817b78..66f228bc7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -704,7 +704,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - zp->zp_type < DMU_OT_NUMTYPES && + 
DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa) && @@ -988,7 +988,7 @@ zio_read_bp_init(zio_t *zio) zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } - if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) @@ -3131,6 +3131,48 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_done }; +/* dnp is the dnode for zb1->zb_object */ +boolean_t +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + #if defined(_KERNEL) && defined(HAVE_SPL) /* Fault injection */ EXPORT_SYMBOL(zio_injection_enabled); |