From 9ae529ec5dbdc828ff8326beae58062971d74b2e Mon Sep 17 00:00:00 2001 From: Christopher Siden Date: Thu, 13 Dec 2012 15:24:15 -0800 Subject: Illumos #2619 and #2747 2619 asynchronous destruction of ZFS file systems 2747 SPA versioning with zfs feature flags Reviewed by: Matt Ahrens Reviewed by: George Wilson Reviewed by: Richard Lowe Reviewed by: Dan Kruchinin Approved by: Eric Schrock References: illumos/illumos-gate@53089ab7c84db6fb76c16ca50076c147cda11757 illumos/illumos-gate@ad135b5d644628e791c3188a6ecbd9c257961ef8 illumos changeset: 13700:2889e2596bd6 https://www.illumos.org/issues/2619 https://www.illumos.org/issues/2747 NOTE: The grub specific changes were not ported. This change must be made to the Linux grub packages. Ported-by: Brian Behlendorf --- module/nvpair/Makefile.in | 1 + module/nvpair/fnvpair.c | 566 +++++++++++++++++++++++++++++++++++++++++++ module/zcommon/zpool_prop.c | 24 ++ module/zfs/Makefile.in | 3 + module/zfs/arc.c | 4 +- module/zfs/bptree.c | 224 +++++++++++++++++ module/zfs/dbuf.c | 3 +- module/zfs/ddt.c | 9 +- module/zfs/dmu.c | 126 +++++----- module/zfs/dmu_send.c | 17 +- module/zfs/dmu_traverse.c | 107 +++++++- module/zfs/dmu_tx.c | 5 +- module/zfs/dnode.c | 19 +- module/zfs/dnode_sync.c | 4 +- module/zfs/dsl_dataset.c | 185 +++++++++----- module/zfs/dsl_deleg.c | 8 +- module/zfs/dsl_pool.c | 55 +++-- module/zfs/dsl_scan.c | 136 +++++------ module/zfs/sa.c | 16 +- module/zfs/spa.c | 467 ++++++++++++++++++++++++++++++----- module/zfs/spa_config.c | 9 +- module/zfs/spa_misc.c | 57 ++++- module/zfs/vdev.c | 7 +- module/zfs/vdev_label.c | 102 ++++++-- module/zfs/zap.c | 14 ++ module/zfs/zap_micro.c | 6 +- module/zfs/zfeature.c | 422 ++++++++++++++++++++++++++++++++ module/zfs/zfeature_common.c | 160 ++++++++++++ module/zfs/zfs_ioctl.c | 17 +- module/zfs/zio.c | 46 +++- 30 files changed, 2454 insertions(+), 365 deletions(-) create mode 100644 module/nvpair/fnvpair.c create mode 100644 module/zfs/bptree.c create mode 100644 module/zfs/zfeature.c create mode 100644 module/zfs/zfeature_common.c (limited to 'module') diff --git a/module/nvpair/Makefile.in b/module/nvpair/Makefile.in index b53381f6a..211fc726d 100644 --- a/module/nvpair/Makefile.in +++ b/module/nvpair/Makefile.in @@ -5,5 +5,6 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair.o +$(MODULE)-objs += @top_srcdir@/module/nvpair/fnvpair.o $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_spl.o $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_fixed.o diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c new file mode 100644 index 000000000..fa28afc75 --- /dev/null +++ b/module/nvpair/fnvpair.c @@ -0,0 +1,566 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#ifndef _KERNEL +#include +#endif + +/* + * "Force" nvlist wrapper. + * + * These functions wrap the nvlist_* functions with assertions that assume + * the operation is successful. This allows the caller's code to be much + * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* + * functions, which can return the requested value (rather than filling in + * a pointer). + * + * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate + * with KM_SLEEP. + * + * More wrappers should be added as needed -- for example + * nvlist_lookup_*_array and nvpair_value_*_array. + */ + +nvlist_t * +fnvlist_alloc(void) +{ + nvlist_t *nvl; + VERIFY3U(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP), ==, 0); + return (nvl); +} + +void +fnvlist_free(nvlist_t *nvl) +{ + nvlist_free(nvl); +} + +size_t +fnvlist_size(nvlist_t *nvl) +{ + size_t size; + VERIFY3U(nvlist_size(nvl, &size, NV_ENCODE_NATIVE), ==, 0); + return (size); +} + +/* + * Returns allocated buffer of size *sizep. Caller must free the buffer with + * fnvlist_pack_free(). + */ +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + char *packed = 0; + VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + return (packed); +} + +/*ARGSUSED*/ +void +fnvlist_pack_free(char *pack, size_t size) +{ +#ifdef _KERNEL + kmem_free(pack, size); +#else + free(pack); +#endif +} + +nvlist_t * +fnvlist_unpack(char *buf, size_t buflen) +{ + nvlist_t *rv; + VERIFY3U(nvlist_unpack(buf, buflen, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_dup(nvlist_t *nvl) +{ + nvlist_t *rv; + VERIFY3U(nvlist_dup(nvl, &rv, KM_SLEEP), ==, 0); + return (rv); +} + +void +fnvlist_merge(nvlist_t *dst, nvlist_t *src) +{ + VERIFY3U(nvlist_merge(dst, src, KM_SLEEP), ==, 0); +} + +void +fnvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_add_boolean(nvl, name), ==, 0); +} + +void +fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + VERIFY3U(nvlist_add_boolean_value(nvl, name, val), ==, 0); +} + +void +fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + VERIFY3U(nvlist_add_byte(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + VERIFY3U(nvlist_add_int8(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + VERIFY3U(nvlist_add_uint8(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + VERIFY3U(nvlist_add_int16(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + VERIFY3U(nvlist_add_uint16(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + VERIFY3U(nvlist_add_int32(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + VERIFY3U(nvlist_add_uint32(nvl, name, val), ==, 0); +} + +void +fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + VERIFY3U(nvlist_add_int64(nvl, name, val), ==, 0); +} + +void +fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + VERIFY3U(nvlist_add_uint64(nvl, name, val), ==, 0); +} + +void +fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + VERIFY3U(nvlist_add_string(nvl, name, val), ==, 0); +} + +void +fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + VERIFY3U(nvlist_add_nvlist(nvl, name, val), ==, 0); +} + +void +fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_add_nvpair(nvl, pair), ==, 0); +} + +void +fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_boolean_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_byte_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint8_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, + uint16_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint16_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, + uint32_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint32_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_int64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, + uint64_t *val, uint_t n) +{ + VERIFY3U(nvlist_add_uint64_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *val, uint_t n) +{ + VERIFY3U(nvlist_add_string_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t **val, uint_t n) +{ + VERIFY3U(nvlist_add_nvlist_array(nvl, name, val, n), ==, 0); +} + +void +fnvlist_remove(nvlist_t *nvl, const char *name) +{ + VERIFY3U(nvlist_remove_all(nvl, name), ==, 0); +} + +void +fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY3U(nvlist_remove_nvpair(nvl, pair), ==, 0); +} + +nvpair_t * +fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) +{ + nvpair_t *rv; + VERIFY3U(nvlist_lookup_nvpair(nvl, name, &rv), ==, 0); + return (rv); +} + +/* returns B_TRUE if the entry exists */ +boolean_t +fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_boolean(nvl, name) == 0); +} + +boolean_t +fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) +{ + boolean_t rv; + VERIFY3U(nvlist_lookup_boolean_value(nvl, name, &rv), ==, 0); + return (rv); +} + +uchar_t +fnvlist_lookup_byte(nvlist_t *nvl, const char *name) +{ + uchar_t rv; + VERIFY3U(nvlist_lookup_byte(nvl, name, &rv), ==, 0); + return (rv); +} + +int8_t +fnvlist_lookup_int8(nvlist_t *nvl, const char *name) +{ + int8_t rv; + VERIFY3U(nvlist_lookup_int8(nvl, name, &rv), ==, 0); + return (rv); +} + +int16_t +fnvlist_lookup_int16(nvlist_t *nvl, const char *name) +{ + int16_t rv; + VERIFY3U(nvlist_lookup_int16(nvl, name, &rv), ==, 0); + return (rv); +} + +int32_t +fnvlist_lookup_int32(nvlist_t *nvl, const char *name) +{ + int32_t rv; + VERIFY3U(nvlist_lookup_int32(nvl, name, &rv), ==, 0); + return (rv); +} + +int64_t +fnvlist_lookup_int64(nvlist_t *nvl, const char *name) +{ + int64_t rv; + VERIFY3U(nvlist_lookup_int64(nvl, name, &rv), ==, 0); + return (rv); +} + +uint8_t +fnvlist_lookup_uint8(nvlist_t *nvl, const char *name) +{ + uint8_t rv; + VERIFY3U(nvlist_lookup_uint8(nvl, name, &rv), ==, 0); + return (rv); +} + +uint16_t +fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) +{ + uint16_t rv; + VERIFY3U(nvlist_lookup_uint16(nvl, name, &rv), ==, 0); + return (rv); +} + +uint32_t +fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) +{ + uint32_t rv; + VERIFY3U(nvlist_lookup_uint32(nvl, name, &rv), ==, 0); + return (rv); +} + +uint64_t +fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) +{ + uint64_t rv; + VERIFY3U(nvlist_lookup_uint64(nvl, name, &rv), ==, 0); + return (rv); +} + +char * +fnvlist_lookup_string(nvlist_t *nvl, const char *name) +{ + char *rv; + VERIFY3U(nvlist_lookup_string(nvl, name, &rv), ==, 0); + return (rv); +} + +nvlist_t * +fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) +{ + nvlist_t *rv; + VERIFY3U(nvlist_lookup_nvlist(nvl, name, &rv), ==, 0); + return (rv); +} + +boolean_t +fnvpair_value_boolean_value(nvpair_t *nvp) +{ + boolean_t rv; + VERIFY3U(nvpair_value_boolean_value(nvp, &rv), ==, 0); + return (rv); +} + +uchar_t +fnvpair_value_byte(nvpair_t *nvp) +{ + uchar_t rv; + VERIFY3U(nvpair_value_byte(nvp, &rv), ==, 0); + return (rv); +} + +int8_t +fnvpair_value_int8(nvpair_t *nvp) +{ + int8_t rv; + VERIFY3U(nvpair_value_int8(nvp, &rv), ==, 0); + return (rv); +} + +int16_t +fnvpair_value_int16(nvpair_t *nvp) +{ + int16_t rv; + VERIFY3U(nvpair_value_int16(nvp, &rv), ==, 0); + return (rv); +} + +int32_t +fnvpair_value_int32(nvpair_t *nvp) +{ + int32_t rv; + VERIFY3U(nvpair_value_int32(nvp, &rv), ==, 0); + return (rv); +} + +int64_t +fnvpair_value_int64(nvpair_t *nvp) +{ + int64_t rv; + VERIFY3U(nvpair_value_int64(nvp, &rv), ==, 0); + return (rv); +} + +uint8_t +fnvpair_value_uint8(nvpair_t *nvp) +{ + uint8_t rv; + VERIFY3U(nvpair_value_uint8(nvp, &rv), ==, 0); + return (rv); +} + +uint16_t +fnvpair_value_uint16(nvpair_t *nvp) +{ + uint16_t rv; + VERIFY3U(nvpair_value_uint16(nvp, &rv), ==, 0); + return (rv); +} + +uint32_t +fnvpair_value_uint32(nvpair_t *nvp) +{ + uint32_t rv; + VERIFY3U(nvpair_value_uint32(nvp, &rv), ==, 0); + return (rv); +} + +uint64_t +fnvpair_value_uint64(nvpair_t *nvp) +{ + uint64_t rv; + VERIFY3U(nvpair_value_uint64(nvp, &rv), ==, 0); + return (rv); +} + +char * +fnvpair_value_string(nvpair_t *nvp) +{ + char *rv; + VERIFY3U(nvpair_value_string(nvp, &rv), ==, 0); + return (rv); +} + +nvlist_t * +fnvpair_value_nvlist(nvpair_t *nvp) +{ + nvlist_t *rv; + VERIFY3U(nvpair_value_nvlist(nvp, &rv), ==, 0); + return (rv); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) + +EXPORT_SYMBOL(fnvlist_alloc); +EXPORT_SYMBOL(fnvlist_free); +EXPORT_SYMBOL(fnvlist_size); +EXPORT_SYMBOL(fnvlist_pack); +EXPORT_SYMBOL(fnvlist_unpack); +EXPORT_SYMBOL(fnvlist_dup); +EXPORT_SYMBOL(fnvlist_merge); + +EXPORT_SYMBOL(fnvlist_add_nvpair); +EXPORT_SYMBOL(fnvlist_add_boolean); +EXPORT_SYMBOL(fnvlist_add_boolean_value); +EXPORT_SYMBOL(fnvlist_add_byte); +EXPORT_SYMBOL(fnvlist_add_int8); +EXPORT_SYMBOL(fnvlist_add_uint8); +EXPORT_SYMBOL(fnvlist_add_int16); +EXPORT_SYMBOL(fnvlist_add_uint16); +EXPORT_SYMBOL(fnvlist_add_int32); +EXPORT_SYMBOL(fnvlist_add_uint32); +EXPORT_SYMBOL(fnvlist_add_int64); +EXPORT_SYMBOL(fnvlist_add_uint64); +EXPORT_SYMBOL(fnvlist_add_string); +EXPORT_SYMBOL(fnvlist_add_nvlist); +EXPORT_SYMBOL(fnvlist_add_boolean_array); +EXPORT_SYMBOL(fnvlist_add_byte_array); +EXPORT_SYMBOL(fnvlist_add_int8_array); +EXPORT_SYMBOL(fnvlist_add_uint8_array); +EXPORT_SYMBOL(fnvlist_add_int16_array); +EXPORT_SYMBOL(fnvlist_add_uint16_array); +EXPORT_SYMBOL(fnvlist_add_int32_array); +EXPORT_SYMBOL(fnvlist_add_uint32_array); +EXPORT_SYMBOL(fnvlist_add_int64_array); +EXPORT_SYMBOL(fnvlist_add_uint64_array); +EXPORT_SYMBOL(fnvlist_add_string_array); +EXPORT_SYMBOL(fnvlist_add_nvlist_array); + +EXPORT_SYMBOL(fnvlist_remove); +EXPORT_SYMBOL(fnvlist_remove_nvpair); + +EXPORT_SYMBOL(fnvlist_lookup_nvpair); +EXPORT_SYMBOL(fnvlist_lookup_boolean); +EXPORT_SYMBOL(fnvlist_lookup_boolean_value); +EXPORT_SYMBOL(fnvlist_lookup_byte); +EXPORT_SYMBOL(fnvlist_lookup_int8); +EXPORT_SYMBOL(fnvlist_lookup_uint8); +EXPORT_SYMBOL(fnvlist_lookup_int16); +EXPORT_SYMBOL(fnvlist_lookup_uint16); +EXPORT_SYMBOL(fnvlist_lookup_int32); +EXPORT_SYMBOL(fnvlist_lookup_uint32); +EXPORT_SYMBOL(fnvlist_lookup_int64); +EXPORT_SYMBOL(fnvlist_lookup_uint64); +EXPORT_SYMBOL(fnvlist_lookup_string); +EXPORT_SYMBOL(fnvlist_lookup_nvlist); + +EXPORT_SYMBOL(fnvpair_value_boolean_value); +EXPORT_SYMBOL(fnvpair_value_byte); +EXPORT_SYMBOL(fnvpair_value_int8); +EXPORT_SYMBOL(fnvpair_value_uint8); +EXPORT_SYMBOL(fnvpair_value_int16); +EXPORT_SYMBOL(fnvpair_value_uint16); +EXPORT_SYMBOL(fnvpair_value_int32); +EXPORT_SYMBOL(fnvpair_value_uint32); +EXPORT_SYMBOL(fnvpair_value_int64); +EXPORT_SYMBOL(fnvpair_value_uint64); +EXPORT_SYMBOL(fnvpair_value_string); +EXPORT_SYMBOL(fnvpair_value_nvlist); + +#endif diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 303edcefc..1173fc0c9 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -79,6 +79,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "", "SIZE"); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREE"); + zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "FREEING"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, @@ -170,6 +172,26 @@ zpool_prop_default_numeric(zpool_prop_t prop) return (zpool_prop_table[prop].pd_numdefault); } +/* + * Returns true if this is a valid feature@ property. + */ +boolean_t +zpool_prop_feature(const char *name) +{ + static const char *prefix = "feature@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* + * Returns true if this is a valid unsupported@ property. + */ +boolean_t +zpool_prop_unsupported(const char *name) +{ + static const char *prefix = "unsupported@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) @@ -223,6 +245,8 @@ EXPORT_SYMBOL(zpool_prop_to_name); EXPORT_SYMBOL(zpool_prop_default_string); EXPORT_SYMBOL(zpool_prop_default_numeric); EXPORT_SYMBOL(zpool_prop_readonly); +EXPORT_SYMBOL(zpool_prop_feature); +EXPORT_SYMBOL(zpool_prop_unsupported); EXPORT_SYMBOL(zpool_prop_index_to_string); EXPORT_SYMBOL(zpool_prop_string_to_index); #endif diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 98576d1d2..f1e32a19d 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o +$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o $(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o @@ -59,6 +60,8 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o $(MODULE)-objs += @top_srcdir@/module/zfs/zap.o $(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o $(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o +$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature.o +$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature_common.o $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6ec9f04b7..5d9b34fbf 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2729,9 +2729,11 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + dmu_ot_byteswap[bswap].ob_func; func(buf->b_data, hdr->b_size); } diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c new file mode 100644 index 000000000..8c5a7d40e --- /dev/null +++ b/module/zfs/bptree.c @@ -0,0 +1,224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. + */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. + */ + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + bt = db->db_data; + bt->bt_begin = 0; + bt->bt_end = 0; + bt->bt_bytes = 0; + bt->bt_comp = 0; + bt->bt_uncomp = 0; + dmu_buf_rele(db, FTAG); + + return (obj); +} + +int +bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + ASSERT3U(bt->bt_begin, ==, bt->bt_end); + ASSERT3U(bt->bt_bytes, ==, 0); + ASSERT3U(bt->bt_comp, ==, 0); + ASSERT3U(bt->bt_uncomp, ==, 0); + dmu_buf_rele(db, FTAG); + + return (dmu_object_free(os, obj, tx)); +} + +void +bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + bptree_entry_phys_t bte; + + /* + * bptree objects are in the pool mos, therefore they can only be + * modified in syncing context. Furthermore, this is only modified + * by the sync thread, so no locking is necessary. + */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte.be_birth_txg = birth_txg; + bte.be_bp = *bp; + bzero(&bte.be_zb, sizeof (bte.be_zb)); + dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (bp == NULL) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + + ASSERT(!free || i == ba.ba_phys->bt_begin); + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST, + bptree_visit_cb, &ba); + if (free) { + ASSERT(err == 0 || err == ERESTART); + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT3U(bte.be_zb.zb_level, ==, 0); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + break; + } else { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } + } + + ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + ASSERT3U(ba.ba_phys->bt_bytes, ==, 0); + ASSERT3U(ba.ba_phys->bt_comp, ==, 0); + ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1f6fa9340..99f728f4b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -260,7 +261,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) boolean_t is_metadata; DB_DNODE_ENTER(db); - is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 6221157f8..ef868619a 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -1120,11 +1121,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object, tx) == 0); + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); } while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 00a7a07f4..5d3f70d4c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -46,60 +47,73 @@ #endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { byteswap_uint8_array, TRUE, "unallocated" }, - { zap_byteswap, TRUE, "object directory" }, - { byteswap_uint64_array, TRUE, "object array" }, - { byteswap_uint8_array, TRUE, "packed nvlist" }, - { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bpobj" }, - { byteswap_uint64_array, TRUE, "bpobj header" }, - { byteswap_uint64_array, TRUE, "SPA space map header" }, - { byteswap_uint64_array, TRUE, "SPA space map" }, - { byteswap_uint64_array, TRUE, "ZIL intent log" }, - { dnode_buf_byteswap, TRUE, "DMU dnode" }, - { dmu_objset_byteswap, TRUE, "DMU objset" }, - { byteswap_uint64_array, TRUE, "DSL directory" }, - { zap_byteswap, TRUE, "DSL directory child map"}, - { zap_byteswap, TRUE, "DSL dataset snap map" }, - { zap_byteswap, TRUE, "DSL props" }, - { byteswap_uint64_array, TRUE, "DSL dataset" }, - { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, - { byteswap_uint8_array, FALSE, "ZFS plain file" }, - { zap_byteswap, TRUE, "ZFS directory" }, - { zap_byteswap, TRUE, "ZFS master node" }, - { zap_byteswap, TRUE, "ZFS delete queue" }, - { byteswap_uint8_array, FALSE, "zvol object" }, - { zap_byteswap, TRUE, "zvol prop" }, - { byteswap_uint8_array, FALSE, "other uint8[]" }, - { byteswap_uint64_array, FALSE, "other uint64[]" }, - { zap_byteswap, TRUE, "other ZAP" }, - { zap_byteswap, TRUE, "persistent error log" }, - { byteswap_uint8_array, TRUE, "SPA history" }, - { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, - { zap_byteswap, TRUE, "DSL permissions" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, - { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, - { byteswap_uint8_array, TRUE, "FUID table" }, - { byteswap_uint64_array, TRUE, "FUID table size" }, - { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scan work queue" }, - { zap_byteswap, TRUE, "ZFS user/group used" }, - { zap_byteswap, TRUE, "ZFS user/group quota" }, - { zap_byteswap, TRUE, "snapshot refcount tags"}, - { zap_byteswap, TRUE, "DDT ZAP algorithm" }, - { zap_byteswap, TRUE, "DDT statistics" }, - { byteswap_uint8_array, TRUE, "System attributes" }, - { zap_byteswap, TRUE, "SA master node" }, - { zap_byteswap, TRUE, "SA attr registration" }, - { zap_byteswap, TRUE, "SA attr layouts" }, - { zap_byteswap, TRUE, "scan translations" }, - { byteswap_uint8_array, FALSE, "deduplicated block" }, - { zap_byteswap, TRUE, "DSL deadlist map" }, - { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, - { zap_byteswap, TRUE, "DSL dir clones" }, - { byteswap_uint64_array, TRUE, "bpobj subobj" }, + { DMU_BSWAP_UINT8, TRUE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, + { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, + { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + { zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } }; int @@ -176,7 +190,7 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (type > DMU_OT_NUMTYPES) { + if (!DMU_OT_IS_VALID(type)) { error = EINVAL; } else if (dn->dn_bonus != db) { error = EINVAL; @@ -1695,7 +1709,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 949f4d773..97c23cb07 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1077,8 +1077,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) void *data = NULL; if (drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || @@ -1143,7 +1143,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); @@ -1186,7 +1188,7 @@ restore_write(struct restorearg *ra, objset_t *os, int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) + !DMU_OT_IS_VALID(drrw->drr_type)) return (EINVAL); data = restore_read(ra, drrw->drr_length); @@ -1205,8 +1207,11 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + if (ra->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); + } dmu_write(os, drrw->drr_object, drrw->drr_offset, drrw->drr_length, data, tx); dmu_tx_commit(tx); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 376f60f82..980c1aa2f 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -53,6 +54,7 @@ typedef struct traverse_data { uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; + zbookmark_t *td_resume; int td_flags; prefetch_data_t *td_pfd; blkptr_cb_t *td_func; @@ -128,6 +130,54 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh) zil_free(zilog); } +typedef enum resume_skip { + RESUME_SKIP_ALL, + RESUME_SKIP_NONE, + RESUME_SKIP_CHILDREN +} resume_skip_t; + +/* + * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and + * the block indicated by zb does not need to be visited at all. Returns + * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the + * resume point. This indicates that this block should be visited but not its + * children (since they must have been visited in a previous traversal). + * Otherwise returns RESUME_SKIP_NONE. + */ +static resume_skip_t +resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + /* + * If we already visited this bp & everything below, + * don't bother doing it again. + */ + if (zbookmark_is_before(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. + */ + ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_pause(traverse_data_t *td, const zbookmark_t *zb) +{ + ASSERT(td->td_resume != NULL); + ASSERT3U(zb->zb_level, ==, 0); + bcopy(zb, td->td_resume, sizeof (*td->td_resume)); +} + static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) @@ -137,8 +187,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; + boolean_t pause = B_FALSE; + + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); + } - if (bp->blk_birth == 0) { + if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, td->td_arg); return (err); @@ -164,8 +226,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); - if (err) - return (err); + if (err == ERESTART) + pause = B_TRUE; /* handle pausing at a common point */ + if (err != 0) + goto post; } if (BP_GET_LEVEL(bp) > 0) { @@ -253,9 +317,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); +post: if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, td->td_arg); + if (err == ERESTART) + pause = B_TRUE; + } + + if (pause && td->td_resume != NULL) { + ASSERT3U(err, ==, ERESTART); + ASSERT(!hard); + traverse_pause(td, zb); } return (err != 0 ? err : lasterr); @@ -353,22 +426,27 @@ traverse_prefetch_thread(void *arg) * in syncing context). */ static int -traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) { traverse_data_t *td; prefetch_data_t *pd; zbookmark_t *czb; int err; + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE); pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE); czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE); td->td_spa = spa; - td->td_objset = ds ? ds->ds_object : 0; + td->td_objset = objset; td->td_rootbp = rootbp; td->td_min_txg = txg_start; + td->td_resume = resume; td->td_func = func; td->td_arg = arg; td->td_pfd = pd; @@ -424,8 +502,17 @@ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); } /* @@ -442,8 +529,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), - txg_start, flags, func, arg); + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); if (err) return (err); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 81c6dfea2..47ec4c109 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -20,9 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -693,7 +692,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { blkptr_t *bp; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 99ac62565..3a8a5e32e 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -194,7 +195,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; @@ -212,7 +213,7 @@ dnode_verify(dnode_t *dn) ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); @@ -278,8 +279,10 @@ dnode_byteswap(dnode_phys_t *dnp) */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; - ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); - dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + dmu_object_byteswap_t byteswap; + ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); + byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); + dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ @@ -407,7 +410,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); @@ -496,11 +499,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); - ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT3U(dn->dn_maxblkid, ==, 0); @@ -568,7 +571,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index f2dda867d..58fa473cb 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -598,7 +600,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) } if (dn->dn_next_bonustype[txgoff]) { - ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 21fdd081c..872d44afb 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -102,7 +103,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { /* * Account for the meta-objset space in its placeholder @@ -119,7 +120,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); - ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_referenced_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; @@ -215,8 +216,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } } mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); - ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); + ds->ds_phys->ds_referenced_bytes -= used; ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ds->ds_phys->ds_compressed_bytes -= compressed; ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); @@ -823,8 +824,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - origin->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = + origin->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = origin->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = @@ -938,7 +939,6 @@ dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { dsl_dataset_t *ds; - int err; err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); if (err == 0) { @@ -1088,19 +1088,23 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) goto out_free; /* - * remove the objects in open context, so that we won't - * have too much to do in syncing context. + * If async destruction is not enabled try to remove all objects + * while in the open context so that there is less work to do in + * the syncing context. */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); + if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, + ds->ds_phys->ds_prev_snap_txg)) { + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); + } + if (err != ESRCH) + goto out_free; } - if (err != ESRCH) - goto out_free; /* * Only the ZIL knows how to free log blocks. @@ -1261,7 +1265,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; else mrs_used = 0; @@ -1269,7 +1273,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -1627,12 +1631,36 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, ds_next->ds_phys->ds_deadlist_obj); } +static int +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + int err; + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + err = traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); + + return (err); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; - int err; + int err = 0; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; @@ -1773,7 +1801,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); - dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, @@ -1849,32 +1876,54 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } dsl_dataset_rele(ds_next, FTAG); } else { + zfeature_info_t *async_destroy = + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + /* * There's no next snapshot, so this is a head dataset. * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ - struct killarg ka; - dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. - */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - ASSERT3U(err, ==, 0); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == 0); + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { + err = old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. + */ + uint64_t used, comp, uncomp; + + ASSERT(err == 0 || err == EBUSY); + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { + spa_feature_incr(dp->dp_spa, async_destroy, tx); + dp->dp_bptree_obj = bptree_alloc( + dp->dp_meta_objset, tx); + VERIFY(zap_add(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx) == 0); + } + + used = ds->ds_dir->dd_phys->dd_used_bytes; + comp = ds->ds_dir->dd_phys->dd_compressed_bytes; + uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + ds->ds_phys->ds_unique_bytes == used); + + bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj, + &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + used, comp, uncomp, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { @@ -2065,7 +2114,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_flags = ds->ds_phys->ds_flags; @@ -2189,10 +2238,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; + /* + * Even though we hold the dp_config_rwlock, the dataset + * may fail to open, returning ENOENT. If there is a + * thread concurrently attempting to destroy this + * dataset, it will have the ds_rwlock held for + * RW_WRITER. Our call to dsl_dataset_hold_obj() -> + * dsl_dataset_hold_ref() will fail its + * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the + * dp_config_rwlock, and wait for the destroy progress + * and signal ds_exclusive_cv. If the destroy was + * successful, we will see that + * DSL_DATASET_IS_DESTROYED(), and return ENOENT. + */ if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone) != 0) { - goto fail; - } + za.za_first_integer, FTAG, &clone) != 0) + continue; dsl_dir_name(clone->ds_dir, buf); VERIFY(nvlist_add_boolean(val, buf) == 0); dsl_dataset_rele(clone, FTAG); @@ -2316,7 +2377,7 @@ dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - *refdbytesp = ds->ds_phys->ds_used_bytes; + *refdbytesp = ds->ds_phys->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; @@ -2652,7 +2713,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->used = origin_ds->ds_phys->ds_referenced_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; for (snap = list_head(&pa->shared_snaps); snap; @@ -2686,7 +2747,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * so we need to subtract out the clone origin's used space. */ if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } @@ -3203,8 +3264,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsl_deadlist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); - dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - - (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - + (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + @@ -3233,8 +3294,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) } /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_used_bytes, - csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, + csa->cds->ds_phys->ds_referenced_bytes); SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, csa->cds->ds_phys->ds_compressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, @@ -3363,8 +3424,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, * on-disk is over quota and there are no pending changes (which * may free up space for us). */ - if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { - if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || + ds->ds_phys->ds_referenced_bytes < ds->ds_quota) error = ERESTART; else error = EDQUOT; @@ -3393,7 +3455,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) if (psa->psa_effective_value == 0) return (0); - if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); @@ -4141,8 +4203,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_pool_t *dp = new->ds_dir->dd_pool; *usedp = 0; - *usedp += new->ds_phys->ds_used_bytes; - *usedp -= oldsnap->ds_phys->ds_used_bytes; + *usedp += new->ds_phys->ds_referenced_bytes; + *usedp -= oldsnap->ds_phys->ds_referenced_bytes; *compp = 0; *compp += new->ds_phys->ds_compressed_bytes; @@ -4158,9 +4220,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_dataset_t *snap; uint64_t used, comp, uncomp; - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { @@ -4189,7 +4255,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, * was not a snapshot of/before new. */ snapobj = snap->ds_phys->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); + if (snap != new) + dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index a4d4e42da..294932c45 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -171,10 +171,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, - DMU_OT_NONE, 0, tx); - VERIFY(zap_update(mos, zapobj, - whokey, 8, 1, &jumpobj, tx) == 0); + jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, + zapobj, whokey, tx); } while ((permpair = nvlist_next_nvpair(perms, permpair))) { diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 7e0fba589..85c745e8a 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -40,6 +40,8 @@ #include #include #include +#include +#include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -240,20 +242,30 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) } int -dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); + if (err != 0) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, - &dp->dp_meta_objset); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); @@ -269,7 +281,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; @@ -286,7 +298,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; } - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) @@ -300,6 +312,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dp->dp_meta_objset, obj)); } + if (spa_feature_is_active(dp->dp_spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) + goto out; + } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); @@ -308,15 +329,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - err = dsl_scan_init(dp, txg); + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rw_exit(&dp->dp_config_rwlock); - if (err) - dsl_pool_close(dp); - else - *dpp = dp; - return (err); } @@ -611,7 +627,7 @@ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || - spa_get_dsl(dp->dp_spa) == NULL); + spa_is_initializing(dp->dp_spa)); } uint64_t @@ -932,11 +948,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); - dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, - DMU_OT_NONE, 0, tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, - sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index c2386dd67..297caa067 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -44,6 +45,7 @@ #include #include #include +#include #ifdef _KERNEL #include #endif @@ -379,55 +381,6 @@ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, priority, zio_flags, arc_flags, zb)); } -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { @@ -459,7 +412,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ @@ -614,13 +567,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, /* * We never skip over user/group accounting objects (obj<0) */ - if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* @@ -823,22 +776,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) goto out; - if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { - /* - * For non-user-accounting blocks, we need to read the - * new bp (from a deleted snapshot, found in - * check_existing_xlation). If we used the old bp, - * pointers inside this block from before we resumed - * would be untranslated. - * - * For user-accounting blocks, we need to read the old - * bp, because we will apply the entire space delta to - * it (original untranslated -> translations from - * deleted snap -> now). - */ - *bp_toread = *bp; - } - if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx, &buf) != 0) goto out; @@ -1414,19 +1351,28 @@ out: kmem_free(zc, sizeof(zap_cursor_t)); } -static int -dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +static boolean_t +dsl_scan_free_should_pause(dsl_scan_t *scn) { - dsl_scan_t *scn = arg; uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || - spa_shutting_down(scn->scn_dp->dp_spa)) - return (ERESTART); + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_free_should_pause(scn)) + return (ERESTART); + } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); @@ -1451,6 +1397,10 @@ dsl_scan_active(dsl_scan_t *scn) if (scn->scn_phys.scn_state == DSS_SCANNING) return (B_TRUE); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + return (B_TRUE); + } if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); @@ -1497,14 +1447,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * traversing it. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_cb, scn, tx); + dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + + if (err == 0 && spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, + scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (err != 0) + return; + + /* disable async destroy feature */ + spa_feature_decr(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); + ASSERT(!spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj txg %llu", + "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, @@ -1619,9 +1595,13 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; + zfs_blkstat_t *zb; + + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; + zb = &zab->zab_type[l][t]; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index d4b28cc90..240a683d6 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -446,10 +448,9 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { - sa->sa_layout_attr_obj = zap_create(os, - DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, - &sa->sa_layout_attr_obj, tx) == 0); + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), @@ -1583,10 +1584,9 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) } if (sa->sa_reg_attr_obj == 0) { - sa->sa_reg_attr_obj = zap_create(hdl->sa_os, - DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, - SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); + sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, + sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b610a0dae..244f10d47 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -63,6 +63,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -114,6 +115,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; +static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, @@ -169,6 +171,7 @@ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size; uint64_t alloc; uint64_t space; @@ -216,6 +219,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } + if (pool != NULL) { + dsl_dir_t *freedir = pool->dp_free_dir; + + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, + * when opening pools before this version freedir will be NULL. + */ + if (freedir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + freedir->dd_phys->dd_used_bytes, src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + } + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { @@ -357,25 +376,55 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; + boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - zpool_prop_t prop; - char *propname, *strval; uint64_t intval; - objset_t *os; - char *slash, *check; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); + + switch ((int)prop) { + case ZPROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = EINVAL; + break; + } + + /* + * Sanitize the input. + */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = EINVAL; + break; + } + + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } - propname = nvpair_name(elem); + if (intval != 0) { + error = EINVAL; + break; + } - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) - return (EINVAL); + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = EINVAL; + break; + } + + has_feature = B_TRUE; + break; - switch (prop) { case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && - (intval < spa_version(spa) || intval > SPA_VERSION)) + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) error = EINVAL; break; @@ -412,6 +461,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = nvpair_value_string(elem, &strval); if (!error) { + objset_t *os; uint64_t compress; if (strval == NULL || strval[0] == '\0') { @@ -558,33 +608,58 @@ int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; - nvpair_t *elem; + nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; - zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - if ((prop = zpool_name_to_prop( - nvpair_name(elem))) == ZPROP_INVAL) - return (EINVAL); + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { + uint64_t ver; + + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. */ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task_do(spa_get_dsl(spa), NULL, + spa_sync_version, spa, &ver, 6); + if (error) + return (error); + continue; + } + need_sync = B_TRUE; break; } - if (need_sync) + if (need_sync) { return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); - else - return (0); + spa, nvp, 6)); + } + + return (0); } /* @@ -1628,7 +1703,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; if (error) { - if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_add_64(&sle->sle_meta_count, 1); else @@ -1858,6 +1933,9 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, KM_PUSHPAGE) == 0); } + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); @@ -1891,12 +1969,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, { int error = 0; nvlist_t *nvroot = NULL; + nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; + boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1976,19 +2056,79 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Find the best uberblock. */ - vdev_uberblock_load(NULL, rvd, ub); + vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ - if (ub->ub_txg == 0) + if (ub->ub_txg == 0) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } /* - * If the pool is newer than the code, we can't open it. + * If the pool has an unsupported version we can't open it. */ - if (ub->ub_version > SPA_VERSION) + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL || nvlist_lookup_nvlist(label, + ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + nvlist_free(label); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. + */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + nvpair_t *nvp; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); + nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } /* * If the vdev guid sum doesn't match the uberblock, we have an @@ -2022,7 +2162,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; @@ -2030,6 +2170,84 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (spa_version(spa) >= SPA_VERSION_FEATURES) { + boolean_t missing_feat_read = B_FALSE; + nvlist_t *unsup_feat; + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, + &spa->spa_feat_for_read_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, + &spa->spa_feat_for_write_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, + &spa->spa_feat_desc_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_read = B_TRUE; + + if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (!feature_is_supported(spa->spa_meta_objset, + spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, + unsup_feat)) + missing_feat_write = B_TRUE; + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + } + + nvlist_free(unsup_feat); + + if (!missing_feat_read) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + + /* + * If the state is SPA_LOAD_TRYIMPORT, our objective is + * twofold: to determine whether the pool is available for + * import in read-write mode and (if it is not) whether the + * pool is available for import in read-only mode. If the pool + * is available for import in read-write mode, it is displayed + * as available in userland; if it is not available for import + * in read-only mode, it is displayed as unavailable in + * userland. If the pool is available for import in read-only + * mode but not read-write mode, it is displayed as unavailable + * in userland with a special note that the pool is actually + * available for open in read-only mode. + * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. + */ + if (missing_feat_read || (missing_feat_write && + spa_writeable(spa))) { + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + } + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; @@ -2247,7 +2465,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_free(nvconfig); /* - * Now that we've validate the config, check the state of the + * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ @@ -2260,6 +2478,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } } + if (missing_feat_write) { + ASSERT(state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + } + /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. @@ -2370,10 +2599,18 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } +/* + * If spa_load() fails this function will try loading prior txg's. If + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this + * function will not rewind the pool and will return the same error as + * spa_load(). + */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { + nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; @@ -2402,9 +2639,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, return (load_error); } - /* Price of rolling back is discarding txgs, including log */ - if (state == SPA_LOAD_RECOVER) + if (state == SPA_LOAD_RECOVER) { + /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * If we aren't rolling back save the load info from our first + * import attempt so that we can restore it after attempting + * to rewind. + */ + loadinfo = spa->spa_load_info; + spa->spa_load_info = fnvlist_alloc(); + } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; @@ -2428,7 +2674,20 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); - return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); + if (state == SPA_LOAD_RECOVER) { + ASSERT3P(loadinfo, ==, NULL); + return (rewind_error); + } else { + /* Store the rewind info as part of the initial load info */ + fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, + spa->spa_load_info); + + /* Restore the initial load info */ + fnvlist_free(spa->spa_load_info); + spa->spa_load_info = loadinfo; + + return (load_error); + } } /* @@ -2698,8 +2957,50 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } } +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features) == 0); + nvlist_free(features); +} + int -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +spa_get_stats(const char *name, nvlist_t **config, + char *altroot, size_t buflen) { int error; spa_t *spa; @@ -2734,6 +3035,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); + spa_add_feature_stats(spa, *config); } } @@ -2954,6 +3256,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; + boolean_t has_features; + nvpair_t *elem; int c; /* @@ -2980,10 +3284,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (error); } - if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), - &version) != 0) + has_features = B_FALSE; + for (elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) + has_features = B_TRUE; + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; - ASSERT(version <= SPA_VERSION); + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; @@ -3059,8 +3371,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_l2cache.sav_sync = B_TRUE; } + spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). @@ -3084,6 +3398,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { @@ -3276,7 +3593,7 @@ spa_import_rootpool(char *devpath, char *devid) } #endif if (config == NULL) { - cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (EIO); } @@ -3590,6 +3907,8 @@ spa_tryimport(nvlist_t *tryconfig) state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we @@ -5305,7 +5624,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) * information. This avoids the dbuf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ - bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_PUSHPAGE); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, @@ -5390,6 +5709,24 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +static void +spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + uint64_t version = *(uint64_t *)arg2; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(version <= SPA_VERSION); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Set zpool properties. */ @@ -5399,32 +5736,39 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; nvlist_t *nvp = arg2; - nvpair_t *elem; - uint64_t intval; - char *strval; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; + nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem))) { - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + uint64_t intval; + char *strval, *fname; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + zfeature_info_t *feature; + + prop = zpool_name_to_prop(nvpair_name(elem)); + switch ((int)prop) { + case ZPROP_INVAL: + /* + * We checked this earlier in spa_prop_validate(). + */ + ASSERT(zpool_prop_feature(nvpair_name(elem))); + + fname = strchr(nvpair_name(elem), '@') + 1; + VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); + + spa_feature_enable(spa, feature, tx); + break; + case ZPOOL_PROP_VERSION: + VERIFY(nvpair_value_uint64(elem, &intval) == 0); /* - * Only set version for non-zpool-creation cases - * (set/import). spa_create() needs special care - * for version setting. + * The version is synced seperatly before other + * properties and should be correct by now. */ - if (tx->tx_txg != TXG_INITIAL) { - VERIFY(nvpair_value_uint64(elem, - &intval) == 0); - ASSERT(intval <= SPA_VERSION); - ASSERT(intval >= spa_version(spa)); - spa->spa_uberblock.ub_version = intval; - vdev_config_dirty(spa->spa_root_vdev); - } + ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: @@ -5461,14 +5805,10 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { - VERIFY((spa->spa_pool_props_object = - zap_create(mos, DMU_OT_POOL_PROPS, - DMU_OT_NONE, 0, tx)) > 0); - - VERIFY(zap_update(mos, + spa->spa_pool_props_object = + zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - 8, 1, &spa->spa_pool_props_object, tx) - == 0); + tx); } /* normalize the property name */ @@ -5567,6 +5907,11 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && + spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + spa_feature_create_zap_objects(spa, tx); + } } /* diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index c86884148..09149e622 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -408,6 +409,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + /* + * Store what's necessary for reading the MOS in the label. + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features) == 0); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 4a8e6adfd..440a6addb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -49,6 +49,7 @@ #include #include #include "zfs_prop.h" +#include "zfeature_common.h" /* * SPA locking @@ -217,7 +218,7 @@ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * - * spa_rename() is also implemented within this file since is requires + * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. */ @@ -479,8 +480,22 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); - if (config != NULL) + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } return (spa); } @@ -518,6 +533,7 @@ spa_remove(spa_t *spa) list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); @@ -1025,6 +1041,20 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) * ========================================================================== */ +void +spa_activate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_add_boolean(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + (void) nvlist_remove_all(spa->spa_label_features, feature); + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Rename a spa_t. */ @@ -1175,12 +1205,22 @@ spa_generate_guid(spa_t *spa) void sprintf_blkptr(char *buf, const blkptr_t *bp) { - char *type = NULL; + char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { - type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? + "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } @@ -1252,6 +1292,12 @@ spa_get_dsl(spa_t *spa) return (spa->spa_dsl_pool); } +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1532,6 +1578,7 @@ spa_init(int mode) vdev_cache_stat_init(); zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); spa_config_load(); l2arc_start(); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e374f6d78..9335ba511 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1349,7 +1349,8 @@ vdev_validate(vdev_t *vd, boolean_t strict) uint64_t aux_guid = 0; nvlist_t *nvl; - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == + NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); @@ -1993,14 +1994,14 @@ vdev_validate_aux(vdev_t *vd) if (!vdev_readable(vd)) return (0); - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > SPA_VERSION || + !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7ac23500f..352b630fa 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -121,6 +123,8 @@ * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * @@ -428,8 +432,13 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } +/* + * Returns the configuration from the label of the given vdev. If 'label' is + * VDEV_BEST_LABEL, each label of the vdev will be read until a valid + * configuration is found; otherwise, only the specified label will be read. + */ nvlist_t * -vdev_label_read_config(vdev_t *vd) +vdev_label_read_config(vdev_t *vd, int label) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; @@ -448,6 +457,8 @@ vdev_label_read_config(vdev_t *vd) retry: for (l = 0; l < VDEV_LABELS; l++) { + if (label >= 0 && label < VDEV_LABELS && label != l) + continue; zio = zio_root(spa, NULL, NULL, flags); @@ -497,7 +508,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd)) == NULL) + if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -838,7 +849,7 @@ retry: * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two + * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ @@ -858,47 +869,52 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) return (0); } +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ + int ubl_label; /* Label associated with the above */ +}; + static void vdev_uberblock_load_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = rio->io_private; + struct ubl_cbdata *cbp = rio->io_private; - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev and label in which this + * uberblock was found. We will use this information + * later to obtain the config nvlist associated with + * this uberblock. + */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + cbp->ubl_label = vdev_label_number(vd->vdev_psize, + zio->io_offset); + } mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); } -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; int c, l, n; - if (vd == rvd) { - ASSERT(zio == NULL); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, ubbest, flags); - bzero(ubbest, sizeof (uberblock_t)); - } - - ASSERT(zio != NULL); - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (l = 0; l < VDEV_LABELS; l++) { @@ -911,11 +927,45 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) } } } +} - if (vd == rvd) { - (void) zio_wait(zio); - spa_config_exit(spa, SCL_ALL, FTAG); +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same label as the best uberblock. + */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + int i; + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + if (cb.ubl_vd != NULL) { + for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) { + *config = vdev_label_read_config(cb.ubl_vd, i); + if (*config != NULL) + break; + } } + spa_config_exit(spa, SCL_ALL, FTAG); } /* diff --git a/module/zfs/zap.c b/module/zfs/zap.c index fac54eab0..59f62fa3a 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -946,6 +947,19 @@ fzap_prefetch(zap_name_t *zn) * Helper functions for consumers. */ +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx) == 0); + + return (new_obj); +} + int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index d5b97dac9..3d8cae0d3 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -461,7 +461,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -585,7 +585,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c new file mode 100644 index 000000000..de9d165a9 --- /dev/null +++ b/module/zfs/zfeature.c @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include "zfeature_common.h" +#include + +/* + * ZFS Feature Flags + * ----------------- + * + * ZFS feature flags are used to provide fine-grained versioning to the ZFS + * on-disk format. Once enabled on a pool feature flags replace the old + * spa_version() number. + * + * Each new on-disk format change will be given a uniquely identifying string + * guid rather than a version number. This avoids the problem of different + * organizations creating new on-disk formats with the same version number. To + * keep feature guids unique they should consist of the reverse dns name of the + * organization which implemented the feature and a short name for the feature, + * separated by a colon (e.g. com.delphix:async_destroy). + * + * Reference Counts + * ---------------- + * + * Within each pool features can be in one of three states: disabled, enabled, + * or active. These states are differentiated by a reference count stored on + * disk for each feature: + * + * 1) If there is no reference count stored on disk the feature is disabled. + * 2) If the reference count is 0 a system administrator has enabled the + * feature, but the feature has not been used yet, so no on-disk + * format changes have been made. + * 3) If the reference count is greater than 0 the feature is active. + * The format changes required by the feature are currently on disk. + * Note that if the feature's format changes are reversed the feature + * may choose to set its reference count back to 0. + * + * Feature flags makes no differentiation between non-zero reference counts + * for an active feature (e.g. a reference count of 1 means the same thing as a + * reference count of 27834721), but feature implementations may choose to use + * the reference count to store meaningful information. For example, a new RAID + * implementation might set the reference count to the number of vdevs using + * it. If all those disks are removed from the pool the feature goes back to + * having a reference count of 0. + * + * It is the responsibility of the individual features to maintain a non-zero + * reference count as long as the feature's format changes are present on disk. + * + * Dependencies + * ------------ + * + * Each feature may depend on other features. The only effect of this + * relationship is that when a feature is enabled all of its dependencies are + * automatically enabled as well. Any future work to support disabling of + * features would need to ensure that features cannot be disabled if other + * enabled features depend on them. + * + * On-disk Format + * -------------- + * + * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES + * (5000). In order for this to work the pool is automatically upgraded to + * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk + * format changes will be in use. + * + * Information about features is stored in 3 ZAP objects in the pool's MOS. + * These objects are linked to by the following names in the pool directory + * object: + * + * 1) features_for_read: feature guid -> reference count + * Features needed to open the pool for reading. + * 2) features_for_write: feature guid -> reference count + * Features needed to open the pool for writing. + * 3) feature_descriptions: feature guid -> descriptive string + * A human readable string. + * + * All enabled features appear in either features_for_read or + * features_for_write, but not both. + * + * To open a pool in read-only mode only the features listed in + * features_for_read need to be supported. + * + * To open the pool in read-write mode features in both features_for_read and + * features_for_write need to be supported. + * + * Some features may be required to read the ZAP objects containing feature + * information. To allow software to check for compatibility with these features + * before the pool is opened their names must be stored in the label in a + * new "features_for_read" entry (note that features that are only required + * to write to a pool never need to be stored in the label since the + * features_for_write ZAP object can be read before the pool is written to). + * To save space in the label features must be explicitly marked as needing to + * be written to the label. Also, reference counts are not stored in the label, + * instead any feature whose reference count drops to 0 is removed from the + * label. + * + * Adding New Features + * ------------------- + * + * Features must be registered in zpool_feature_init() function in + * zfeature_common.c using the zfeature_register() function. This function + * has arguments to specify if the feature should be stored in the + * features_for_read or features_for_write ZAP object and if it needs to be + * written to the label when active. + * + * Once a feature is registered it will appear as a "feature@" + * property which can be set by an administrator. Feature implementors should + * use the spa_feature_is_enabled() and spa_feature_is_active() functions to + * query the state of a feature and the spa_feature_incr() and + * spa_feature_decr() functions to change an enabled feature's reference count. + * Reference counts may only be updated in the syncing context. + * + * Features may not perform enable-time initialization. Instead, any such + * initialization should occur when the feature is first used. This design + * enforces that on-disk changes be made only when features are used. Code + * should only check if a feature is enabled using spa_feature_is_enabled(), + * not by relying on any feature specific metadata existing. If a feature is + * enabled, but the feature's metadata is not on disk yet then it should be + * created as needed. + * + * As an example, consider the com.delphix:async_destroy feature. This feature + * relies on the existence of a bptree in the MOS that store blocks for + * asynchronous freeing. This bptree is not created when async_destroy is + * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is + * called to check if async_destroy is enabled. If it is and the bptree object + * does not exist yet, the bptree object is created as part of the dataset + * destroy and async_destroy's reference count is incremented to indicate it + * has made an on-disk format change. Later, after the destroyed dataset's + * blocks have all been asynchronously freed there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_ENABLE, + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the features active in the specified object are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. + */ +boolean_t +feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, + nvlist_t *unsup_feat) +{ + boolean_t supported; + zap_cursor_t *zc; + zap_attribute_t *za; + char *buf; + + zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); + buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + supported = B_TRUE; + for (zap_cursor_init(zc, os, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + ASSERT(za->za_integer_length == sizeof (uint64_t) && + za->za_num_integers == 1); + + if (za->za_first_integer != 0 && + !zfeature_is_supported(za->za_name)) { + supported = B_FALSE; + + if (unsup_feat != NULL) { + char *desc = ""; + + if (zap_lookup(os, desc_obj, za->za_name, + 1, sizeof (buf), buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, + za->za_name, desc) == 0); + } + } + } + zap_cursor_fini(zc); + + kmem_free(buf, MAXPATHLEN); + kmem_free(za, sizeof(zap_attribute_t)); + kmem_free(zc, sizeof(zap_cursor_t)); + + return (supported); +} + +static int +feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, + zfeature_info_t *feature, uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + + err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1, + &refcount); + if (err != 0) { + if (err == ENOENT) + return (ENOTSUP); + else + return (err); + } + *res = refcount; + return (0); +} + +static int +feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, + uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action, + dmu_tx_t *tx) +{ + int error; + uint64_t refcount; + uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + error = zap_lookup(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount); + + /* + * If we can't ascertain the status of the specified feature, an I/O + * error occurred. + */ + if (error != 0 && error != ENOENT) + return (error); + + switch (action) { + case FEATURE_ACTION_ENABLE: + /* + * If the feature is already enabled, ignore the request. + */ + if (error == 0) + return (0); + refcount = 0; + break; + case FEATURE_ACTION_INCR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == UINT64_MAX) + return (EOVERFLOW); + refcount++; + break; + case FEATURE_ACTION_DECR: + if (error == ENOENT) + return (ENOTSUP); + if (refcount == 0) + return (EOVERFLOW); + refcount--; + break; + default: + ASSERT(0); + break; + } + + if (action == FEATURE_ACTION_ENABLE) { + int i; + + for (i = 0; feature->fi_depends[i] != NULL; i++) { + zfeature_info_t *dep = feature->fi_depends[i]; + + error = feature_do_action(os, read_obj, write_obj, + desc_obj, dep, FEATURE_ACTION_ENABLE, tx); + if (error != 0) + return (error); + } + } + + error = zap_update(os, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx); + if (error != 0) + return (error); + + if (action == FEATURE_ACTION_ENABLE) { + error = zap_update(os, desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx); + if (error != 0) + return (error); + } + + if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) { + spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid); + } + + if (action == FEATURE_ACTION_DECR && refcount == 0) { + spa_deactivate_mos_feature(dmu_objset_spa(os), + feature->fi_guid); + } + + return (0); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. + */ + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && + tx->tx_txg == TXG_INITIAL)); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. + */ +void +spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx)); +} + +/* + * If the specified feature has not yet been enabled, this function returns + * ENOTSUP; otherwise, this function increments the feature's refcount (or + * returns EOVERFLOW if the refcount cannot be incremented). This function must + * be called from syncing context. + */ +void +spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx)); +} + +/* + * If the specified feature has not yet been enabled, this function returns + * ENOTSUP; otherwise, this function decrements the feature's refcount (or + * returns EOVERFLOW if the refcount is already 0). This function must + * be called from syncing context. + */ +void +spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx)); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount = 0; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, zfeature_info_t *feature) +{ + int err; + uint64_t refcount = 0; + + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa->spa_meta_objset, + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj, + feature, &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c new file mode 100644 index 000000000..33d15133e --- /dev/null +++ b/module/zfs/zfeature_common.c @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifdef _KERNEL +#include +#else +#include +#include +#endif +#include +#include +#include +#include +#include "zfeature_common.h" + +/* + * Set to disable all feature checks while opening pools, allowing pools with + * unsupported features to be opened. Set for testing only. + */ +boolean_t zfeature_checks_disable = B_FALSE; + +zfeature_info_t spa_feature_table[SPA_FEATURES]; + +/* + * Valid characters for feature guids. This list is mainly for aesthetic + * purposes and could be expanded in the future. There are different allowed + * characters in the guids reverse dns portion (before the colon) and its + * short name (after the colon). + */ +static int +valid_char(char c, boolean_t after_colon) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == (after_colon ? '_' : '.')); +} + +/* + * Every feature guid must contain exactly one colon which separates a reverse + * dns organization name from the feature's "short" name (e.g. + * "com.company:feature_name"). + */ +boolean_t +zfeature_is_valid_guid(const char *name) +{ + int i; + boolean_t has_colon = B_FALSE; + + i = 0; + while (name[i] != '\0') { + char c = name[i++]; + if (c == ':') { + if (has_colon) + return (B_FALSE); + has_colon = B_TRUE; + continue; + } + if (!valid_char(c, has_colon)) + return (B_FALSE); + } + + return (has_colon); +} + +boolean_t +zfeature_is_supported(const char *guid) +{ + if (zfeature_checks_disable) + return (B_TRUE); + + return (0 == zfeature_lookup_guid(guid, NULL)); +} + +int +zfeature_lookup_guid(const char *guid, zfeature_info_t **res) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(guid, feature->fi_guid) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +int +zfeature_lookup_name(const char *name, zfeature_info_t **res) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(name, feature->fi_uname) == 0) { + if (res != NULL) + *res = feature; + return (0); + } + } + + return (ENOENT); +} + +static void +zfeature_register(int fid, const char *guid, const char *name, const char *desc, + boolean_t readonly, boolean_t mos, zfeature_info_t **deps) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + static zfeature_info_t *nodeps[] = { NULL }; + + ASSERT(name != NULL); + ASSERT(desc != NULL); + ASSERT(!readonly || !mos); + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(zfeature_is_valid_guid(guid)); + + if (deps == NULL) + deps = nodeps; + + feature->fi_guid = guid; + feature->fi_uname = name; + feature->fi_desc = desc; + feature->fi_can_readonly = readonly; + feature->fi_mos = mos; + feature->fi_depends = deps; +} + +void +zpool_feature_init(void) +{ + zfeature_register(SPA_FEATURE_ASYNC_DESTROY, + "com.delphix:async_destroy", "async_destroy", + "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); +} diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index c609203ea..98b1dd794 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska @@ -1107,6 +1108,8 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) /* * Find a zfs_sb_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all inode ops from running. */ static int zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer) @@ -1170,7 +1173,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { error = EINVAL; goto pool_props_bad; } @@ -1297,6 +1300,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { @@ -1398,7 +1410,8 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (EINVAL); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index bfb817b78..66f228bc7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -704,7 +704,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - zp->zp_type < DMU_OT_NUMTYPES && + DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa) && @@ -988,7 +988,7 @@ zio_read_bp_init(zio_t *zio) zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } - if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) @@ -3131,6 +3131,48 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_done }; +/* dnp is the dnode for zb1->zb_object */ +boolean_t +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + #if defined(_KERNEL) && defined(HAVE_SPL) /* Fault injection */ EXPORT_SYMBOL(zio_injection_enabled); -- cgit v1.2.3 From 3bc7e0fb0f3904eaf41e0b9768ebe2d042ae98aa Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 14 Dec 2012 12:38:04 -0800 Subject: Illumos #3090 and #3102 3090 vdev_reopen() during reguid causes vdev to be treated as corrupt 3102 vdev_uberblock_load() and vdev_validate() may read the wrong label Reviewed by: Matthew Ahrens Reviewed by: Christopher Siden Reviewed by: Garrett D'Amore Approved by: Eric Schrock References: illumos/illumos-gate@dfbb943217bf8ab22a1a9d2e9dca01d4da95ee0b illumos changeset: 13777:b1e53580146d https://www.illumos.org/issues/3090 https://www.illumos.org/issues/3102 Ported-by: Brian Behlendorf Closes #939 --- cmd/ztest/ztest.c | 23 +++++++++++-- include/sys/fs/zfs.h | 1 + include/sys/spa_impl.h | 1 + include/sys/vdev.h | 2 +- lib/libzfs/libzfs_import.c | 85 +++++++++++++++++++++------------------------- module/zfs/spa.c | 76 +++++++++++++++++++++++++++++++---------- module/zfs/spa_misc.c | 17 ++++++++-- module/zfs/vdev.c | 8 ++--- module/zfs/vdev_label.c | 75 +++++++++++++++++++++++++--------------- 9 files changed, 188 insertions(+), 100 deletions(-) (limited to 'module') diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0dace4961..9b7adc726 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -365,7 +365,7 @@ ztest_info_t ztest_info[] = { { ztest_spa_rename, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_rarely }, + { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime }, @@ -416,6 +416,13 @@ static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; + +/* + * The ztest_name_lock protects the pool and dataset namespace used by + * the individual tests. To modify the namespace, consumers must grab + * this lock as writer. Grabbing the lock as reader will ensure that the + * namespace does not change while the lock is held. + */ static krwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; @@ -5034,10 +5041,16 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; uint64_t orig, load; + int error; orig = spa_guid(spa); load = spa_load_guid(spa); - if (spa_change_guid(spa) != 0) + + (void) rw_enter(&ztest_name_lock, RW_WRITER); + error = spa_change_guid(spa); + (void) rw_exit(&ztest_name_lock); + + if (error != 0) return; if (ztest_opts.zo_verbose >= 3) { @@ -5732,6 +5745,12 @@ ztest_freeze(void) VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY3U(0, ==, ztest_dataset_open(0)); ztest_dataset_close(0); + + spa->spa_debug = B_TRUE; + ztest_spa = spa; + txg_wait_synced(spa_get_dsl(spa), 0); + ztest_reguid(NULL, 0); + spa_close(spa, FTAG); kernel_fini(); } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 61596f7d7..800eb7768 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -954,6 +954,7 @@ typedef enum history_internal_events { LOG_DS_USER_HOLD, LOG_DS_USER_RELEASE, LOG_POOL_SPLIT, + LOG_POOL_GUID_CHANGE, LOG_END } history_internal_events_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 85a825d08..65edc97f5 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -141,6 +141,7 @@ struct spa { vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ + uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ spa_aux_vdev_t spa_spares; /* hot spares */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 51eb855ee..8f297a917 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -141,7 +141,7 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd, int label); +extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); typedef enum { diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index ad343e896..22e46b432 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -434,8 +434,8 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) uint_t i, nspares, nl2cache; boolean_t config_seen; uint64_t best_txg; - char *name, *hostname, *comment; - uint64_t version, guid; + char *name, *hostname = NULL; + uint64_t guid; uint_t children = 0; nvlist_t **child = NULL; uint_t holes; @@ -521,61 +521,54 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) * configuration: * * version - * pool guid - * name + * pool guid + * name + * pool txg (if available) * comment (if available) - * pool state + * pool state * hostid (if available) * hostname (if available) */ - uint64_t state; + uint64_t state, version, pool_txg; + char *comment = NULL; + + version = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_VERSION, version); + guid = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid); + name = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name); - verify(nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_VERSION, &version) == 0); - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_VERSION, version) != 0) - goto nomem; - verify(nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_GUID, &guid) == 0); - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_GUID, guid) != 0) - goto nomem; - verify(nvlist_lookup_string(tmp, - ZPOOL_CONFIG_POOL_NAME, &name) == 0); - if (nvlist_add_string(config, - ZPOOL_CONFIG_POOL_NAME, name) != 0) - goto nomem; + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_TXG, &pool_txg) == 0) + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_TXG, pool_txg); - /* - * COMMENT is optional, don't bail if it's not - * there, instead, set it to NULL. - */ if (nvlist_lookup_string(tmp, - ZPOOL_CONFIG_COMMENT, &comment) != 0) - comment = NULL; - else if (nvlist_add_string(config, - ZPOOL_CONFIG_COMMENT, comment) != 0) - goto nomem; + ZPOOL_CONFIG_COMMENT, &comment) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment); - verify(nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_STATE, &state) == 0); - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_STATE, state) != 0) - goto nomem; + state = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state); hostid = 0; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_HOSTID, hostid) != 0) - goto nomem; - verify(nvlist_lookup_string(tmp, - ZPOOL_CONFIG_HOSTNAME, - &hostname) == 0); - if (nvlist_add_string(config, - ZPOOL_CONFIG_HOSTNAME, - hostname) != 0) - goto nomem; + fnvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid); + hostname = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, hostname); } config_seen = B_TRUE; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 244f10d47..59c43f40c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -117,6 +117,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; +static dsl_checkfunc_t spa_change_guid_check; +static dsl_syncfunc_t spa_change_guid_sync; static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, @@ -676,6 +678,47 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) } } +/*ARGSUSED*/ +static int +spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t vdev_state; + ASSERTV(uint64_t *newguid = arg2); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_state = rvd->vdev_state; + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vdev_state != VDEV_STATE_HEALTHY) + return (ENXIO); + + ASSERT3U(spa_guid(spa), !=, *newguid); + + return (0); +} + +static void +spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + uint64_t *newguid = arg2; + uint64_t oldguid; + vdev_t *rvd = spa->spa_root_vdev; + + oldguid = spa_guid(spa); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + rvd->vdev_guid = *newguid; + rvd->vdev_guid_sum += (*newguid - oldguid); + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx, + "old=%lld new=%lld", oldguid, *newguid); +} + /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify @@ -688,29 +731,23 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) int spa_change_guid(spa_t *spa) { - uint64_t oldguid, newguid; - uint64_t txg; - - if (!(spa_mode_global & FWRITE)) - return (EROFS); - - txg = spa_vdev_enter(spa); - - if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) - return (spa_vdev_exit(spa, NULL, txg, ENXIO)); + int error; + uint64_t guid; - oldguid = spa_guid(spa); - newguid = spa_generate_guid(NULL); - ASSERT3U(oldguid, !=, newguid); + mutex_enter(&spa_namespace_lock); + guid = spa_generate_guid(NULL); - spa->spa_root_vdev->vdev_guid = newguid; - spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); + error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, + spa_change_guid_sync, spa, &guid, 5); - vdev_config_dirty(spa->spa_root_vdev); + if (error == 0) { + spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID); + } - spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID); + mutex_exit(&spa_namespace_lock); - return (spa_vdev_exit(spa, NULL, txg, 0)); + return (error); } /* @@ -6083,6 +6120,9 @@ spa_sync(spa_t *spa, uint64_t txg) rvd->vdev_children, txg, B_TRUE); } + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 440a6addb..5ec8e688e 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1334,16 +1334,29 @@ spa_name(spa_t *spa) uint64_t spa_guid(spa_t *spa) { + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t guid; + /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ - if (spa->spa_root_vdev != NULL) + if (spa->spa_root_vdev == NULL) + return (spa->spa_config_guid); + + guid = spa->spa_last_synced_guid != 0 ? + spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; + + /* + * Return the most recently synced out guid unless we're + * in syncing context. + */ + if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else - return (spa->spa_config_guid); + return (guid); } uint64_t diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 9335ba511..b96975103 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1348,9 +1348,9 @@ vdev_validate(vdev_t *vd, boolean_t strict) if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; + uint64_t txg = strict ? spa->spa_config_txg : -1ULL; - if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == - NULL) { + if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); @@ -1533,7 +1533,7 @@ vdev_reopen(vdev_t *vd) !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { - (void) vdev_validate(vd, B_TRUE); + (void) vdev_validate(vd, spa_last_synced_txg(spa)); } /* @@ -1994,7 +1994,7 @@ vdev_validate_aux(vdev_t *vd) if (!vdev_readable(vd)) return (0); - if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) { + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 352b630fa..1fe36feb2 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -433,17 +433,22 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) } /* - * Returns the configuration from the label of the given vdev. If 'label' is - * VDEV_BEST_LABEL, each label of the vdev will be read until a valid - * configuration is found; otherwise, only the specified label will be read. + * Returns the configuration from the label of the given vdev. For vdevs + * which don't have a txg value stored on their label (i.e. spares/cache) + * or have not been completely initialized (txg = 0) just return + * the configuration from the first valid label we find. Otherwise, + * find the most up-to-date label that does not exceed the specified + * 'txg' value. */ nvlist_t * -vdev_label_read_config(vdev_t *vd, int label) +vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; + uint64_t best_txg = 0; + int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; int l; @@ -457,8 +462,7 @@ vdev_label_read_config(vdev_t *vd, int label) retry: for (l = 0; l < VDEV_LABELS; l++) { - if (label >= 0 && label < VDEV_LABELS && label != l) - continue; + nvlist_t *label = NULL; zio = zio_root(spa, NULL, NULL, flags); @@ -468,12 +472,31 @@ retry: if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &config, 0) == 0) - break; + &label, 0) == 0) { + uint64_t label_txg = 0; + + /* + * Auxiliary vdevs won't have txg values in their + * labels and newly added vdevs may not have been + * completely initialized so just return the + * configuration from the first valid label we + * encounter. + */ + error = nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_TXG, &label_txg); + if ((error || label_txg == 0) && !config) { + config = label; + break; + } else if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + nvlist_free(config); + config = fnvlist_dup(label); + } + } - if (config != NULL) { - nvlist_free(config); - config = NULL; + if (label != NULL) { + nvlist_free(label); + label = NULL; } } @@ -508,7 +531,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -872,7 +895,6 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) struct ubl_cbdata { uberblock_t *ubl_ubbest; /* Best uberblock */ vdev_t *ubl_vd; /* vdev associated with the above */ - int ubl_label; /* Label associated with the above */ }; static void @@ -891,15 +913,13 @@ vdev_uberblock_load_done(zio_t *zio) if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* - * Keep track of the vdev and label in which this - * uberblock was found. We will use this information - * later to obtain the config nvlist associated with + * Keep track of the vdev in which this uberblock + * was found. We will use this information later + * to obtain the config nvlist associated with * this uberblock. */ *cbp->ubl_ubbest = *ub; cbp->ubl_vd = vd; - cbp->ubl_label = vdev_label_number(vd->vdev_psize, - zio->io_offset); } mutex_exit(&rio->io_lock); } @@ -933,12 +953,11 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, * Reads the 'best' uberblock from disk along with its associated * configuration. First, we read the uberblock array of each label of each * vdev, keeping track of the uberblock with the highest txg in each array. - * Then, we read the configuration from the same label as the best uberblock. + * Then, we read the configuration from the same vdev as the best uberblock. */ void vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) { - int i; zio_t *zio; spa_t *spa = rvd->vdev_spa; struct ubl_cbdata cb; @@ -958,13 +977,15 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) zio = zio_root(spa, NULL, &cb, flags); vdev_uberblock_load_impl(zio, rvd, flags, &cb); (void) zio_wait(zio); - if (cb.ubl_vd != NULL) { - for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) { - *config = vdev_label_read_config(cb.ubl_vd, i); - if (*config != NULL) - break; - } - } + + /* + * It's possible that the best uberblock was discovered on a label + * that has a configuration which was written in a future txg. + * Search all labels on this vdev to find the configuration that + * matches the txg for our uberblock. + */ + if (cb.ubl_vd != NULL) + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); spa_config_exit(spa, SCL_ALL, FTAG); } -- cgit v1.2.3 From b9b24bb4ca45f2d903efadba44d10dfd182f62ac Mon Sep 17 00:00:00 2001 From: Christopher Siden Date: Fri, 14 Dec 2012 15:00:45 -0800 Subject: Illumos #2762: zpool command should have better support for feature flags 2762 zpool command should have better support for feature flags Reviewed by: Matthew Ahrens Reviewed by: George Wilson Approved by: Eric Schrock References: illumos/illumos-gate@57221772c3fc05faba04bf48ddff45abf2bbf2bd https://www.illumos.org/issues/2762 Ported-by: Brian Behlendorf --- cmd/zpool/zpool_main.c | 407 ++++++++++++++++++++++++++++++++++++--------- include/libzfs.h | 3 +- include/sys/fs/zfs.h | 1 + include/sys/zfeature.h | 2 +- lib/libzfs/libzfs_status.c | 25 +++ man/man5/zpool-features.5 | 5 +- man/man8/zpool.8 | 14 +- module/zfs/spa.c | 21 ++- module/zfs/zfeature.c | 9 +- 9 files changed, 388 insertions(+), 99 deletions(-) (limited to 'module') diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d0c0a923f..8da4620c0 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -391,6 +391,18 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, } } +static boolean_t +prop_list_contains_feature(nvlist_t *proplist) +{ + nvpair_t *nvp; + for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; + nvp = nvlist_next_nvpair(proplist, nvp)) { + if (zpool_prop_feature(nvpair_name(nvp))) + return (B_TRUE); + } + return (B_FALSE); +} + /* * Add a property pair (name, string-value) into a property nvlist. */ @@ -414,12 +426,30 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, proplist = *props; if (poolprop) { + const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } + + /* + * feature@ properties and version should not be specified + * at the same time. + */ + if ((prop == ZPROP_INVAL && zpool_prop_feature(propname) && + nvlist_exists(proplist, vname)) || + (prop == ZPOOL_PROP_VERSION && + prop_list_contains_feature(proplist))) { + (void) fprintf(stderr, gettext("'feature@' and " + "'version' properties cannot be specified " + "together\n")); + return (2); + } + + if (zpool_prop_feature(propname)) normnm = propname; else @@ -1482,8 +1512,8 @@ show_import(nvlist_t *config) break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext(" status: The pool is formatted using an " - "older on-disk version.\n")); + (void) printf(gettext(" status: The pool is formatted using a " + "legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: @@ -1491,6 +1521,11 @@ show_import(nvlist_t *config) "incompatible version.\n")); break; + case ZPOOL_STATUS_FEAT_DISABLED: + (void) printf(gettext(" status: Some supported features are " + "not enabled on the pool.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool uses the following " "feature(s) not supported on this sytem:\n")); @@ -1537,19 +1572,21 @@ show_import(nvlist_t *config) * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { - if (reason == ZPOOL_STATUS_VERSION_OLDER) + if (reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); - else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) + } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); - else + } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); + } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " @@ -4042,12 +4079,13 @@ status_callback(zpool_handle_t *zhp, void *data) break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext("status: The pool is formatted using an " - "older on-disk format. The pool can\n\tstill be used, but " - "some features are unavailable.\n")); + (void) printf(gettext("status: The pool is formatted using a " + "legacy on-disk format. The pool can\n\tstill be used, " + "but some features are unavailable.\n")); (void) printf(gettext("action: Upgrade the pool using 'zpool " "upgrade'. Once this is done, the\n\tpool will no longer " - "be accessible on older software versions.\n")); + "be accessible on software that does not support feature\n" + "\tflags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: @@ -4059,6 +4097,16 @@ status_callback(zpool_handle_t *zhp, void *data) "backup.\n")); break; + case ZPOOL_STATUS_FEAT_DISABLED: + (void) printf(gettext("status: Some supported features are not " + "enabled on the pool. The pool can\n\tstill be used, but " + "some features are unavailable.\n")); + (void) printf(gettext("action: Enable all features using " + "'zpool upgrade'. Once this is done,\n\tthe pool may no " + "longer be accessible by software that does not support\n\t" + "the features. See zpool-features(5) for details.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool cannot be accessed on " "this system because it uses the\n\tfollowing feature(s) " @@ -4288,58 +4336,162 @@ zpool_do_status(int argc, char **argv) } typedef struct upgrade_cbdata { - int cb_all; int cb_first; - int cb_newer; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; +static int +upgrade_version(zpool_handle_t *zhp, uint64_t version) +{ + int ret; + nvlist_t *config; + uint64_t oldversion; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &oldversion) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(oldversion)); + assert(oldversion < version); + + ret = zpool_upgrade(zhp, version); + if (ret != 0) + return (ret); + + if (version >= SPA_VERSION_FEATURES) { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to feature flags.\n"), + zpool_get_name(zhp), (u_longlong_t) oldversion); + } else { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to version %llu.\n"), + zpool_get_name(zhp), (u_longlong_t) oldversion, + (u_longlong_t) version); + } + + return (0); +} + +static int +upgrade_enable_all(zpool_handle_t *zhp, int *countp) +{ + int i, ret, count; + boolean_t firstff = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + count = 0; + for (i = 0; i < SPA_FEATURES; i++) { + const char *fname = spa_feature_table[i].fi_uname; + const char *fguid = spa_feature_table[i].fi_guid; + if (!nvlist_exists(enabled, fguid)) { + char *propname; + verify(-1 != asprintf(&propname, "feature@%s", fname)); + ret = zpool_set_prop(zhp, propname, + ZFS_FEATURE_ENABLED); + if (ret != 0) { + free(propname); + return (ret); + } + count++; + + if (firstff) { + (void) printf(gettext("Enabled the " + "following features on '%s':\n"), + zpool_get_name(zhp)); + firstff = B_FALSE; + } + (void) printf(gettext(" %s\n"), fname); + free(propname); + } + } + + if (countp != NULL) + *countp = count; + return (0); +} + static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; - int ret = 0; + boolean_t printnl = B_FALSE; + int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (!cbp->cb_newer && SPA_VERSION_IS_SUPPORTED(version) && - version != SPA_VERSION) { - if (!cbp->cb_all) { - if (cbp->cb_first) { - (void) printf(gettext("The following pools are " - "out of date, and can be upgraded. After " - "being\nupgraded, these pools will no " - "longer be accessible by older software " - "versions.\n\n")); - (void) printf(gettext("VER POOL\n")); - (void) printf(gettext("--- ------------\n")); - cbp->cb_first = B_FALSE; - } + assert(SPA_VERSION_IS_SUPPORTED(version)); - (void) printf("%2llu %s\n", (u_longlong_t)version, - zpool_get_name(zhp)); - } else { + if (version < cbp->cb_version) { + cbp->cb_first = B_FALSE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); + printnl = B_TRUE; + +#if 0 + /* + * XXX: This code can be enabled when Illumos commit + * 4445fffbbb1ea25fd0e9ea68b9380dd7a6709025 is merged. + * It reworks the history logging among other things. + */ + + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). + */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; +#endif + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count > 0) { cbp->cb_first = B_FALSE; - ret = zpool_upgrade(zhp, cbp->cb_version); - if (!ret) { - (void) printf(gettext("Successfully upgraded " - "'%s'\n\n"), zpool_get_name(zhp)); - } + printnl = B_TRUE; } - } else if (cbp->cb_newer && !SPA_VERSION_IS_SUPPORTED(version)) { - assert(!cbp->cb_all); + } + + if (printnl) { + (void) printf(gettext("\n")); + } + + return (0); +} +static int +upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(version)); + + if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " - "formatted using an unsupported software version " - "and\ncannot be accessed on the current " - "system.\n\n")); + "formatted with legacy version numbers and can\n" + "be upgraded to use feature flags. After " + "being upgraded, these pools\nwill no " + "longer be accessible by software that does not " + "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; @@ -4349,14 +4501,65 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) zpool_get_name(zhp)); } - zpool_close(zhp); - return (ret); + return (0); +} + +static int +upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if (version >= SPA_VERSION_FEATURES) { + int i; + boolean_t poolfirst = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + for (i = 0; i < SPA_FEATURES; i++) { + const char *fguid = spa_feature_table[i].fi_guid; + const char *fname = spa_feature_table[i].fi_uname; + if (!nvlist_exists(enabled, fguid)) { + if (cbp->cb_first) { + (void) printf(gettext("\nSome " + "supported features are not " + "enabled on the following pools. " + "Once a\nfeature is enabled the " + "pool may become incompatible with " + "software\nthat does not support " + "the feature. See " + "zpool-features(5) for " + "details.\n\n")); + (void) printf(gettext("POOL " + "FEATURE\n")); + (void) printf(gettext("------" + "---------\n")); + cbp->cb_first = B_FALSE; + } + + if (poolfirst) { + (void) printf(gettext("%s\n"), + zpool_get_name(zhp)); + poolfirst = B_FALSE; + } + + (void) printf(gettext(" %s\n"), fname); + } + } + } + + return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { + boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; @@ -4371,26 +4574,45 @@ upgrade_one(zpool_handle_t *zhp, void *data) cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " - "using more current version '%llu'.\n"), + "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t) cur_version); return (0); } - if (cur_version == cbp->cb_version) { + + if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " - "using the current version.\n"), zpool_get_name(zhp)); + "using version %llu.\n\n"), zpool_get_name(zhp), + (u_longlong_t) cbp->cb_version); return (0); } - ret = zpool_upgrade(zhp, cbp->cb_version); + if (cur_version != cbp->cb_version) { + printnl = B_TRUE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); + } - if (!ret) { - (void) printf(gettext("Successfully upgraded '%s' " - "from version %llu to version %llu\n\n"), - zpool_get_name(zhp), (u_longlong_t)cur_version, - (u_longlong_t)cbp->cb_version); + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count = 0; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count != 0) { + printnl = B_TRUE; + } else if (cur_version == SPA_VERSION) { + (void) printf(gettext("Pool '%s' already has all " + "supported features enabled.\n"), + zpool_get_name(zhp)); + } + } + + if (printnl) { + (void) printf(gettext("\n")); } - return (ret != 0); + return (0); } /* @@ -4409,6 +4631,7 @@ zpool_do_upgrade(int argc, char **argv) upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; + boolean_t upgradeall = B_FALSE; char *end; @@ -4416,7 +4639,7 @@ zpool_do_upgrade(int argc, char **argv) while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': - cb.cb_all = B_TRUE; + upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; @@ -4449,19 +4672,19 @@ zpool_do_upgrade(int argc, char **argv) if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; - } else if (!cb.cb_all && argc == 0) { + } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { - if (cb.cb_all || argc != 0) { + if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } - } else if (cb.cb_all) { + } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); @@ -4471,9 +4694,25 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); - cb.cb_first = B_TRUE; if (showversions) { - (void) printf(gettext("The following versions are " + int i; + + (void) printf(gettext("The following features are " + "supported:\n\n")); + (void) printf(gettext("FEAT DESCRIPTION\n")); + (void) printf("----------------------------------------------" + "---------------\n"); + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + const char *ro = fi->fi_can_readonly ? + " (read-only compatible)" : ""; + + (void) printf("%-37s%s\n", fi->fi_uname, ro); + (void) printf(" %s\n", fi->fi_desc); + } + (void) printf("\n"); + + (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" @@ -4516,32 +4755,44 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); - } else if (argc == 0) { - int notfound; - + } else if (argc == 0 && upgradeall) { + cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); - notfound = cb.cb_first; - - if (!cb.cb_all && ret == 0) { - if (!cb.cb_first) - (void) printf("\n"); - cb.cb_first = B_TRUE; - cb.cb_newer = B_TRUE; - ret = zpool_iter(g_zfs, upgrade_cb, &cb); - if (!cb.cb_first) { - notfound = B_FALSE; - (void) printf("\n"); + if (ret == 0 && cb.cb_first) { + if (cb.cb_version == SPA_VERSION) { + (void) printf(gettext("All pools are already " + "formatted using feature flags.\n\n")); + (void) printf(gettext("Every feature flags " + "pool already has all supported features " + "enabled.\n")); + } else { + (void) printf(gettext("All pools are already " + "formatted with version %llu or higher.\n"), + (u_longlong_t) cb.cb_version); } } + } else if (argc == 0) { + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); + assert(ret == 0); - if (ret == 0) { - if (notfound) - (void) printf(gettext("All pools are formatted " - "using this version.\n")); - else if (!cb.cb_all) - (void) printf(gettext("Use 'zpool upgrade -v' " - "for a list of available versions and " - "their associated\nfeatures.\n")); + if (cb.cb_first) { + (void) printf(gettext("All pools are formatted " + "using feature flags.\n\n")); + } else { + (void) printf(gettext("\nUse 'zpool upgrade -v' " + "for a list of available legacy versions.\n")); + } + + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("Every feature flags pool has " + "all supported features enabled.\n")); + } else { + (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, diff --git a/include/libzfs.h b/include/libzfs.h index 83ac34394..08f3d9ebd 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -322,7 +322,8 @@ typedef enum { * requiring administrative attention. There is no corresponding * message ID. */ - ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ + ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ + ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 800eb7768..137dd39e6 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -524,6 +524,7 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h index 9ff1c93df..481e85b1b 100644 --- a/include/sys/zfeature.h +++ b/include/sys/zfeature.h @@ -35,7 +35,7 @@ extern "C" { #endif extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, - uint64_t desc_obj, nvlist_t *unsup_feat); + uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat); struct spa; extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index ef6a64133..e6e923090 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -44,6 +44,7 @@ #include #include #include "libzfs_impl.h" +#include "zfeature_common.h" /* * Message ID table. This must be kept in sync with the ZPOOL_STATUS_* defines @@ -320,6 +321,30 @@ check_status(nvlist_t *config, boolean_t isimport) if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) return (ZPOOL_STATUS_VERSION_OLDER); + /* + * Usable pool with disabled features + */ + if (version >= SPA_VERSION_FEATURES) { + int i; + nvlist_t *feat; + + if (isimport) { + feat = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO); + feat = fnvlist_lookup_nvlist(feat, + ZPOOL_CONFIG_ENABLED_FEAT); + } else { + feat = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_FEATURE_STATS); + } + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + if (!nvlist_exists(feat, fi->fi_guid)) + return (ZPOOL_STATUS_FEAT_DISABLED); + } + } + return (ZPOOL_STATUS_OK); } diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 453e1ad2a..3c1930c35 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -20,8 +20,9 @@ zpool\-features \- ZFS pool feature descriptions .LP ZFS pool on\-disk format versions are specified via "features" which replace the old on\-disk format numbers (the last supported on\-disk format number is -28). To enable a feature on a pool use the \fBzpool\fR(1M) command to set -the \fBfeature@\fR\fIfeature_name\fR property to \fBenabled\fR. +28). To enable a feature on a pool use the \fBupgrade\fR subcommand of the +\fBzpool\fR(1M) command, or set the \fBfeature@\fR\fIfeature_name\fR property +to \fBenabled\fR. .sp .LP The pool format does not affect file system version compatibility or the ability diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 8c110511a..d5f61cfed 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -1628,7 +1628,7 @@ Displays verbose data error information, printing out a complete list of all dat .ad .sp .6 .RS 4n -Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with a more recent version are also displayed, although these pools will be inaccessible on the system. +Displays pools which do not have all supported features enabled and pools formatted using a legacy ZFS version number. These pools can continue to be used, but some features may not be available. Use "\fBzpool upgrade -a\fR" to enable all features on all pools. .RE .sp @@ -1639,7 +1639,7 @@ Displays all pools formatted using a different \fBZFS\fR on-disk version. Older .ad .sp .6 .RS 4n -Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version. +Displays legacy \fBZFS\fR versions supported by the current software. See \fBzfs-features\fR(5) for a description of feature flags features supported by the current software. .RE .sp @@ -1650,7 +1650,7 @@ Displays \fBZFS\fR versions supported by the current software. The current \fBZF .ad .sp .6 .RS 4n -Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software. +Enables all supported features on the given pool. Once this is done, the pool will no longer be accessible on systems that do not support feature flags. See \fBzfs-features\fR(5) for details on compatability with systems that support feature flags, but do not support all features enabled on the pool. .sp .ne 2 .mk @@ -1658,8 +1658,8 @@ Upgrades the given pool to the latest on-disk version. Once this is done, the po \fB\fB-a\fR\fR .ad .RS 14n -.rt -Upgrades all pools. +.rt +Enables all supported features on all pools. .RE .sp @@ -1669,8 +1669,8 @@ Upgrades all pools. \fB\fB-V\fR \fIversion\fR\fR .ad .RS 14n -.rt -Upgrade to the specified version. If the \fB-V\fR flag is not specified, the pool is upgraded to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software. +.rt +Upgrade to the specified legacy version. If the \fB-V\fR flag is specified, no features will be enabled on the pool. This option can only be used to increase the version number up to the last supported legacy version number. .RE .RE diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 59c43f40c..0a785f78a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2209,7 +2209,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; - nvlist_t *unsup_feat; + nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj) != 0) { @@ -2226,27 +2226,32 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } - VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == - 0); + enabled_feat = fnvlist_alloc(); + unsup_feat = fnvlist_alloc(); if (!feature_is_supported(spa->spa_meta_objset, spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, - unsup_feat)) + unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { if (!feature_is_supported(spa->spa_meta_objset, spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, - unsup_feat)) + unsup_feat, enabled_feat)) { missing_feat_write = B_TRUE; + } } + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); + if (!nvlist_empty(unsup_feat)) { - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } - nvlist_free(unsup_feat); + fnvlist_free(enabled_feat); + fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c index de9d165a9..24ff18fc3 100644 --- a/module/zfs/zfeature.c +++ b/module/zfs/zfeature.c @@ -173,7 +173,7 @@ typedef enum { */ boolean_t feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - nvlist_t *unsup_feat) + nvlist_t *unsup_feat, nvlist_t *enabled_feat) { boolean_t supported; zap_cursor_t *zc; @@ -191,11 +191,16 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, ASSERT(za->za_integer_length == sizeof (uint64_t) && za->za_num_integers == 1); + if (NULL != enabled_feat) { + fnvlist_add_uint64(enabled_feat, za->za_name, + za->za_first_integer); + } + if (za->za_first_integer != 0 && !zfeature_is_supported(za->za_name)) { supported = B_FALSE; - if (unsup_feat != NULL) { + if (NULL != unsup_feat) { char *desc = ""; if (zap_lookup(os, desc_obj, za->za_name, -- cgit v1.2.3 From 29809a6cbae9869ca6ee026337981b2c9771650a Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 14 Dec 2012 16:13:40 -0800 Subject: Illumos #3086: unnecessarily setting DS_FLAG_INCONSISTENT on async 3086 unnecessarily setting DS_FLAG_INCONSISTENT on async destroyed datasets Reviewed by: Christopher Siden Approved by: Eric Schrock References: illumos/illumos-gate@ce636f8b38e8c9ff484e880d9abb27251a882860 illumos changeset: 13776:cd512c80fd75 https://www.illumos.org/issues/3086 Ported-by: Brian Behlendorf --- cmd/ztest/ztest.c | 3 ++ include/sys/dsl_pool.h | 7 ++- include/sys/txg.h | 5 +- include/sys/zil.h | 2 + include/sys/zil_impl.h | 2 + module/zfs/dmu.c | 4 +- module/zfs/dmu_send.c | 7 --- module/zfs/dsl_dataset.c | 108 +++++++++++++++++++------------------------ module/zfs/dsl_dir.c | 10 ++-- module/zfs/dsl_pool.c | 116 +++++++++++++++++++++++++++++++++++------------ module/zfs/txg.c | 4 +- module/zfs/zil.c | 56 +++++++++++++++++++++-- 12 files changed, 209 insertions(+), 115 deletions(-) (limited to 'module') diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 9b7adc726..6df0812c6 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -2277,6 +2277,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; + mutex_enter(&zd->zd_dirobj_lock); (void) rw_enter(&zd->zd_zilog_lock, RW_WRITER); /* zfs_sb_teardown() */ @@ -2287,6 +2288,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) zil_replay(os, zd, ztest_replay_vector); (void) rw_exit(&zd->zd_zilog_lock); + mutex_exit(&zd->zd_dirobj_lock); } /* @@ -5743,6 +5745,7 @@ ztest_freeze(void) */ kernel_init(FREAD | FWRITE); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + ASSERT(spa_freeze_txg(spa) == UINT64_MAX); VERIFY3U(0, ==, ztest_dataset_open(0)); ztest_dataset_close(0); diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 16fb98669..61c91f06c 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -90,7 +90,6 @@ typedef struct dsl_pool { /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; - list_t dp_synced_datasets; hrtime_t dp_read_overhead; uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; @@ -104,6 +103,9 @@ typedef struct dsl_pool { kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; + uint64_t dp_mos_used_delta; + uint64_t dp_mos_compressed_delta; + uint64_t dp_mos_uncompressed_delta; uint64_t dp_txg_history_size; list_t dp_txg_history; @@ -111,6 +113,7 @@ typedef struct dsl_pool { /* Has its own locking */ tx_state_t dp_tx; txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_zilogs; txg_list_t dp_dirty_dirs; txg_list_t dp_sync_tasks; @@ -150,6 +153,8 @@ int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); diff --git a/include/sys/txg.h b/include/sys/txg.h index 2f87d747c..f9d6dd421 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ #ifndef _SYS_TXG_H #define _SYS_TXG_H @@ -121,7 +124,7 @@ extern void txg_wait_callbacks(struct dsl_pool *dp); extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); -extern int txg_list_empty(txg_list_t *tl, uint64_t txg); +extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); diff --git a/include/sys/zil.h b/include/sys/zil.h index c583887cd..f786f0c9f 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -454,6 +455,7 @@ extern void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]); extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 6c37c1ac2..f5b69b7ed 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -131,6 +132,7 @@ struct zilog { zil_header_t zl_old_header; /* debugging aid */ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ }; typedef struct zil_bp_node { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 5d3f70d4c..e85635654 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1953,15 +1953,15 @@ dmu_init(void) dbuf_init(); zfetch_init(); dmu_tx_init(); - arc_init(); l2arc_init(); + arc_init(); } void dmu_fini(void) { - l2arc_fini(); arc_fini(); + l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 97c23cb07..0cf3c4a9a 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1609,13 +1609,6 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) dsl_dataset_t *ds = drc->drc_logical_ds; int err, myerr; - /* - * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() - * expects it to have a ds_user_ptr (and zil), but clone_swap() - * can close it. - */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, drc->drc_force); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 872d44afb..c5b84a26c 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -105,14 +105,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dsl_dir. - */ - ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, - used, compressed, uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + dsl_pool_mos_diduse_space(tx->tx_pool, + used, compressed, uncompressed); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); @@ -150,15 +144,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(used > 0); if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dataset. - */ dsl_free(tx->tx_pool, tx->tx_txg, bp); - - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, - -used, -compressed, -uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + dsl_pool_mos_diduse_space(tx->tx_pool, + -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); @@ -1074,26 +1062,26 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) dummy_ds->ds_dir = dd; dummy_ds->ds_object = ds->ds_object; - /* - * Check for errors and mark this ds as inconsistent, in - * case we crash while freeing the objects. - */ - err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, - dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) - goto out_free; - - err = dmu_objset_from_ds(ds, &os); - if (err) - goto out_free; - - /* - * If async destruction is not enabled try to remove all objects - * while in the open context so that there is less work to do in - * the syncing context. - */ if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + /* + * Check for errors and mark this ds as inconsistent, in + * case we crash while freeing the objects. + */ + err = dsl_sync_task_do(dd->dd_pool, + dsl_dataset_destroy_begin_check, + dsl_dataset_destroy_begin_sync, ds, NULL, 0); + if (err) + goto out_free; + + err = dmu_objset_from_ds(ds, &os); + if (err) + goto out_free; + + /* + * Remove all objects while in the open context so that + * there is less work to do in the syncing context. + */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { /* @@ -1104,29 +1092,25 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) } if (err != ESRCH) goto out_free; - } - /* - * Only the ZIL knows how to free log blocks. - */ - zil_destroy(dmu_objset_zil(os), B_FALSE); - - /* - * Sync out all in-flight IO. - */ - txg_wait_synced(dd->dd_pool, 0); + /* + * Sync out all in-flight IO. + */ + txg_wait_synced(dd->dd_pool, 0); - /* - * If we managed to free all the objects in open - * context, the user space accounting should be zero. - */ - if (ds->ds_phys->ds_bp.blk_fill == 0 && - dmu_objset_userused_enabled(os)) { - ASSERTV(uint64_t count); - ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || - count == 0); - ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || - count == 0); + /* + * If we managed to free all the objects in open + * context, the user space accounting should be zero. + */ + if (ds->ds_phys->ds_bp.blk_fill == 0 && + dmu_objset_userused_enabled(os)) { + ASSERTV(uint64_t count); + + ASSERT(zap_count(os, DMU_USERUSED_OBJECT, + &count) != 0 || count == 0); + ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, + &count) != 0 || count == 0); + } } rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); @@ -1878,6 +1862,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } else { zfeature_info_t *async_destroy = &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + objset_t *os; /* * There's no next snapshot, so this is a head dataset. @@ -1889,6 +1874,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; + VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { err = old_synchronous_dataset_destroy(ds, tx); } else { @@ -1898,12 +1885,12 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) */ uint64_t used, comp, uncomp; - ASSERT(err == 0 || err == EBUSY); + zil_destroy_sync(dmu_objset_zil(os), tx); + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { spa_feature_incr(dp->dp_spa, async_destroy, tx); - dp->dp_bptree_obj = bptree_alloc( - dp->dp_meta_objset, tx); - VERIFY(zap_add(dp->dp_meta_objset, + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj, tx) == 0); @@ -1916,7 +1903,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == used); - bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj, + bptree_add(mos, dp->dp_bptree_obj, &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, used, comp, uncomp, tx); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, @@ -2203,7 +2190,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; - dsl_dir_dirty(ds->ds_dir, tx); dmu_objset_sync(ds->ds_objset, zio, tx); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 377df40a8..741223984 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -189,7 +189,6 @@ errout: kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); - } void @@ -223,7 +222,7 @@ dsl_dir_name(dsl_dir_t *dd, char *buf) } } -/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */ +/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { @@ -592,8 +591,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - mutex_enter(&dd->dd_lock); ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, @@ -950,8 +947,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); - dsl_dir_dirty(dd, tx); - if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); @@ -960,6 +955,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); + dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; @@ -1003,13 +999,13 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; - dsl_dir_dirty(dd, tx); if (needlock) mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? dd->dd_phys->dd_used_breakdown[oldtype] >= delta : dd->dd_phys->dd_used_breakdown[newtype] >= -delta); ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); + dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_used_breakdown[oldtype] -= delta; dd->dd_phys->dd_used_breakdown[newtype] += delta; if (needlock) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 85c745e8a..089a7f092 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -42,6 +42,7 @@ #include #include #include +#include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -224,12 +225,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_dirty_datasets, offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_zilogs, + offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, offsetof(dsl_sync_task_group_t, dstg_node)); - list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), - offsetof(dsl_dataset_t, ds_synced_link)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); @@ -362,9 +363,9 @@ dsl_pool_close(dsl_pool_t *dp) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); + txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); - list_destroy(&dp->dp_synced_datasets); arc_flush(dp->dp_spa); txg_fini(dp); @@ -445,6 +446,21 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) return (dp); } +/* + * Account for the meta-objset space in its placeholder dsl_dir. + */ +void +dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp) +{ + ASSERT3U(comp, ==, uncomp); /* it's all metadata */ + mutex_enter(&dp->dp_lock); + dp->dp_mos_used_delta += used; + dp->dp_mos_compressed_delta += comp; + dp->dp_mos_uncompressed_delta += uncomp; + mutex_exit(&dp->dp_lock); +} + static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { @@ -463,11 +479,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; - dsl_sync_task_group_t *dstg; objset_t *mos = dp->dp_meta_objset; hrtime_t start, write_time; uint64_t data_written; int err; + list_t synced_datasets; + + list_create(&synced_datasets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); /* * We need to copy dp_space_towrite() before doing @@ -490,7 +509,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); - list_insert_tail(&dp->dp_synced_datasets, ds); + list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); @@ -500,15 +519,20 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) ASSERT(err == 0); DTRACE_PROBE(pool_sync__2rootzio); - for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) + /* + * After the data blocks have been written (ensured by the zio_wait() + * above), update the user/group space accounting. + */ + for (ds = list_head(&synced_datasets); ds; + ds = list_next(&synced_datasets, ds)) dmu_objset_do_userquota_updates(ds->ds_objset, tx); /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the - * sync tasks, because that could cause a snapshot of a dataset - * whose ds_bp will be rewritten when we do this 2nd sync. + * sync tasks, so that any snapshots will have the correct + * user accounting information (and we won't get confused + * about which blocks are part of the snapshot). */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { @@ -519,30 +543,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) err = zio_wait(zio); /* - * Move dead blocks from the pending deadlist to the on-disk - * deadlist. + * Now that the datasets have been completely synced, we can + * clean up our in-memory structures accumulated while syncing: + * + * - move dead blocks from the pending deadlist to the on-disk deadlist + * - clean up zil records + * - release hold from dsl_dataset_dirty() */ - for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) { + while ((ds = list_remove_head(&synced_datasets))) { + ASSERTV(objset_t *os = ds->ds_objset); bplist_iterate(&ds->ds_pending_deadlist, deadlist_enqueue_cb, &ds->ds_deadlist, tx); + ASSERT(!dmu_objset_is_dirty(os, txg)); + dmu_buf_rele(ds->ds_dbuf, ds); } - while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) { - /* - * No more sync tasks should have been added while we - * were syncing. - */ - ASSERT(spa_sync_pass(dp->dp_spa) == 1); - dsl_sync_task_group_sync(dstg, tx); - } - DTRACE_PROBE(pool_sync__3task); - start = gethrtime(); while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg))) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; + /* + * The MOS's space is accounted for in the pool/$MOS + * (dp_mos_dir). We can't modify the mos while we're syncing + * it, so we remember the deltas and apply them here. + */ + if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || + dp->dp_mos_uncompressed_delta != 0) { + dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, + dp->dp_mos_used_delta, + dp->dp_mos_compressed_delta, + dp->dp_mos_uncompressed_delta, tx); + dp->dp_mos_used_delta = 0; + dp->dp_mos_compressed_delta = 0; + dp->dp_mos_uncompressed_delta = 0; + } + start = gethrtime(); if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { @@ -558,6 +594,27 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) hrtime_t, dp->dp_read_overhead); write_time -= dp->dp_read_overhead; + /* + * If we modify a dataset in the same txg that we want to destroy it, + * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. + * dsl_dir_destroy_check() will fail if there are unexpected holds. + * Therefore, we want to sync the MOS (thus syncing the dd_dbuf + * and clearing the hold on it) before we process the sync_tasks. + * The MOS data dirtied by the sync_tasks will be synced on the next + * pass. + */ + DTRACE_PROBE(pool_sync__3task); + if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { + dsl_sync_task_group_t *dstg; + /* + * No more sync tasks should have been added while we + * were syncing. + */ + ASSERT(spa_sync_pass(dp->dp_spa) == 1); + while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) + dsl_sync_task_group_sync(dstg, tx); + } + dmu_tx_commit(tx); dp->dp_space_towrite[txg & TXG_MASK] = 0; @@ -606,15 +663,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { + zilog_t *zilog; dsl_dataset_t *ds; - objset_t *os; - while ((ds = list_head(&dp->dp_synced_datasets))) { - list_remove(&dp->dp_synced_datasets, ds); - os = ds->ds_objset; - zil_clean(os->os_zil, txg); - ASSERT(!dmu_objset_is_dirty(os, txg)); - dmu_buf_rele(ds->ds_dbuf, ds); + while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) { + ds = dmu_objset_ds(zilog->zl_os); + zil_clean(zilog, txg); + ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); + dmu_buf_rele(ds->ds_dbuf, zilog); } ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 17494bcda..838a6f642 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -669,7 +671,7 @@ txg_list_destroy(txg_list_t *tl) mutex_destroy(&tl->tl_lock); } -int +boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg) { return (tl->tl_head[txg & TXG_MASK] == NULL); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 220f2d79e..c9618c13e 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -479,6 +479,38 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) return (lwb); } +/* + * Called when we create in-memory log transactions so that we know + * to cleanup the itxs at the end of spa_sync(). + */ +void +zilog_dirty(zilog_t *zilog, uint64_t txg) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + + if (dsl_dataset_is_snapshot(ds)) + panic("dirtying snapshot!"); + + if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, zilog); + } +} + +boolean_t +zilog_is_dirty(zilog_t *zilog) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + int t; + + for (t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) + return (B_TRUE); + } + return (B_FALSE); +} + /* * Create an on-disk intent log. */ @@ -601,14 +633,21 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) kmem_cache_free(zil_lwb_cache, lwb); } } else if (!keep_first) { - (void) zil_parse(zilog, zil_free_log_block, - zil_free_log_record, tx, zh->zh_claim_txg); + zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); } +void +zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); +} + int zil_claim(const char *osname, void *txarg) { @@ -1042,6 +1081,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) return (NULL); ASSERT(lwb->lwb_buf != NULL); + ASSERT(zilog_is_dirty(zilog) || + spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) dlen = P2ROUNDUP_TYPED( @@ -1272,7 +1313,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); @@ -1323,6 +1364,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + zilog_dirty(zilog, txg); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ @@ -1332,7 +1374,10 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) /* * If there are any in-memory intent log transactions which have now been - * synced then start up a taskq to free them. + * synced then start up a taskq to free them. We should only do this after we + * have written out the uberblocks (i.e. txg has been comitted) so that + * don't inadvertently clean out in-memory log records that would be required + * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) @@ -1837,6 +1882,7 @@ zil_close(zilog_t *zilog) mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); + ASSERT(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; -- cgit v1.2.3 From ea0b2538cd5967fcdf26b7b7c01859a060fef3e3 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 14 Dec 2012 16:28:49 -0800 Subject: Illumos #3349: zpool upgrade -V bumps the on disk version number 3349 zpool upgrade -V bumps the on disk version number, but leaves the in core version Reviewed by: Adam Leventhal Reviewed by: Christopher Siden Reviewed by: Matt Ahrens Reviewed by: Richard Lowe Approved by: Dan McDonald References: illumos/illumos-gate@25345e466695fbe736faa53b8f3413d8e8f81981 https://www.illumos.org/issues/3349 Ported-by: Brian Behlendorf --- cmd/ztest/ztest.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++-------- module/zfs/spa.c | 8 ++++ 2 files changed, 127 insertions(+), 21 deletions(-) (limited to 'module') diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 6df0812c6..c9fdf466f 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -331,6 +331,7 @@ ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; +ztest_func_t ztest_spa_upgrade; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -364,6 +365,7 @@ ztest_info_t ztest_info[] = { { ztest_reguid, 1, &zopt_sometimes }, { ztest_spa_rename, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_rarely }, + { ztest_spa_upgrade, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, @@ -818,7 +820,7 @@ ztest_get_ashift(void) } static nvlist_t * -make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) +make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; @@ -836,12 +838,13 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) vdev = ztest_shared->zs_vdev_aux; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, - ztest_opts.zo_pool, aux, vdev); + pool == NULL ? ztest_opts.zo_pool : pool, + aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; (void) snprintf(path, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, - ztest_opts.zo_pool, vdev); + pool == NULL ? ztest_opts.zo_pool : pool, vdev); } } @@ -864,17 +867,18 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) } static nvlist_t * -make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r) +make_vdev_raidz(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r) { nvlist_t *raidz, **child; int c; if (r < 2) - return (make_vdev_file(path, aux, size, ashift)); + return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) - child[c] = make_vdev_file(path, aux, size, ashift); + child[c] = make_vdev_file(path, aux, pool, size, ashift); VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, @@ -893,19 +897,19 @@ make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r) } static nvlist_t * -make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift, - int r, int m) +make_vdev_mirror(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) - return (make_vdev_raidz(path, aux, size, ashift, r)); + return (make_vdev_raidz(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, size, ashift, r); + child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, @@ -922,8 +926,8 @@ make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift, } static nvlist_t * -make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, - int log, int r, int m, int t) +make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, + int log, int r, int m, int t) { nvlist_t *root, **child; int c; @@ -933,7 +937,8 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { - child[c] = make_vdev_mirror(path, aux, size, ashift, r, m); + child[c] = make_vdev_mirror(path, aux, pool, size, ashift, + r, m); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log) == 0); } @@ -951,6 +956,27 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, return (root); } +/* + * Find a random spa version. Returns back a random spa version in the + * range [initial_version, SPA_VERSION_FEATURES]. + */ +static uint64_t +ztest_random_spa_version(uint64_t initial_version) +{ + uint64_t version = initial_version; + + if (version <= SPA_VERSION_BEFORE_FEATURES) { + version = version + + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); + } + + if (version > SPA_VERSION_BEFORE_FEATURES) + version = SPA_VERSION_FEATURES; + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + return (version); +} + static int ztest_random_blocksize(void) { @@ -2306,7 +2332,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Attempt to create using a bad file. */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); @@ -2314,7 +2340,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Attempt to create using a bad mirror. */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); @@ -2324,7 +2350,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) * what's in the nvroot; we should fail with EEXIST. */ (void) rw_enter(&ztest_name_lock, RW_READER); - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); @@ -2334,6 +2360,78 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) (void) rw_exit(&ztest_name_lock); } +/* ARGSUSED */ +void +ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa; + uint64_t initial_version = SPA_VERSION_INITIAL; + uint64_t version, newversion; + nvlist_t *nvroot, *props; + char *name; + + mutex_enter(&ztest_vdev_lock); + name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); + + /* + * Clean up from previous runs. + */ + (void) spa_destroy(name); + + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, + 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + + /* + * If we're configuring a RAIDZ device then make sure that the + * the initial version is capable of supporting that feature. + */ + switch (ztest_opts.zo_raidz_parity) { + case 0: + case 1: + initial_version = SPA_VERSION_INITIAL; + break; + case 2: + initial_version = SPA_VERSION_RAIDZ2; + break; + case 3: + initial_version = SPA_VERSION_RAIDZ3; + break; + } + + /* + * Create a pool with a spa version that can be upgraded. Pick + * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. + */ + do { + version = ztest_random_spa_version(initial_version); + } while (version > SPA_VERSION_BEFORE_FEATURES); + + props = fnvlist_alloc(); + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), version); + VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0); + fnvlist_free(nvroot); + fnvlist_free(props); + + VERIFY3S(spa_open(name, &spa, FTAG), ==, 0); + VERIFY3U(spa_version(spa), ==, version); + newversion = ztest_random_spa_version(version + 1); + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("upgrading spa version from %llu to %llu\n", + (u_longlong_t)version, (u_longlong_t)newversion); + } + + spa_upgrade(spa, newversion); + VERIFY3U(spa_version(spa), >, version); + VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, + zpool_prop_to_name(ZPOOL_PROP_VERSION))); + spa_close(spa, FTAG); + + strfree(name); + mutex_exit(&ztest_vdev_lock); +} + static vdev_t * vdev_lookup_by_path(vdev_t *vd, const char *path) { @@ -2424,7 +2522,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) /* * Make 1/4 of the devices be log devices. */ - nvroot = make_vdev_root(NULL, NULL, + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, ztest_random(4) == 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); @@ -2503,7 +2601,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) /* * Add a new device. */ - nvlist_t *nvroot = make_vdev_root(NULL, aux, + nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); error = spa_vdev_add(spa, nvroot); if (error != 0) @@ -2776,7 +2874,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) /* * Build the nvlist describing newpath. */ - root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0, + root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, 0, 0, 0, 1); error = spa_vdev_attach(spa, oldguid, root, replacing); @@ -5055,7 +5153,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) if (error != 0) return; - if (ztest_opts.zo_verbose >= 3) { + if (ztest_opts.zo_verbose >= 4) { (void) printf("Changed guid old %llu -> %llu\n", (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); } @@ -5819,7 +5917,7 @@ ztest_init(ztest_shared_t *zs) ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; - nvroot = make_vdev_root(NULL, NULL, ztest_opts.zo_vdev_size, 0, + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); for (i = 0; i < SPA_FEATURES; i++) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0a785f78a..5b6465f2e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -5742,6 +5742,14 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); + /* + * If we're upgrading the spa version then make sure that + * the config object gets updated with the correct version. + */ + if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa->spa_uberblock.ub_version); + spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) -- cgit v1.2.3 From 91579709fccd3e55a21970742b66c388fb1403db Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 17 Dec 2012 16:23:27 -0800 Subject: Fix __zio_execute() asynchronous dispatch To save valuable stack all zio's were made asynchronous when in the tgx_sync_thread context or during pool initialization. See commit 2fac4c2 for the original patch and motivation. Unfortuantely, the changes to dsl_pool_sync_context() made by the feature flags broke this logic causing in __zio_execute() to dispatch itself infinitely when called during pool initialization. This commit refines the existing logic to specificly target only the two cases we care about. Signed-off-by: Brian Behlendorf --- module/zfs/zio.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'module') diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 66f228bc7..638105a09 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1248,7 +1248,7 @@ __zio_execute(zio_t *zio) while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; - dsl_pool_t *dsl; + dsl_pool_t *dp; boolean_t cut; int rv; @@ -1262,7 +1262,7 @@ __zio_execute(zio_t *zio) ASSERT(stage <= ZIO_STAGE_DONE); - dsl = spa_get_dsl(zio->io_spa); + dp = spa_get_dsl(zio->io_spa); cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; @@ -1272,16 +1272,24 @@ __zio_execute(zio_t *zio) * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * - * If we are in the txg_sync_thread or being called - * during pool init issue async to minimize stack depth. - * Both of these call paths may be recursively called. - * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ - if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && - zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) || - (dsl != NULL && dsl_pool_sync_context(dsl))) { + if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && + zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); + return; + } + + /* + * If we executing in the context of the tx_sync_thread, + * or we are performing pool initialization outside of a + * zio_taskq[ZIO_TASKQ_ISSUE] context. Then issue the zio + * async to minimize stack usage for these deep call paths. + */ + if ((dp && curthread == dp->dp_tx.tx_sync_thread) || + (dp && spa_is_initializing(dp->dp_spa) && + !zio_taskq_member(zio, ZIO_TASKQ_ISSUE))) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } -- cgit v1.2.3 From 753c38392ddff9d3cf140bb4d28f3bfba52c92d2 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Sun, 23 Dec 2012 15:57:14 -0800 Subject: Illumos #3104: eliminate empty bpobjs 3104 eliminate empty bpobjs Reviewed by: George Wilson Reviewed by: Adam Leventhal Reviewed by: Christopher Siden Reviewed by: Garrett D'Amore Approved by: Eric Schrock References: illumos/illumos-gate@f17457368189aa911f774c38c1f21875a568bdca illumos changeset: 13782:8f78aae28a63 https://www.illumos.org/issues/3104 Ported-by: Brian Behlendorf --- include/sys/bpobj.h | 3 +++ include/sys/dmu.h | 1 + include/sys/dsl_pool.h | 1 + include/sys/zap.h | 2 ++ include/zfeature_common.h | 1 + man/man5/zpool-features.5 | 28 +++++++++++++++++++++ module/zfs/bpobj.c | 58 +++++++++++++++++++++++++++++++++++++++++++- module/zfs/dsl_deadlist.c | 54 ++++++++++++++++++++++++++++++++++------- module/zfs/dsl_pool.c | 9 +++++++ module/zfs/zap.c | 10 ++++++++ module/zfs/zfeature.c | 7 +++++- module/zfs/zfeature_common.c | 3 +++ 12 files changed, 166 insertions(+), 11 deletions(-) (limited to 'module') diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index 3771a9541..af975c734 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_BPOBJ_H @@ -67,7 +68,9 @@ typedef struct bpobj { typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); +uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); void bpobj_close(bpobj_t *bpo); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index ce3169731..7fc876be7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -309,6 +309,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" +#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" /* * Allocate an object from this objset. The range of object numbers diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 61c91f06c..ff5df1414 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -96,6 +96,7 @@ typedef struct dsl_pool { uint64_t dp_tmp_userrefs_obj; bpobj_t dp_free_bpobj; uint64_t dp_bptree_obj; + uint64_t dp_empty_bpobj; struct dsl_scan *dp_scan; diff --git a/include/sys/zap.h b/include/sys/zap.h index 4d7b31559..092669c8b 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -300,6 +300,8 @@ int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, /* Here the key is an int and the value is a different int. */ int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 27f8f00a0..cb1d02fd7 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -51,6 +51,7 @@ typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg); typedef enum spa_feature { SPA_FEATURE_ASYNC_DESTROY, + SPA_FEATURE_EMPTY_BPOBJ, SPA_FEATURES } spa_feature_t; diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 3c1930c35..0ab179ef7 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -169,5 +169,33 @@ through the \fBfreeing\fR property. This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero. .RE + +.sp +.ne 2 +.na +\fB\fBempty_bpobj\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:empty_bpobj +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature increases the performance of creating and using a large +number of snapshots of a single filesystem or volume, and also reduces +the disk space required. + +When there are many snapshots, each snapshot uses many Block Pointer +Objects (bpobj's) to track blocks associated with that snapshot. +However, in common use cases, most of these bpobj's are empty. This +feature allows us to create each bpobj on-demand, thus eliminating the +empty bpobjs. + +This feature is \fBactive\fR while there are any filesystems, volumes, +or snapshots which were created after enabling this feature. +.RE + .SH "SEE ALSO" \fBzpool\fR(1M) diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 022921c66..d5f8d4072 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -20,13 +20,61 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include +#include +#include + +/* + * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). + */ +uint64_t +bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + zfeature_info_t *empty_bpobj_feat = + &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ]; + spa_t *spa = dmu_objset_spa(os); + dsl_pool_t *dp = dmu_objset_pool(os); + + if (spa_feature_is_enabled(spa, empty_bpobj_feat)) { + if (!spa_feature_is_active(spa, empty_bpobj_feat)) { + ASSERT3U(dp->dp_empty_bpobj, ==, 0); + dp->dp_empty_bpobj = + bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx); + VERIFY(zap_add(os, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj, tx) == 0); + } + spa_feature_incr(spa, empty_bpobj_feat, tx); + ASSERT(dp->dp_empty_bpobj != 0); + return (dp->dp_empty_bpobj); + } else { + return (bpobj_alloc(os, blocksize, tx)); + } +} + +void +bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) +{ + zfeature_info_t *empty_bpobj_feat = + &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ]; + dsl_pool_t *dp = dmu_objset_pool(os); + + spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx); + if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) { + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, tx)); + VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); + dp->dp_empty_bpobj = 0; + } +} uint64_t bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) @@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) int epb; dmu_buf_t *dbuf = NULL; + ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); mutex_enter(&bpo.bpo_lock); @@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havecomp); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { + bpobj_decr_empty(bpo->bpo_os, tx); + return; + } VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); @@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) blkptr_t *bparray; ASSERT(!BP_IS_HOLE(bp)); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); /* We never need the fill count. */ stored_bp.blk_fill = 0; diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 1e89a68d7..909b5f8fc 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -165,12 +165,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) for (zap_cursor_init(&zc, os, dlobj); zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) - bpobj_free(os, za.za_first_integer, tx); + zap_cursor_advance(&zc)) { + uint64_t obj = za.za_first_integer; + if (obj == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, obj, tx); + } zap_cursor_fini(&zc); VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); } +static void +dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + const blkptr_t *bp, dmu_tx_t *tx) +{ + if (dle->dle_bpobj.bpo_object == + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } + bpobj_enqueue(&dle->dle_bpobj, bp, tx); +} + +static void +dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj, dmu_tx_t *tx) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); + } else { + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } +} + void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) { @@ -199,7 +236,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); else dle = AVL_PREV(&dl->dl_tree, dle); - bpobj_enqueue(&dle->dle_bpobj, bp, tx); + dle_enqueue(dl, dle, bp, tx); } /* @@ -219,7 +256,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE); dle->dle_mintxg = mintxg; - obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); @@ -245,8 +282,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); - bpobj_enqueue_subobj(&dle_prev->dle_bpobj, - dle->dle_bpobj.bpo_object, tx); + dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); @@ -304,7 +340,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, if (dle->dle_mintxg >= maxtxg) break; - obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } @@ -402,7 +438,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); - bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); + dle_enqueue_subobj(dl, dle, obj, tx); } static int diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 089a7f092..704f034e9 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -322,6 +322,15 @@ dsl_pool_open(dsl_pool_t *dp) goto out; } + if (spa_feature_is_active(dp->dp_spa, + &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj); + if (err != 0) + goto out; + } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 59f62fa3a..fd3021be6 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -1093,6 +1093,16 @@ zap_add_int_key(objset_t *os, uint64_t obj, return (zap_add(os, obj, name, 8, 1, &value, tx)); } +int +zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_update(os, obj, name, 8, 1, &value, tx)); +} + int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c index 24ff18fc3..c09b32d17 100644 --- a/module/zfs/zfeature.c +++ b/module/zfs/zfeature.c @@ -229,7 +229,12 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, uint64_t refcount; uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; - ASSERT(0 != zapobj); + /* + * If the pool is currently being created, the feature objects may not + * have been allocated yet. Act as though all features are disabled. + */ + if (zapobj == 0) + return (ENOTSUP); err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount); diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c index 33d15133e..40066991b 100644 --- a/module/zfs/zfeature_common.c +++ b/module/zfs/zfeature_common.c @@ -157,4 +157,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); + zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, + "com.delphix:empty_bpobj", "empty_bpobj", + "Snapshots use less space.", B_TRUE, B_FALSE, NULL); } -- cgit v1.2.3 From 1eb5bfa3dcdaecb19543d9df13131374a7a42947 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 21 Dec 2012 14:57:09 -0800 Subject: Illumos #3145, #3212 3145 single-copy arc 3212 ztest: race condition between vdev_online() and spa_vdev_remove() Reviewed by: Matt Ahrens Reviewed by: Adam Leventhal Reviewed by: Eric Schrock Reviewed by: Justin T. Gibbs Approved by: Eric Schrock References: illumos-gate/commit/9253d63df408bb48584e0b1abfcc24ef2472382e illumos changeset: 13840:97fd5cdf328a https://www.illumos.org/issues/3145 https://www.illumos.org/issues/3212 Ported-by: Brian Behlendorf Closes #989 Closes #1137 --- cmd/ztest/ztest.c | 11 ++++++++ include/sys/arc.h | 1 + module/zfs/arc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- module/zfs/dbuf.c | 19 ++++++++++++- 4 files changed, 113 insertions(+), 2 deletions(-) (limited to 'module') diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index c9fdf466f..07b81cc27 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4965,7 +4965,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (islog) (void) rw_exit(&ztest_name_lock); } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. + */ + mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); + mutex_exit(&ztest_vdev_lock); } } diff --git a/include/sys/arc.h b/include/sys/arc.h index 443597df0..dbc91ea3d 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -109,6 +109,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5d9b34fbf..5e21b1b71 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -189,6 +189,7 @@ unsigned long zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; int zfs_arc_meta_prune = 0; /* @@ -307,6 +308,9 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; @@ -387,6 +391,9 @@ static arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, @@ -1369,6 +1376,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1467,6 +1485,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1651,6 +1679,48 @@ arc_buf_size(arc_buf_t *buf) return (buf->b_hdr->b_size); } +/* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + /* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. @@ -2753,8 +2823,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3324,6 +3396,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 99f728f4b..205abaada 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2189,7 +2189,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. + * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. + */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx); -- cgit v1.2.3