summaryrefslogtreecommitdiffstats
path: root/zfs/lib/libzpool
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2008-11-20 12:01:55 -0800
committerBrian Behlendorf <[email protected]>2008-11-20 12:01:55 -0800
commit34dc7c2f2553220ebc6e29ca195fb6d57155f95f (patch)
tree634a0df4aa30200d83c16025768c9ef76a26136d /zfs/lib/libzpool
Initial Linux ZFS GIT Repo
Diffstat (limited to 'zfs/lib/libzpool')
-rw-r--r--zfs/lib/libzpool/Makefile.in102
-rw-r--r--zfs/lib/libzpool/arc.c4232
-rw-r--r--zfs/lib/libzpool/bplist.c313
-rw-r--r--zfs/lib/libzpool/dbuf.c2251
-rw-r--r--zfs/lib/libzpool/dmu.c1049
-rw-r--r--zfs/lib/libzpool/dmu_object.c160
-rw-r--r--zfs/lib/libzpool/dmu_objset.c1149
-rw-r--r--zfs/lib/libzpool/dmu_traverse.c917
-rw-r--r--zfs/lib/libzpool/dmu_tx.c1034
-rw-r--r--zfs/lib/libzpool/dmu_zfetch.c651
-rw-r--r--zfs/lib/libzpool/dnode.c1387
-rw-r--r--zfs/lib/libzpool/dnode_sync.c616
-rw-r--r--zfs/lib/libzpool/dsl_dataset.c2798
-rw-r--r--zfs/lib/libzpool/dsl_deleg.c744
-rw-r--r--zfs/lib/libzpool/dsl_dir.c1269
-rw-r--r--zfs/lib/libzpool/dsl_pool.c339
-rw-r--r--zfs/lib/libzpool/dsl_prop.c551
-rw-r--r--zfs/lib/libzpool/dsl_synctask.c225
-rw-r--r--zfs/lib/libzpool/fletcher.c145
-rw-r--r--zfs/lib/libzpool/gzip.c69
-rw-r--r--zfs/lib/libzpool/kernel.c894
-rw-r--r--zfs/lib/libzpool/lzjb.c128
-rw-r--r--zfs/lib/libzpool/metaslab.c1053
-rw-r--r--zfs/lib/libzpool/refcount.c195
-rw-r--r--zfs/lib/libzpool/sha256.c129
-rw-r--r--zfs/lib/libzpool/spa.c4501
-rw-r--r--zfs/lib/libzpool/spa_boot.c198
-rw-r--r--zfs/lib/libzpool/spa_config.c492
-rw-r--r--zfs/lib/libzpool/spa_errlog.c440
-rw-r--r--zfs/lib/libzpool/spa_history.c421
-rw-r--r--zfs/lib/libzpool/spa_misc.c1280
-rw-r--r--zfs/lib/libzpool/space_map.c506
-rw-r--r--zfs/lib/libzpool/taskq.c255
-rw-r--r--zfs/lib/libzpool/txg.c661
-rw-r--r--zfs/lib/libzpool/uberblock.c63
-rw-r--r--zfs/lib/libzpool/unique.c116
-rw-r--r--zfs/lib/libzpool/util.c151
-rw-r--r--zfs/lib/libzpool/vdev.c2207
-rw-r--r--zfs/lib/libzpool/vdev_cache.c435
-rw-r--r--zfs/lib/libzpool/vdev_disk.c639
-rw-r--r--zfs/lib/libzpool/vdev_file.c340
-rw-r--r--zfs/lib/libzpool/vdev_label.c1045
-rw-r--r--zfs/lib/libzpool/vdev_mirror.c496
-rw-r--r--zfs/lib/libzpool/vdev_missing.c96
-rw-r--r--zfs/lib/libzpool/vdev_queue.c320
-rw-r--r--zfs/lib/libzpool/vdev_raidz.c1239
-rw-r--r--zfs/lib/libzpool/vdev_root.c130
-rw-r--r--zfs/lib/libzpool/zap.c1085
-rw-r--r--zfs/lib/libzpool/zap_leaf.c853
-rw-r--r--zfs/lib/libzpool/zap_micro.c1069
-rw-r--r--zfs/lib/libzpool/zfs_byteswap.c175
-rw-r--r--zfs/lib/libzpool/zfs_fm.c355
-rw-r--r--zfs/lib/libzpool/zfs_znode.c1390
-rw-r--r--zfs/lib/libzpool/zil.c1618
-rw-r--r--zfs/lib/libzpool/zio.c2082
-rw-r--r--zfs/lib/libzpool/zio_checksum.c172
-rw-r--r--zfs/lib/libzpool/zio_compress.c148
-rw-r--r--zfs/lib/libzpool/zio_inject.c315
58 files changed, 47693 insertions, 0 deletions
diff --git a/zfs/lib/libzpool/Makefile.in b/zfs/lib/libzpool/Makefile.in
new file mode 100644
index 000000000..440fd1cbc
--- /dev/null
+++ b/zfs/lib/libzpool/Makefile.in
@@ -0,0 +1,102 @@
+# NOTE: kernel.c, taskq.c, util.c unused by kernel port.
+# Potentially they should just be removed if we don't care
+# about user space lustre integration from this source base.
+
+# Source files in this directory.  The list includes kernel.c, taskq.c
+# and util.c even though no matching objects are linked into the module.
+DISTFILES = arc.c bplist.c dbuf.c dmu.c dmu_object.c dmu_objset.c
+DISTFILES += dmu_traverse.c dmu_tx.c dmu_zfetch.c dnode.c dnode_sync.c
+DISTFILES += dsl_dataset.c dsl_deleg.c dsl_dir.c dsl_pool.c dsl_prop.c
+DISTFILES += dsl_synctask.c fletcher.c gzip.c kernel.c lzjb.c metaslab.c
+DISTFILES += refcount.c sha256.c spa.c spa_boot.c spa_config.c spa_errlog.c
+DISTFILES += spa_history.c spa_misc.c space_map.c taskq.c txg.c uberblock.c
+DISTFILES += unique.c util.c vdev.c vdev_cache.c vdev_disk.c vdev_missing.c
+DISTFILES += vdev_file.c vdev_label.c vdev_mirror.c vdev_queue.c vdev_raidz.c
+DISTFILES += vdev_root.c zap.c zap_leaf.c zap_micro.c zfs_byteswap.c zfs_fm.c
+DISTFILES += zfs_znode.c zil.c zio.c zio_checksum.c zio_compress.c zio_inject.c
+
+# Module name; ${MODULE}.o and ${MODULE}-objs below expand to zpool.o
+# and zpool-objs.
+MODULE := zpool
+
+# Include paths for the sibling libraries.  The @...@ tokens are
+# placeholders, presumably substituted when the Makefile is generated
+# from this .in file -- confirm against the configure script.
+EXTRA_CFLAGS = @KERNELCPPFLAGS@
+EXTRA_CFLAGS += -I@LIBDIR@/libzcommon/include
+EXTRA_CFLAGS += -I@LIBDIR@/libport/include
+EXTRA_CFLAGS += -I@LIBDIR@/libavl/include
+EXTRA_CFLAGS += -I@LIBDIR@/libnvpair/include
+
+obj-m := ${MODULE}.o
+
+# DMU (Data Management Unit)
+${MODULE}-objs += dmu.o # Interfaces dmu core
+${MODULE}-objs += dmu_objset.o # Interfaces dmu objset open/close/manipulate
+${MODULE}-objs += dmu_object.o # Interfaces dmu alloc/free
+${MODULE}-objs += txg.o # Transaction model control threads
+${MODULE}-objs += dmu_tx.o # Interfaces dmu transaction create/manipulate
+${MODULE}-objs += dnode.o # Open context object-level support
+${MODULE}-objs += dnode_sync.o # Syncing context object-level support
+${MODULE}-objs += dbuf.o # Buffer management support
+${MODULE}-objs += dmu_zfetch.o # Data stream prefetch logic
+${MODULE}-objs += refcount.o # Generic refcount support
+#${MODULE}-objs += dmu_send.o # XXX missing: Snapshot send/receive support
+
+# DSL (Dataset and Snapshot Layer)
+${MODULE}-objs += dsl_dir.o # Namespace and management support
+${MODULE}-objs += dsl_dataset.o # Interfaces snapshot/rollback/clone
+${MODULE}-objs += dsl_pool.o # Pool-level support
+${MODULE}-objs += dsl_prop.o # Property manipulation support
+${MODULE}-objs += dsl_deleg.o # XXX: ? (presumably delegated administration; confirm)
+${MODULE}-objs += dsl_synctask.o# XXX: ? (presumably sync task support; confirm)
+${MODULE}-objs += unique.o # Unique objset ID support
+
+# ZAP (ZFS Attribute Processor)
+${MODULE}-objs += zap.o # Interfaces (fat)
+${MODULE}-objs += zap_leaf.o # Low-level support
+${MODULE}-objs += zap_micro.o # Interfaces (micro)
+
+# ZIL (ZFS Intent Log)
+${MODULE}-objs += zil.o # Intent log
+
+# ARC (Adaptive Replacement Cache)
+${MODULE}-objs += arc.o # Adaptive replacement cache
+
+# SPA (Storage Pool Allocator)
+${MODULE}-objs += spa.o # Open/import/export/destroy support
+${MODULE}-objs += spa_misc.o # Misc support (includes locking)
+${MODULE}-objs += spa_config.o # Parse and update pool config data
+${MODULE}-objs += spa_errlog.o # Log of persistent pool-wide data errors
+${MODULE}-objs += spa_history.o # Command history support
+${MODULE}-objs += spa_boot.o # Bootable root partitions
+${MODULE}-objs += zfs_fm.o # Post reports of FMA consumption support
+
+# ZIO (ZFS I/O Pipeline)
+${MODULE}-objs += zio.o # Interfaces zio core
+${MODULE}-objs += zio_checksum.o# Generic checksum interface
+${MODULE}-objs += fletcher.o # Fletcher 2 and 4 checksum algorithms
+${MODULE}-objs += sha256.o # SHA256 checksum algorithms
+${MODULE}-objs += zio_compress.o# Generic compression interface
+${MODULE}-objs += lzjb.o # LZJB compression algorithm
+${MODULE}-objs += gzip.o # GZIP compression algorithm
+${MODULE}-objs += uberblock.o # Basic uberblock routines
+${MODULE}-objs += bplist.o # Keeps track of list of block pointers
+${MODULE}-objs += metaslab.o # Bulk of DVA translation
+${MODULE}-objs += space_map.o # Keeps track of free space
+${MODULE}-objs += zio_inject.o # Framework for persistent error injection
+
+# VDEV (Virtual Devices)
+${MODULE}-objs += vdev.o # Interfaces vdev code
+${MODULE}-objs += vdev_disk.o # Disk virtual device
+${MODULE}-objs += vdev_file.o # File virtual device
+${MODULE}-objs += vdev_mirror.o # N-Way mirroring
+${MODULE}-objs += vdev_raidz.o # RAID-Z grouping
+${MODULE}-objs += vdev_root.o # Top-level pseudo vdev
+${MODULE}-objs += vdev_missing.o# Special device for import
+${MODULE}-objs += vdev_label.o # Read/write identifying label support
+${MODULE}-objs += vdev_cache.o # Simple device-level caching for reads
+${MODULE}-objs += vdev_queue.o # I/O scheduling algorithm for vdevs
+
+# ZPL (ZFS Posix Layer)
+${MODULE}-objs += zfs_byteswap.o# Byteswap support for ZPL
+${MODULE}-objs += zfs_znode.o # Vnode to znode mapping
+
+# Traversal
+${MODULE}-objs += dmu_traverse.o# Traversal code
+
+# SPL (Solaris Porting Layer)
+# NOTE(review): DISTFILES above lists no spl.c, so it is not clear from
+# this Makefile where spl.o is built from -- confirm.
+${MODULE}-objs += spl.o # Linux kernel glue and misc support
diff --git a/zfs/lib/libzpool/arc.c b/zfs/lib/libzpool/arc.c
new file mode 100644
index 000000000..8d091b7ce
--- /dev/null
+++ b/zfs/lib/libzpool/arc.c
@@ -0,0 +1,4232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)arc.c 1.44 08/03/20 SMI"
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an arc list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()). Note however that the data associated
+ * with the buffer may be evicted prior to the callback. The callback
+ * must be made with *no locks held* (to prevent deadlock). Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <vm/anon.h>
+#include <sys/fs/swapnode.h>
+#include <sys/dnlc.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+
+/* Lock and condition variable associated with the ARC reclaim thread. */
+static kmutex_t arc_reclaim_thr_lock;
+static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
+/* NOTE(review): presumably set to ask the reclaim thread to exit -- confirm */
+static uint8_t arc_thread_exit;
+
+/* Declared here only; these three are defined outside this file. */
+extern int zfs_write_limit_shift;
+extern uint64_t zfs_write_limit_max;
+extern uint64_t zfs_write_limit_inflated;
+
+/* Default for arc_reduce_dnlc_percent, which is a non-static variable. */
+#define ARC_REDUCE_DNLC_PERCENT 3
+uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+
+/* The two reclaim strategies the ARC distinguishes. */
+typedef enum arc_reclaim_strategy {
+ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
+ ARC_RECLAIM_CONS /* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int arc_grow_retry = 60;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_lifespan;
+
+/* NOTE(review): not referenced in this part of the file -- confirm its use */
+static int arc_dead;
+
+/*
+ * These tunables are for performance analysis.
+ */
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+uint64_t zfs_arc_meta_limit = 0;
+
+/*
+ * Note that buffers can be in one of 6 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, they
+ * are linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
+ */
+
+/*
+ * Bookkeeping for one ARC state.  The list and evictable-size arrays
+ * hold one entry per buffer content type (ARC_BUFC_NUMTYPES entries).
+ */
+typedef struct arc_state {
+ list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
+ uint64_t arcs_size; /* total amount of data in this state */
+ kmutex_t arcs_mtx; /* the per-state mutex; see "The locking model" */
+} arc_state_t;
+
+/* The 6 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
+
+/*
+ * Named kstat counters kept by the ARC.  The arc_stats instance below is
+ * initialized positionally, so its entries must stay in exactly the same
+ * order as the fields declared here.
+ */
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_deleted;
+ kstat_named_t arcstat_recycle_miss;
+ kstat_named_t arcstat_mutex_miss;
+ kstat_named_t arcstat_evict_skip;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ kstat_named_t arcstat_size;
+ kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_memory_throttle_count;
+} arc_stats_t;
+
+/*
+ * One { name, data type } pair per arc_stats_t field, in declaration
+ * order.  Each name is the field name without its "arcstat_" prefix, and
+ * every counter is a 64-bit unsigned value.
+ */
+static arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "recycle_miss", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 }
+};
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val));
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
+
+kstat_t *arc_ksp;
+/*
+ * Pointers to the ARC states.  They are not assigned in this part of the
+ * file; presumably they are aimed at the ARC_* instances during ARC
+ * initialization -- confirm.
+ */
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu;
+static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+
+/* Unlike the aliases above, these are ordinary variables, not kstats. */
+static int arc_no_grow; /* Don't try to grow cache size */
+static uint64_t arc_tempreserve;
+static uint64_t arc_meta_used;
+static uint64_t arc_meta_limit;
+static uint64_t arc_meta_max = 0;
+
+/* Forward declaration; struct l2arc_buf_hdr is defined further down. */
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
+
+typedef struct arc_callback arc_callback_t;
+
+/*
+ * One callback record.  Records are singly linked through acb_next;
+ * struct arc_buf_hdr holds a pointer to such a record in its b_acb field.
+ */
+struct arc_callback {
+ void *acb_private;
+ arc_done_func_t *acb_done;
+ arc_byteswap_func_t *acb_byteswap;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+/*
+ * Bundles a buffer with its "ready" and "done" callbacks and a private
+ * pointer.  NOTE(review): presumably used by the ARC write path, which is
+ * not visible in this part of the file -- confirm.
+ */
+struct arc_write_callback {
+ void *awcb_private;
+ arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+/*
+ * Header for one block tracked by the ARC.  The hash table routines
+ * (BUF_HASH_INDEX, BUF_EQUAL) identify a header by its b_spa, b_dva and
+ * b_birth fields.  The section comments below state which lock, if any,
+ * protects each group of fields.
+ */
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+
+ /* next header in the same hash bucket chain (see buf_hash_find()) */
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ /* ARC_* flag bits; tested with the HDR_*() macros */
+ uint32_t b_flags;
+ uint32_t b_datacnt;
+
+ arc_callback_t *b_acb;
+ kcondvar_t b_cv;
+
+ /* immutable */
+ arc_buf_contents_t b_type;
+ uint64_t b_size;
+ spa_t *b_spa;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+
+ /* NOTE(review): presumably L2ARC bookkeeping and list linkage -- confirm */
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
+};
+
+/*
+ * Eviction bookkeeping.  NOTE(review): arc_eviction_mtx presumably guards
+ * arc_eviction_list; the users are not visible in this part of the file.
+ */
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static arc_buf_hdr_t arc_eviction_hdr;
+/* Forward declarations of functions defined later in this file. */
+static void arc_get_data_buf(arc_buf_t *buf);
+static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+static int arc_evict_needed(arc_buf_contents_t type);
+static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
+
+/*
+ * True when state is one of the two ghost states or the L2-only state.
+ */
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
+
+/*
+ * Private ARC flags. These flags are private ARC only flags that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+/* Private flag bits occupy bits 9 through 20 of b_flags. */
+#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
+#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
+#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
+#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
+#define ARC_DONT_L2CACHE (1 << 16) /* originated by prefetch */
+#define ARC_L2_READING (1 << 17) /* L2ARC read in progress */
+#define ARC_L2_WRITING (1 << 18) /* L2ARC write in progress */
+#define ARC_L2_EVICTED (1 << 19) /* evicted during I/O */
+#define ARC_L2_WRITE_HEAD (1 << 20) /* head of write list */
+
+/*
+ * Flag tests.  Each yields the masked bit itself (non-zero when set), not
+ * a normalized 0/1.  ARC_INDIRECT has no test macro here.
+ */
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
+#define HDR_DONT_L2CACHE(hdr) ((hdr)->b_flags & ARC_DONT_L2CACHE)
+#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_L2_READING)
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+
+/*
+ * Other sizes
+ */
+
+/*
+ * Header sizes as signed 64-bit values; hdr_cons() passes HDR_SIZE to
+ * ARCSTAT_INCR as the amount to add.
+ */
+#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD 64
+
+/*
+ * One hash lock.  In kernel builds the mutex is followed by padding that
+ * brings the structure up to HT_LOCK_PAD bytes.
+ * NOTE(review): the pad length is HT_LOCK_PAD - sizeof (kmutex_t), which
+ * only makes sense while kmutex_t is smaller than HT_LOCK_PAD bytes --
+ * confirm for the target platform's kmutex_t.
+ */
+struct ht_lock {
+	kmutex_t ht_lock;
+#ifdef _KERNEL
+	unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+/*
+ * Number of hash locks.  Lock selection masks with (BUF_LOCKS - 1), so
+ * this value must be a power of two.
+ */
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+	uint64_t ht_mask;
+	arc_buf_hdr_t **ht_table;
+	struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+/* Bucket index for the block identified by (spa, dva, birth). */
+#define BUF_HASH_INDEX(spa, dva, birth) \
+	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+/*
+ * Lock entry / lock pointer covering bucket idx.  The argument is
+ * parenthesized so that any expression may be passed as idx.
+ */
+#define BUF_HASH_LOCK_NTRY(idx) \
+	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+/* Hash lock covering the bucket that header buf hashes to. */
+#define HDR_LOCK(buf) \
+	(BUF_HASH_LOCK(BUF_HASH_INDEX((buf)->b_spa, &(buf)->b_dva, \
+	    (buf)->b_birth)))
+
+/* CRC-64 lookup table used by buf_hash(); it is not filled in here. */
+uint64_t zfs_crc64_table[256];
+
+/*
+ * Level 2 ARC
+ */
+
+/* Compile-time defaults for the L2ARC; see the tunables below. */
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_FEED_DELAY 180 /* starting grace */
+#define L2ARC_FEED_SECS 1 /* caching interval */
+
+/* Shorthand for the two L2ARC write counters kept in arc_stats. */
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/*
+ * L2ARC Performance Tunables
+ */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+
+/*
+ * L2ARC Internals
+ */
+/* Per-device L2ARC state; devices are linked together through l2ad_node. */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_write; /* desired write size, bytes */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+} l2arc_dev_t;
+
+/*
+ * NOTE(review): the upper-case names are the list objects and the
+ * lower-case names are pointers; presumably the pointers are aimed at the
+ * objects during L2ARC setup, which is not visible here -- confirm.
+ */
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+/* The read buffer together with the original read's blkptr, bookmark, flags. */
+typedef struct l2arc_read_callback {
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+} l2arc_read_callback_t;
+
+/* The target device and the head of the write buflist for one L2ARC write. */
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+/* L2ARC part of a buffer header; arc_buf_hdr points to it via b_l2hdr. */
+struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ daddr_t b_daddr; /* disk address, offset byte */
+};
+
+/*
+ * One entry of the free-on-write list.
+ * NOTE(review): l2df_func is presumably called with (l2df_data,
+ * l2df_size) to release the data -- confirm against the list consumer.
+ */
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ void *l2df_data;
+ size_t l2df_size;
+ void (*l2df_func)(void *, size_t);
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+/* Lock and condition variable associated with the L2ARC feed thread. */
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+/* Forward declarations of functions defined later in this file. */
+static void l2arc_read_done(zio_t *zio);
+static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_remove(void);
+
+/*
+ * Hash a block identity to a 64-bit value.
+ *
+ * Every byte of the DVA is run through a table-driven CRC-64 (seeded
+ * with all ones, using zfs_crc64_table); the result is then XORed with
+ * the spa pointer shifted right by 8 bits and with the birth value.
+ * Callers mask the result down to a bucket index (see BUF_HASH_INDEX).
+ */
+static uint64_t
+buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+{
+	uint8_t *bytes = (uint8_t *)dva;
+	uint64_t hash = -1ULL;
+	int n = 0;
+
+	/* The table must already hold the CRC-64 polynomial entries. */
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+	while (n < sizeof (dva_t)) {
+		hash = (hash >> 8) ^ zfs_crc64_table[(hash ^ bytes[n]) & 0xFF];
+		n++;
+	}
+
+	return (hash ^ (((uintptr_t)spa >> 8) ^ birth));
+}
+
+/* True when the header has no identity: both DVA words and birth are zero. */
+#define BUF_EMPTY(buf) \
+	((buf)->b_dva.dva_word[0] == 0 && \
+	(buf)->b_dva.dva_word[1] == 0 && \
+	(buf)->b_birth == 0)
+
+/*
+ * True when header buf describes the block (spa, dva, birth).  The whole
+ * expansion and every argument are parenthesized, so the macro can be
+ * negated or combined with other operators without precedence surprises.
+ */
+#define BUF_EQUAL(spa, dva, birth, buf) \
+	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
+
+/*
+ * Look up the header for (spa, dva, birth) in the hash table.
+ *
+ * On a hit the header is returned with its hash lock still held, and
+ * *lockp is set to that lock.  On a miss the lock is dropped, *lockp is
+ * set to NULL and NULL is returned.
+ */
+static arc_buf_hdr_t *
+buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+	uint64_t bucket = BUF_HASH_INDEX(spa, dva, birth);
+	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
+	arc_buf_hdr_t *hdr;
+
+	mutex_enter(bucket_lock);
+
+	/* Walk the bucket chain until a match or the end of the chain. */
+	hdr = buf_hash_table.ht_table[bucket];
+	while (hdr != NULL && !(BUF_EQUAL(spa, dva, birth, hdr)))
+		hdr = hdr->b_hash_next;
+
+	if (hdr == NULL) {
+		mutex_exit(bucket_lock);
+		*lockp = NULL;
+		return (NULL);
+	}
+
+	*lockp = bucket_lock;
+	return (hdr);
+}
+
+/*
+ * Add buf to the hash table unless an equal header is already present.
+ *
+ * Returns NULL when buf was inserted, or the header already in the table
+ * when one matches buf's (spa, dva, birth); in that case buf is left
+ * untouched.  Either way *lockp is set to the bucket's hash lock and the
+ * function returns with that lock held.
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+	uint64_t bucket =
+	    BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
+	arc_buf_hdr_t *cur;
+	uint32_t chain_len = 0;
+
+	ASSERT(!HDR_IN_HASH_TABLE(buf));
+	*lockp = bucket_lock;
+	mutex_enter(bucket_lock);
+
+	/* Look for a duplicate, counting the chain entries walked. */
+	cur = buf_hash_table.ht_table[bucket];
+	while (cur != NULL) {
+		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, cur))
+			return (cur);
+		cur = cur->b_hash_next;
+		chain_len++;
+	}
+
+	/* No duplicate: link buf in at the head of the bucket chain. */
+	buf->b_hash_next = buf_hash_table.ht_table[bucket];
+	buf_hash_table.ht_table[bucket] = buf;
+	buf->b_flags |= ARC_IN_HASH_TABLE;
+
+	/*
+	 * Hash table statistics: a non-empty bucket counts as a collision,
+	 * and a bucket that held exactly one entry has just become a chain.
+	 */
+	if (chain_len > 0) {
+		ARCSTAT_BUMP(arcstat_hash_collisions);
+		if (chain_len == 1) {
+			ARCSTAT_BUMP(arcstat_hash_chains);
+		}
+
+		ARCSTAT_MAX(arcstat_hash_chain_max, chain_len);
+	}
+
+	ARCSTAT_BUMP(arcstat_hash_elements);
+	ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+	return (NULL);
+}
+
+/*
+ * Unlink `buf' from its hash chain.  The hash-chain lock must be held and
+ * the header must currently be in the table (both asserted).
+ */
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+	uint64_t bucket = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	arc_buf_hdr_t **linkp;
+	arc_buf_hdr_t *head;
+
+	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(bucket)));
+	ASSERT(HDR_IN_HASH_TABLE(buf));
+
+	/* find the link that points at buf */
+	for (linkp = &buf_hash_table.ht_table[bucket]; *linkp != buf;
+	    linkp = &(*linkp)->b_hash_next) {
+		ASSERT(*linkp != NULL);
+	}
+
+	*linkp = buf->b_hash_next;
+	buf->b_hash_next = NULL;
+	buf->b_flags &= ~ARC_IN_HASH_TABLE;
+
+	/* collect some hash table performance data */
+	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+	/* exactly one header left in the bucket: it is no longer a chain */
+	head = buf_hash_table.ht_table[bucket];
+	if (head != NULL && head->b_hash_next == NULL)
+		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ * Both caches are created in buf_init() and destroyed in buf_fini().
+ */
+static kmem_cache_t *hdr_cache; /* objects of sizeof (arc_buf_hdr_t) */
+static kmem_cache_t *buf_cache; /* objects of sizeof (arc_buf_t) */
+
+/*
+ * Release the hash table array, destroy every hash-chain lock and
+ * destroy both kmem caches.
+ */
+static void
+buf_fini(void)
+{
+	int n;
+
+	kmem_free(buf_hash_table.ht_table,
+	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+
+	n = 0;
+	while (n < BUF_LOCKS) {
+		mutex_destroy(&buf_hash_table.ht_locks[n].ht_lock);
+		n++;
+	}
+
+	kmem_cache_destroy(hdr_cache);
+	kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * kmem cache constructor for arc_buf_hdr_t objects: zero the header,
+ * initialize its refcount, condition variable and freeze lock, and
+ * account HDR_SIZE bytes in arcstat_hdr_size.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+	arc_buf_hdr_t *hdr = vbuf;
+
+	bzero(hdr, sizeof (*hdr));
+
+	refcount_create(&hdr->b_refcnt);
+	cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for arc_buf_hdr_t objects: tear down what
+ * hdr_cons() initialized and subtract HDR_SIZE from arcstat_hdr_size.
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+	arc_buf_hdr_t *hdr = vbuf;
+
+	refcount_destroy(&hdr->b_refcnt);
+	cv_destroy(&hdr->b_cv);
+	mutex_destroy(&hdr->b_freeze_lock);
+
+	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+/*
+ * kmem cache reclaim callback -- invoked when memory is low.  Signals
+ * arc_reclaim_thr_cv unless the ARC has already been shut down.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+	dprintf("hdr_recl called\n");
+
+	/*
+	 * umem calls the reclaim func when we destroy the buf cache,
+	 * which is after we do arc_fini().
+	 */
+	if (arc_dead)
+		return;
+
+	cv_signal(&arc_reclaim_thr_cv);
+}
+
+/*
+ * Set up the ARC hash table: allocate the bucket array, create the
+ * header and buf kmem caches, fill in the CRC64 lookup table read by
+ * buf_hash(), and initialize the hash-chain locks.
+ */
+static void
+buf_init(void)
+{
+	uint64_t nbuckets = 1ULL << 12;
+	uint64_t *entry;
+	int n, bit;
+
+	/*
+	 * The hash table is big enough to fill all of physical memory
+	 * with an average 64K block size.  The table will take up
+	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
+	 */
+	while (nbuckets * 65536 < physmem * PAGESIZE)
+		nbuckets <<= 1;
+
+	/* halve the table size until the no-sleep allocation succeeds */
+	for (;;) {
+		buf_hash_table.ht_mask = nbuckets - 1;
+		buf_hash_table.ht_table =
+		    kmem_zalloc(nbuckets * sizeof (void*), KM_NOSLEEP);
+		if (buf_hash_table.ht_table != NULL)
+			break;
+		ASSERT(nbuckets > (1ULL << 8));
+		nbuckets >>= 1;
+	}
+
+	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	/* each table entry is its index run through eight CRC steps */
+	for (n = 0; n < 256; n++) {
+		entry = &zfs_crc64_table[n];
+		*entry = n;
+		for (bit = 0; bit < 8; bit++) {
+			*entry = (*entry >> 1) ^
+			    (-(*entry & 1) & ZFS_CRC64_POLY);
+		}
+	}
+
+	for (n = 0; n < BUF_LOCKS; n++) {
+		mutex_init(&buf_hash_table.ht_locks[n].ht_lock,
+		    NULL, MUTEX_DEFAULT, NULL);
+	}
+}
+
+/* One sixteenth of hz, i.e. 1/16 of a second expressed in clock ticks. */
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+/*
+ * Debug check (only active with ZFS_DEBUG_MODIFY): recompute the
+ * fletcher-2 checksum of the buffer's data and panic if it differs from
+ * the stored freeze checksum.  Nothing is checked when no freeze checksum
+ * is recorded or the header is flagged ARC_IO_ERROR.
+ */
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr;
+	zio_cksum_t actual;
+
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+		return;
+
+	hdr = buf->b_hdr;
+
+	mutex_enter(&hdr->b_freeze_lock);
+	if (hdr->b_freeze_cksum != NULL && !(hdr->b_flags & ARC_IO_ERROR)) {
+		fletcher_2_native(buf->b_data, hdr->b_size, &actual);
+		if (!ZIO_CHECKSUM_EQUAL(*hdr->b_freeze_cksum, actual))
+			panic("buffer modified while frozen!");
+	}
+	mutex_exit(&hdr->b_freeze_lock);
+}
+
+/*
+ * Return non-zero when the fletcher-2 checksum of the buffer's current
+ * data matches the stored freeze checksum.  The freeze checksum pointer
+ * is dereferenced without a NULL check, so it must already be set.
+ */
+static int
+arc_cksum_equal(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	zio_cksum_t actual;
+	int match;
+
+	mutex_enter(&hdr->b_freeze_lock);
+	fletcher_2_native(buf->b_data, hdr->b_size, &actual);
+	match = ZIO_CHECKSUM_EQUAL(*hdr->b_freeze_cksum, actual);
+	mutex_exit(&hdr->b_freeze_lock);
+
+	return (match);
+}
+
+/*
+ * Record a freeze checksum for the buffer if none is stored yet.  Does
+ * nothing unless `force' is set or ZFS_DEBUG_MODIFY is enabled.
+ */
+static void
+arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+{
+	arc_buf_hdr_t *hdr;
+
+	if (!(force || (zfs_flags & ZFS_DEBUG_MODIFY)))
+		return;
+
+	hdr = buf->b_hdr;
+
+	mutex_enter(&hdr->b_freeze_lock);
+	if (hdr->b_freeze_cksum == NULL) {
+		hdr->b_freeze_cksum =
+		    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+		fletcher_2_native(buf->b_data, hdr->b_size,
+		    hdr->b_freeze_cksum);
+	}
+	mutex_exit(&hdr->b_freeze_lock);
+}
+
+/*
+ * Discard the buffer's freeze checksum so its data may be modified.
+ * With ZFS_DEBUG_MODIFY enabled this first panics if the buffer is not
+ * anonymous or has I/O in progress, and verifies the existing checksum.
+ */
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	if (zfs_flags & ZFS_DEBUG_MODIFY) {
+		if (hdr->b_state != arc_anon)
+			panic("modifying non-anon buffer!");
+		if (hdr->b_flags & ARC_IO_IN_PROGRESS)
+			panic("modifying buffer while i/o in progress!");
+		arc_cksum_verify(buf);
+	}
+
+	mutex_enter(&hdr->b_freeze_lock);
+	if (hdr->b_freeze_cksum != NULL) {
+		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+		hdr->b_freeze_cksum = NULL;
+	}
+	mutex_exit(&hdr->b_freeze_lock);
+}
+
+/*
+ * With ZFS_DEBUG_MODIFY enabled, record a freeze checksum for the buffer
+ * (if it does not have one yet); otherwise do nothing.
+ */
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+	if (zfs_flags & ZFS_DEBUG_MODIFY) {
+		ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
+		    buf->b_hdr->b_state == arc_anon);
+		arc_cksum_compute(buf, B_FALSE);
+	}
+}
+
+/*
+ * Take a reference on header `ab' on behalf of `tag'; the hash lock must
+ * be held.  When this is the first reference on a non-anonymous header,
+ * the header is taken off its state's list, the list size counter is
+ * reduced, and the prefetch flag is cleared.
+ */
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+	arc_state_t *state;
+	uint64_t *lsize;
+	uint64_t bytes;
+
+	ASSERT(MUTEX_HELD(hash_lock));
+
+	if (refcount_add(&ab->b_refcnt, tag) != 1)
+		return;
+	if (ab->b_state == arc_anon)
+		return;
+
+	state = ab->b_state;
+	lsize = &state->arcs_lsize[ab->b_type];
+
+	ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+	mutex_enter(&state->arcs_mtx);
+	ASSERT(list_link_active(&ab->b_arc_node));
+	list_remove(&state->arcs_list[ab->b_type], ab);
+	if (GHOST_STATE(state)) {
+		/* a ghost header holds no data; it is accounted by b_size */
+		ASSERT3U(ab->b_datacnt, ==, 0);
+		ASSERT3P(ab->b_buf, ==, NULL);
+		bytes = ab->b_size;
+	} else {
+		bytes = ab->b_size * ab->b_datacnt;
+	}
+	ASSERT(bytes > 0);
+	ASSERT3U(*lsize, >=, bytes);
+	atomic_add_64(lsize, -bytes);
+	mutex_exit(&state->arcs_mtx);
+
+	/* remove the prefetch flag if we get a reference */
+	if (ab->b_flags & ARC_PREFETCH)
+		ab->b_flags &= ~ARC_PREFETCH;
+}
+
+/*
+ * Drop the reference held by `tag' on header `ab' and return the number
+ * of references left.  When the last reference on a non-anonymous header
+ * goes away, the header is put at the head of its state's list and the
+ * list size counter grows by b_size * b_datacnt.
+ */
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+	arc_state_t *state = ab->b_state;
+	int remaining;
+
+	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+	ASSERT(!GHOST_STATE(state));
+
+	remaining = refcount_remove(&ab->b_refcnt, tag);
+	if (remaining == 0 && state != arc_anon) {
+		uint64_t *lsize = &state->arcs_lsize[ab->b_type];
+
+		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+		mutex_enter(&state->arcs_mtx);
+		ASSERT(!list_link_active(&ab->b_arc_node));
+		list_insert_head(&state->arcs_list[ab->b_type], ab);
+		ASSERT(ab->b_datacnt > 0);
+		atomic_add_64(lsize, ab->b_size * ab->b_datacnt);
+		mutex_exit(&state->arcs_mtx);
+	}
+
+	return (remaining);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The mutex
+ * for the buffer must be held by the caller.
+ *
+ * If the header has no references it is also moved between the per-type
+ * lists of the old and new states (arc_anon gets no list handling here)
+ * and the matching arcs_lsize counters are adjusted.  The arcs_size
+ * counters of both states are adjusted in every case, and moving to
+ * arc_anon removes the header from the hash table.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+{
+ arc_state_t *old_state = ab->b_state;
+ int64_t refcnt = refcount_count(&ab->b_refcnt);
+ uint64_t from_delta, to_delta;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(new_state != old_state);
+ ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+ /* default accounting: the bytes of data currently attached */
+ from_delta = to_delta = ab->b_datacnt * ab->b_size;
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon) {
+ /* only take the list mutex if this thread does not hold it */
+ int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+
+ if (use_mutex)
+ mutex_enter(&old_state->arcs_mtx);
+
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&old_state->arcs_list[ab->b_type], ab);
+
+ /*
+ * If prefetching out of the ghost cache,
+ * we will have a non-null datacnt.
+ */
+ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+ /* ghost elements have a ghost size */
+ ASSERT(ab->b_buf == NULL);
+ from_delta = ab->b_size;
+ }
+ ASSERT3U(*size, >=, from_delta);
+ atomic_add_64(size, -from_delta);
+
+ if (use_mutex)
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ if (new_state != arc_anon) {
+ /* only take the list mutex if this thread does not hold it */
+ int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+
+ if (use_mutex)
+ mutex_enter(&new_state->arcs_mtx);
+
+ list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(new_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ to_delta = ab->b_size;
+ }
+ atomic_add_64(size, to_delta);
+
+ if (use_mutex)
+ mutex_exit(&new_state->arcs_mtx);
+ }
+ }
+
+ ASSERT(!BUF_EMPTY(ab));
+ /* anonymous headers are not kept in the hash table */
+ if (new_state == arc_anon) {
+ buf_hash_remove(ab);
+ }
+
+ /* adjust state sizes */
+ if (to_delta)
+ atomic_add_64(&new_state->arcs_size, to_delta);
+ if (from_delta) {
+ ASSERT3U(old_state->arcs_size, >=, from_delta);
+ atomic_add_64(&old_state->arcs_size, -from_delta);
+ }
+ ab->b_state = new_state;
+
+ /* adjust l2arc hdr stats */
+ if (new_state == arc_l2c_only)
+ l2arc_hdr_stat_add();
+ else if (old_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+}
+
+/*
+ * Account `space' bytes as in use: added to both arc_meta_used and
+ * arc_size.
+ */
+void
+arc_space_consume(uint64_t space)
+{
+ atomic_add_64(&arc_meta_used, space);
+ atomic_add_64(&arc_size, space);
+}
+
+/*
+ * Undo arc_space_consume(): subtract `space' bytes from arc_meta_used and
+ * arc_size.  Before subtracting, arc_meta_max is raised to the current
+ * arc_meta_used if that is larger.
+ */
+void
+arc_space_return(uint64_t space)
+{
+ ASSERT(arc_meta_used >= space);
+ /* remember the high-water mark before it is lowered */
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ atomic_add_64(&arc_meta_used, -space);
+ ASSERT(arc_size >= space);
+ atomic_add_64(&arc_size, -space);
+}
+
+/*
+ * Allocate a data buffer of `size' bytes via zio_data_buf_alloc() and
+ * add `size' to arc_size.  If arc_evict_needed() reports that data
+ * eviction is needed, arc_reclaim_thr_cv is signalled first.
+ */
+void *
+arc_data_buf_alloc(uint64_t size)
+{
+ if (arc_evict_needed(ARC_BUFC_DATA))
+ cv_signal(&arc_reclaim_thr_cv);
+ atomic_add_64(&arc_size, size);
+ return (zio_data_buf_alloc(size));
+}
+
+/*
+ * Free a data buffer of `size' bytes via zio_data_buf_free() and
+ * subtract `size' from arc_size.
+ */
+void
+arc_data_buf_free(void *buf, uint64_t size)
+{
+ zio_data_buf_free(buf, size);
+ ASSERT(arc_size >= size);
+ atomic_add_64(&arc_size, -size);
+}
+
+/*
+ * Allocate a new anonymous buffer of `size' bytes and content `type' for
+ * `spa'.  A fresh header and arc_buf_t are taken from their kmem caches,
+ * data is attached with arc_get_data_buf(), and one reference is taken on
+ * behalf of `tag'.  Returns the new arc_buf_t.
+ */
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ /* a header fresh from the cache must carry no identity */
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = spa;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ hdr->b_datacnt = 1;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ (void) refcount_add(&hdr->b_refcnt, tag);
+
+ return (buf);
+}
+
+/*
+ * Create another arc_buf_t on the same header as `from', holding its own
+ * copy of the data.  The new buf is linked at the front of the header's
+ * buf list and the header's data count is incremented.
+ */
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
+{
+	arc_buf_hdr_t *hdr = from->b_hdr;
+	uint64_t nbytes = hdr->b_size;
+	arc_buf_t *copy;
+
+	copy = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+	copy->b_hdr = hdr;
+	copy->b_data = NULL;
+	copy->b_efunc = NULL;
+	copy->b_private = NULL;
+
+	/* push onto the header's buf list */
+	copy->b_next = hdr->b_buf;
+	hdr->b_buf = copy;
+
+	arc_get_data_buf(copy);
+	bcopy(from->b_data, copy->b_data, nbytes);
+	hdr->b_datacnt += 1;
+
+	return (copy);
+}
+
+/*
+ * Take an additional reference on `buf' for `tag' and count it as a cache
+ * hit.  Returns without doing anything when the buf has no header any
+ * more or its data is gone.
+ */
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_lock = NULL;
+
+	/*
+	 * Check to see if this buffer is currently being evicted via
+	 * arc_do_user_evicts().
+	 */
+	mutex_enter(&arc_eviction_mtx);
+	hdr = buf->b_hdr;
+	if (hdr != NULL)
+		hash_lock = HDR_LOCK(hdr);
+	mutex_exit(&arc_eviction_mtx);
+
+	if (hdr == NULL)
+		return;
+
+	mutex_enter(hash_lock);
+	if (buf->b_data == NULL) {
+		/*
+		 * This buffer is evicted.
+		 */
+		mutex_exit(hash_lock);
+		return;
+	}
+
+	ASSERT(buf->b_hdr == hdr);
+	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+	add_reference(hdr, hash_lock, tag);
+	arc_access(hdr, hash_lock);
+	mutex_exit(hash_lock);
+
+	ARCSTAT_BUMP(arcstat_hits);
+	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+	    data, metadata, hits);
+}
+
+/*
+ * Free an arc data buffer with `free_func'.  If the header has an l2arc
+ * write in progress, the free is deferred instead: a record of the data,
+ * size and free function is queued on l2arc_free_on_write.
+ */
+static void
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
+    void *data, size_t size)
+{
+	l2arc_data_free_t *deferred;
+
+	if (!HDR_L2_WRITING(hdr)) {
+		free_func(data, size);
+		return;
+	}
+
+	deferred = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+	deferred->l2df_data = data;
+	deferred->l2df_size = size;
+	deferred->l2df_func = free_func;
+
+	mutex_enter(&l2arc_free_on_write_mtx);
+	list_insert_head(l2arc_free_on_write, deferred);
+	mutex_exit(&l2arc_free_on_write_mtx);
+
+	ARCSTAT_BUMP(arcstat_l2_free_on_write);
+}
+
+/*
+ * Release the data attached to `buf' and, if `all' is set, also unlink
+ * the arc_buf_t from its header's buf list and free it.
+ *
+ * recycle - when set, the data block itself is not freed here and the
+ *           global arc_size / meta accounting is left alone; the owning
+ *           state's size counters are still reduced.
+ * all     - when clear, return after the data handling and leave the
+ *           arc_buf_t linked to its header.
+ */
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+{
+ arc_buf_t **bufp;
+
+ /* free up data associated with the buf */
+ if (buf->b_data) {
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_cksum_verify(buf);
+ if (!recycle) {
+ if (type == ARC_BUFC_METADATA) {
+ arc_buf_data_free(buf->b_hdr, zio_buf_free,
+ buf->b_data, size);
+ arc_space_return(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_buf_data_free(buf->b_hdr,
+ zio_data_buf_free, buf->b_data, size);
+ atomic_add_64(&arc_size, -size);
+ }
+ }
+ /* header is on a state list: its bytes count in arcs_lsize too */
+ if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ uint64_t *cnt = &state->arcs_lsize[type];
+
+ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(state != arc_anon);
+
+ ASSERT3U(*cnt, >=, size);
+ atomic_add_64(cnt, -size);
+ }
+ ASSERT3U(state->arcs_size, >=, size);
+ atomic_add_64(&state->arcs_size, -size);
+ buf->b_data = NULL;
+ ASSERT(buf->b_hdr->b_datacnt > 0);
+ buf->b_hdr->b_datacnt -= 1;
+ }
+
+ /* only remove the buf if requested */
+ if (!all)
+ return;
+
+ /* remove the buf from the hdr list */
+ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ continue;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_efunc == NULL);
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+/*
+ * Free a header.  The header must be unreferenced, anonymous and have no
+ * I/O in progress (all asserted).  Its L2ARC header, if any, is unlinked
+ * and freed; its identity is cleared; every attached buf is disposed of;
+ * the freeze checksum is freed; and the header goes back to hdr_cache.
+ * Bufs that have an eviction callback are not freed here but queued on
+ * arc_eviction_list.
+ */
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ if (hdr->b_l2hdr != NULL) {
+ if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ ASSERT(hdr->b_l2hdr != NULL);
+
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ mutex_exit(&l2arc_buflist_mtx);
+ } else {
+ /* this thread already holds l2arc_buflist_mtx */
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ }
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
+ /*
+ * NOTE(review): b_state was asserted to be arc_anon above, so
+ * this branch looks unreachable whenever that assertion holds.
+ */
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
+ /* clear the identity so BUF_EMPTY() holds for the freed header */
+ if (!BUF_EMPTY(hdr)) {
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+ while (hdr->b_buf) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ if (buf->b_efunc) {
+ /*
+ * Free only the data, then hand the buf over to
+ * arc_eviction_list under arc_eviction_mtx.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
+ hdr->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+ }
+ }
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ ASSERT3P(hdr->b_acb, ==, NULL);
+ kmem_cache_free(hdr_cache, hdr);
+}
+
+/*
+ * Drop the reference `tag' holds on `buf' and release the buf.  The buf
+ * must have data and no eviction callback (asserted).
+ *
+ * Three cases, chosen by the header's state on entry:
+ *  - not anonymous: done under the hash lock; the buf is destroyed only
+ *    when the header has more than one data copy, otherwise the header is
+ *    flagged ARC_BUF_AVAILABLE.
+ *  - anonymous with I/O in progress: the header is destroyed only if the
+ *    I/O has finished by the time the reference is gone.
+ *  - anonymous, no I/O: the buf is destroyed if references remain,
+ *    otherwise the whole header is destroyed.
+ */
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	ASSERT(buf->b_efunc == NULL);
+	ASSERT(buf->b_data != NULL);
+
+	if (hdr->b_state != arc_anon) {
+		kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+		mutex_enter(hash_lock);
+		(void) remove_reference(hdr, hash_lock, tag);
+		if (hdr->b_datacnt > 1)
+			arc_buf_destroy(buf, FALSE, TRUE);
+		else
+			hdr->b_flags |= ARC_BUF_AVAILABLE;
+		mutex_exit(hash_lock);
+		return;
+	}
+
+	if (HDR_IO_IN_PROGRESS(hdr)) {
+		int io_done;
+
+		/*
+		 * We are in the middle of an async write.  Don't destroy
+		 * this buffer unless the write completes before we finish
+		 * decrementing the reference count.
+		 */
+		mutex_enter(&arc_eviction_mtx);
+		(void) remove_reference(hdr, NULL, tag);
+		ASSERT(refcount_is_zero(&hdr->b_refcnt));
+		io_done = !HDR_IO_IN_PROGRESS(hdr);
+		mutex_exit(&arc_eviction_mtx);
+		if (io_done)
+			arc_hdr_destroy(hdr);
+		return;
+	}
+
+	if (remove_reference(hdr, NULL, tag) > 0) {
+		ASSERT(HDR_IO_ERROR(hdr));
+		arc_buf_destroy(buf, FALSE, TRUE);
+	} else {
+		arc_hdr_destroy(hdr);
+	}
+}
+
+/*
+ * Drop the reference `tag' holds on `buf'.  Anonymous buffers are handed
+ * to arc_buf_free().  For other buffers the arc_buf_t is only destroyed,
+ * or the header flagged ARC_BUF_AVAILABLE, when the buf has no eviction
+ * callback.  Returns non-zero when the buf had no eviction callback on
+ * entry.
+ */
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+	int no_callback = (buf->b_efunc == NULL);
+
+	if (hdr->b_state == arc_anon) {
+		arc_buf_free(buf, tag);
+		return (no_callback);
+	}
+
+	mutex_enter(hash_lock);
+	ASSERT(hdr->b_state != arc_anon);
+	ASSERT(buf->b_data != NULL);
+
+	(void) remove_reference(hdr, hash_lock, tag);
+	if (no_callback) {
+		if (hdr->b_datacnt > 1) {
+			arc_buf_destroy(buf, FALSE, TRUE);
+		} else {
+			ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+			hdr->b_flags |= ARC_BUF_AVAILABLE;
+		}
+	}
+	ASSERT(no_callback || hdr->b_datacnt > 1 ||
+	    refcount_is_zero(&hdr->b_refcnt));
+	mutex_exit(hash_lock);
+
+	return (no_callback);
+}
+
+/*
+ * Return the size recorded in the buffer's header (b_size).
+ */
+int
+arc_buf_size(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict buffers from list until we've removed the specified number of
+ * bytes.  Move the removed buffers to the appropriate evict state.
+ * If the recycle flag is set, then attempt to "recycle" a buffer:
+ * - look for a buffer to evict that is `bytes' long.
+ * - return the data block from this buffer rather than freeing it.
+ * This flag is used by callers that are trying to make space for a
+ * new buffer in a full arc cache.
+ *
+ * This function makes a "best effort".  It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * state   - arc_mru or arc_mfu (asserted); evicted headers move to the
+ *           matching ghost state.
+ * spa     - when non-NULL, only headers belonging to this spa are evicted.
+ * bytes   - eviction target; a negative value means "no limit".
+ * recycle - see above.
+ * type    - which of the state's lists to walk.
+ *
+ * Returns the recycled data block, or NULL if none was taken.
+ */
+static void *
+arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
+    arc_buf_contents_t type)
+{
+	arc_state_t *evicted_state;
+	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+	arc_buf_hdr_t *ab, *ab_prev = NULL;
+	list_t *list = &state->arcs_list[type];
+	kmutex_t *hash_lock;
+	boolean_t have_lock;
+	void *stolen = NULL;
+
+	ASSERT(state == arc_mru || state == arc_mfu);
+
+	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+	mutex_enter(&state->arcs_mtx);
+	mutex_enter(&evicted_state->arcs_mtx);
+
+	/* walk from the tail of the list towards the head */
+	for (ab = list_tail(list); ab; ab = ab_prev) {
+		ab_prev = list_prev(list, ab);
+		/* prefetch buffers have a minimum lifespan */
+		if (HDR_IO_IN_PROGRESS(ab) ||
+		    (spa && ab->b_spa != spa) ||
+		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
+		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+			skipped++;
+			continue;
+		}
+		/* "lookahead" for better eviction candidate */
+		if (recycle && ab->b_size != bytes &&
+		    ab_prev && ab_prev->b_size == bytes)
+			continue;
+		hash_lock = HDR_LOCK(ab);
+		have_lock = MUTEX_HELD(hash_lock);
+		if (have_lock || mutex_tryenter(hash_lock)) {
+			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+			ASSERT(ab->b_datacnt > 0);
+			while (ab->b_buf) {
+				arc_buf_t *buf = ab->b_buf;
+				if (buf->b_data) {
+					bytes_evicted += ab->b_size;
+					if (recycle && ab->b_type == type &&
+					    ab->b_size == bytes &&
+					    !HDR_L2_WRITING(ab)) {
+						stolen = buf->b_data;
+						recycle = FALSE;
+					}
+				}
+				if (buf->b_efunc) {
+					/*
+					 * Bufs with an eviction callback are
+					 * queued on arc_eviction_list rather
+					 * than freed here.
+					 */
+					mutex_enter(&arc_eviction_mtx);
+					arc_buf_destroy(buf,
+					    buf->b_data == stolen, FALSE);
+					ab->b_buf = buf->b_next;
+					buf->b_hdr = &arc_eviction_hdr;
+					buf->b_next = arc_eviction_list;
+					arc_eviction_list = buf;
+					mutex_exit(&arc_eviction_mtx);
+				} else {
+					arc_buf_destroy(buf,
+					    buf->b_data == stolen, TRUE);
+				}
+			}
+			ASSERT(ab->b_datacnt == 0);
+			arc_change_state(evicted_state, ab, hash_lock);
+			ASSERT(HDR_IN_HASH_TABLE(ab));
+			ab->b_flags |= ARC_IN_HASH_TABLE;
+			ab->b_flags &= ~ARC_BUF_AVAILABLE;
+			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+			if (!have_lock)
+				mutex_exit(hash_lock);
+			if (bytes >= 0 && bytes_evicted >= bytes)
+				break;
+		} else {
+			missed += 1;
+		}
+	}
+
+	mutex_exit(&evicted_state->arcs_mtx);
+	mutex_exit(&state->arcs_mtx);
+
+	/*
+	 * Only report a shortfall when a real target was given: a negative
+	 * `bytes' means "no limit" and would otherwise compare as a huge
+	 * unsigned value.  The state pointer is printed with %p.
+	 */
+	if (bytes >= 0 && bytes_evicted < (uint64_t)bytes)
+		dprintf("only evicted %lld bytes from %p",
+		    (longlong_t)bytes_evicted, (void *)state);
+
+	if (skipped)
+		ARCSTAT_INCR(arcstat_evict_skip, skipped);
+
+	if (missed)
+		ARCSTAT_INCR(arcstat_mutex_miss, missed);
+
+	/*
+	 * We have just evicted some data into the ghost state, make
+	 * sure we also adjust the ghost state size if necessary.
+	 */
+	if (arc_no_grow &&
+	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
+		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
+		    arc_mru_ghost->arcs_size - arc_c;
+
+		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+			int64_t todelete =
+			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
+			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
+			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
+			    arc_mru_ghost->arcs_size +
+			    arc_mfu_ghost->arcs_size - arc_c);
+			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+		}
+	}
+
+	return (stolen);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes.  Destroy the buffers that are removed.
+ *
+ * state - a ghost state (asserted).
+ * spa   - when non-NULL, only headers belonging to this spa are removed.
+ * bytes - removal target; a negative value means "no limit".
+ *
+ * The data list is walked first, then the metadata list if the target
+ * has not been met.  Headers that still have an L2ARC header are moved to
+ * arc_l2c_only instead of being destroyed.
+ */
+static void
+arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
+{
+	arc_buf_hdr_t *ab, *ab_prev;
+	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+	kmutex_t *hash_lock;
+	uint64_t bytes_deleted = 0;
+	uint64_t bufs_skipped = 0;
+
+	ASSERT(GHOST_STATE(state));
+top:
+	mutex_enter(&state->arcs_mtx);
+	for (ab = list_tail(list); ab; ab = ab_prev) {
+		ab_prev = list_prev(list, ab);
+		if (spa && ab->b_spa != spa)
+			continue;
+		hash_lock = HDR_LOCK(ab);
+		if (mutex_tryenter(hash_lock)) {
+			ASSERT(!HDR_IO_IN_PROGRESS(ab));
+			ASSERT(ab->b_buf == NULL);
+			ARCSTAT_BUMP(arcstat_deleted);
+			bytes_deleted += ab->b_size;
+
+			if (ab->b_l2hdr != NULL) {
+				/*
+				 * This buffer is cached on the 2nd Level ARC;
+				 * don't destroy the header.
+				 */
+				arc_change_state(arc_l2c_only, ab, hash_lock);
+				mutex_exit(hash_lock);
+			} else {
+				arc_change_state(arc_anon, ab, hash_lock);
+				mutex_exit(hash_lock);
+				arc_hdr_destroy(ab);
+			}
+
+			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+			if (bytes >= 0 && bytes_deleted >= bytes)
+				break;
+		} else {
+			if (bytes < 0) {
+				/*
+				 * No limit was given, so nothing may be
+				 * skipped: wait for the hash lock holder to
+				 * finish, then rescan the list from the top.
+				 */
+				mutex_exit(&state->arcs_mtx);
+				mutex_enter(hash_lock);
+				mutex_exit(hash_lock);
+				goto top;
+			}
+			bufs_skipped += 1;
+		}
+	}
+	mutex_exit(&state->arcs_mtx);
+
+	/* data list done; continue with metadata if more is wanted */
+	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+	    (bytes < 0 || bytes_deleted < bytes)) {
+		list = &state->arcs_list[ARC_BUFC_METADATA];
+		goto top;
+	}
+
+	if (bufs_skipped) {
+		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
+		ASSERT(bytes >= 0);
+	}
+
+	/*
+	 * Only report a shortfall when a real target was given: a negative
+	 * `bytes' means "no limit" and would otherwise compare as a huge
+	 * unsigned value.
+	 */
+	if (bytes >= 0 && bytes_deleted < (uint64_t)bytes)
+		dprintf("only deleted %lld bytes from %p",
+		    (longlong_t)bytes_deleted, state);
+}
+
+/*
+ * Bring the cache back towards its targets by evicting:
+ *  1. MRU data, then MRU metadata, while the "top" size exceeds arc_p;
+ *  2. MRU ghost headers, while top size plus the MRU ghost size exceeds
+ *     arc_c;
+ *  3. if arc_size exceeds arc_c: MFU data, then MFU metadata, and finally
+ *     MFU ghost headers while arc_size plus both ghost sizes exceeds
+ *     twice arc_c.
+ */
+static void
+arc_adjust(void)
+{
+ int64_t top_sz, mru_over, arc_over, todelete;
+
+ /*
+ * NOTE(review): the initial value includes arc_meta_used, but the
+ * recomputations after each eviction below do not -- confirm that
+ * this difference is intended.
+ */
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
+ (void) arc_evict(arc_mru, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
+
+ /* how far anon + MRU + MRU ghost exceed the cache target */
+ mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0) {
+ if (arc_mru_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ }
+ }
+
+ if ((arc_over = arc_size - arc_c) > 0) {
+ int64_t tbl_over;
+
+ if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_DATA);
+ /* re-read: the eviction above may have shrunk arc_size */
+ arc_over = arc_size - arc_c;
+ }
+
+ if (arc_over > 0 &&
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t toevict =
+ MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
+ arc_over);
+ (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
+ ARC_BUFC_METADATA);
+ }
+
+ /* cached plus ghost bytes beyond twice the cache target */
+ tbl_over = arc_size + arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c * 2;
+
+ if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
+ todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ }
+ }
+}
+
+/*
+ * Drain arc_eviction_list.  Each buf is unlinked under arc_eviction_mtx;
+ * the mutex is then dropped while the buf's eviction callback (if any)
+ * runs and the buf is returned to buf_cache.
+ */
+static void
+arc_do_user_evicts(void)
+{
+	arc_buf_t *buf;
+
+	mutex_enter(&arc_eviction_mtx);
+	while ((buf = arc_eviction_list) != NULL) {
+		arc_eviction_list = buf->b_next;
+		buf->b_hdr = NULL;
+		mutex_exit(&arc_eviction_mtx);
+
+		/* the callback must succeed */
+		if (buf->b_efunc != NULL)
+			VERIFY(buf->b_efunc(buf) == 0);
+
+		buf->b_efunc = NULL;
+		buf->b_private = NULL;
+		kmem_cache_free(buf_cache, buf);
+
+		mutex_enter(&arc_eviction_mtx);
+	}
+	mutex_exit(&arc_eviction_mtx);
+}
+
+/*
+ * Flush all *evictable* data from the cache for the given spa.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ *
+ * With a NULL spa every list is evicted repeatedly until it is empty;
+ * with a specific spa each list gets a single eviction pass.
+ */
+void
+arc_flush(spa_t *spa)
+{
+	arc_state_t *states[2];
+	arc_buf_contents_t types[2];
+	int s, t;
+
+	/* same order as always: MRU data, MRU meta, MFU data, MFU meta */
+	states[0] = arc_mru;
+	states[1] = arc_mfu;
+	types[0] = ARC_BUFC_DATA;
+	types[1] = ARC_BUFC_METADATA;
+
+	for (s = 0; s < 2; s++) {
+		for (t = 0; t < 2; t++) {
+			while (list_head(&states[s]->arcs_list[types[t]])) {
+				(void) arc_evict(states[s], spa, -1, FALSE,
+				    types[t]);
+				if (spa)
+					break;
+			}
+		}
+	}
+
+	arc_evict_ghost(arc_mru_ghost, spa, -1);
+	arc_evict_ghost(arc_mfu_ghost, spa, -1);
+
+	mutex_enter(&arc_reclaim_thr_lock);
+	arc_do_user_evicts();
+	mutex_exit(&arc_reclaim_thr_lock);
+	ASSERT(spa || arc_eviction_list == NULL);
+}
+
+/* arc_shrink() reduces arc_c and arc_p by 1/2^arc_shrink_shift of their value */
+int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
+
+/*
+ * Lower the cache target arc_c (never below arc_c_min) and the MRU target
+ * arc_p, then call arc_adjust() if the cache is now larger than arc_c.
+ */
+void
+arc_shrink(void)
+{
+ if (arc_c > arc_c_min) {
+ uint64_t to_free;
+
+#ifdef _KERNEL
+ /* in the kernel, free at least the bytes behind `needfree' pages */
+ to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
+#else
+ to_free = arc_c >> arc_shrink_shift;
+#endif
+ if (arc_c > arc_c_min + to_free)
+ atomic_add_64(&arc_c, -to_free);
+ else
+ arc_c = arc_c_min;
+
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ /* do not leave the target above what is actually cached */
+ if (arc_c > arc_size)
+ arc_c = MAX(arc_size, arc_c_min);
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ }
+
+ if (arc_size > arc_c)
+ arc_adjust();
+}
+
+/*
+ * Return 1 when memory should be reclaimed from the ARC, 0 otherwise.
+ *
+ * Kernel builds compare the system's free-memory counters against their
+ * thresholds (see the comments on each check).  Non-kernel builds return
+ * 1 whenever spa_get_random(100) yields 0 -- presumably about one call in
+ * a hundred; confirm against spa_get_random().
+ */
+static int
+arc_reclaim_needed(void)
+{
+ /* only assigned and read in the _KERNEL branch below */
+ uint64_t extra;
+
+#ifdef _KERNEL
+
+ if (needfree)
+ return (1);
+
+ /*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+ return (1);
+
+#if defined(__i386)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ return (1);
+#endif
+
+#else
+ if (spa_get_random(100) == 0)
+ return (1);
+#endif
+ return (0);
+}
+
+/*
+ * Reap free memory from the kmem caches the ARC uses: every distinct zio
+ * buffer and zio data buffer cache, plus buf_cache and hdr_cache.  With
+ * strat == ARC_RECLAIM_AGGR the cache is also shrunk via arc_shrink()
+ * first.
+ */
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+
+#ifdef _KERNEL
+ if (arc_meta_used >= arc_meta_limit) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
+#if defined(__i386)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ /*
+ * An aggressive reclamation will shrink the cache size as well as
+ * reap free buffers from the arc kmem caches.
+ */
+ if (strat == ARC_RECLAIM_AGGR)
+ arc_shrink();
+
+ /*
+ * Reap a cache only when it differs from the one in the previous
+ * slot, so a cache occupying consecutive slots is reaped once.
+ */
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_cache);
+}
+
+/*
+ * Body of the ARC reclaim thread.  Until arc_thread_exit is set it
+ * repeatedly: reaps memory when arc_reclaim_needed() says so, alternating
+ * between conservative and aggressive reclaim while growth is disabled;
+ * re-enables growth once the retry delay has passed; calls arc_adjust()
+ * when the cache plus ghost lists exceed twice arc_c; runs pending user
+ * evictions; and then waits on arc_reclaim_thr_cv for up to one second.
+ * On exit it clears arc_thread_exit and broadcasts on the cv.
+ */
+static void
+arc_reclaim_thread(void)
+{
+ clock_t growtime = 0;
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ while (arc_thread_exit == 0) {
+ if (arc_reclaim_needed()) {
+
+ if (arc_no_grow) {
+ /* already in no-grow mode: alternate strategies */
+ if (last_reclaim == ARC_RECLAIM_CONS) {
+ last_reclaim = ARC_RECLAIM_AGGR;
+ } else {
+ last_reclaim = ARC_RECLAIM_CONS;
+ }
+ } else {
+ /* first reclaim after growing: stop growth, be aggressive */
+ arc_no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ membar_producer();
+ }
+
+ /* reset the growth delay for every reclaim */
+ growtime = lbolt + (arc_grow_retry * hz);
+
+ arc_kmem_reap_now(last_reclaim);
+
+ } else if (arc_no_grow && lbolt >= growtime) {
+ arc_no_grow = FALSE;
+ }
+
+ if (2 * arc_c < arc_size +
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
+ arc_adjust();
+
+ if (arc_eviction_list != NULL)
+ arc_do_user_evicts();
+
+ /* block until needed, or one second, whichever is shorter */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thr_cv,
+ &arc_reclaim_thr_lock, (lbolt + hz));
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ }
+
+ /* acknowledge the exit request before the thread goes away */
+ arc_thread_exit = 0;
+ cv_broadcast(&arc_reclaim_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from.  This function is only called
+ * when we are adding new content to the cache.
+ *
+ *	bytes	size of the content being added; must be > 0 (asserted)
+ *	state	ARC state of the header the content is being added for;
+ *		arc_l2c_only is ignored entirely
+ *
+ * Side effects: may move arc_p (on ghost list hits), may signal the
+ * reclaim thread, and may grow arc_c (and arc_p for anonymous data).
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+	uint64_t mult, delta;
+
+	if (state == arc_l2c_only)
+		return;
+
+	ASSERT(bytes > 0);
+	/*
+	 * Adapt the target size of the MRU list:
+	 *	- if we just hit in the MRU ghost list, then increase
+	 *	  the target size of the MRU list.
+	 *	- if we just hit in the MFU ghost list, then increase
+	 *	  the target size of the MFU list by decreasing the
+	 *	  target size of the MRU list.
+	 *
+	 * The adjustment is scaled by the ratio of the two ghost list
+	 * sizes.  Each size is read exactly once, the divisor is checked
+	 * for zero, and the product is formed in 64 bits: the ratio times
+	 * a large block does not fit in an int.
+	 */
+	if (state == arc_mru_ghost) {
+		uint64_t mru_gsz = arc_mru_ghost->arcs_size;
+		uint64_t mfu_gsz = arc_mfu_ghost->arcs_size;
+		uint64_t c = arc_c;
+		uint64_t p = arc_p;
+
+		mult = (mru_gsz >= mfu_gsz || mru_gsz == 0) ?
+		    1 : (mfu_gsz / mru_gsz);
+		delta = (uint64_t)bytes * mult;
+
+		/* arc_p = MIN(arc_c, arc_p + delta), without wrapping */
+		if (delta >= c || p >= c - delta)
+			arc_p = c;
+		else
+			arc_p = p + delta;
+	} else if (state == arc_mfu_ghost) {
+		uint64_t mru_gsz = arc_mru_ghost->arcs_size;
+		uint64_t mfu_gsz = arc_mfu_ghost->arcs_size;
+		uint64_t p = arc_p;
+
+		mult = (mfu_gsz >= mru_gsz || mfu_gsz == 0) ?
+		    1 : (mru_gsz / mfu_gsz);
+		delta = (uint64_t)bytes * mult;
+
+		/* arc_p = MAX(0, arc_p - delta), without going negative */
+		arc_p = (delta >= p) ? 0 : p - delta;
+	}
+	ASSERT((int64_t)arc_p >= 0);
+
+	/* under memory pressure: wake the reclaim thread and do not grow */
+	if (arc_reclaim_needed()) {
+		cv_signal(&arc_reclaim_thr_cv);
+		return;
+	}
+
+	if (arc_no_grow)
+		return;
+
+	if (arc_c >= arc_c_max)
+		return;
+
+	/*
+	 * If we're within (2 * maxblocksize) bytes of the target
+	 * cache size, increment the target cache size
+	 */
+	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		atomic_add_64(&arc_c, (int64_t)bytes);
+		if (arc_c > arc_c_max)
+			arc_c = arc_c_max;
+		else if (state == arc_anon)
+			atomic_add_64(&arc_p, (int64_t)bytes);
+		if (arc_p > arc_c)
+			arc_p = arc_c;
+	}
+	ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Report whether space has to be evicted before a buffer of the given
+ * content type may be inserted.  Returns non-zero if eviction is required
+ * and zero if the cache may simply grow.
+ */
+static int
+arc_evict_needed(arc_buf_contents_t type)
+{
+	/* metadata is held to its own limit */
+	if (type == ARC_BUFC_METADATA) {
+		if (arc_meta_used >= arc_meta_limit)
+			return (1);
+	}
+
+#ifdef _KERNEL
+	/*
+	 * When zio data buffers come from their own arena (zio_arena is
+	 * set), keep the free part of that arena above roughly 1/32nd of
+	 * what has been allocated from it.
+	 */
+	if (type == ARC_BUFC_DATA && zio_arena != NULL) {
+		if (vmem_size(zio_arena, VMEM_FREE) <
+		    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
+			return (1);
+	}
+#endif
+
+	/* general memory pressure */
+	if (arc_reclaim_needed())
+		return (1);
+
+	/* finally: has the cache outgrown its target size? */
+	return (arc_size > arc_c ? 1 : 0);
+}
+
+/*
+ * Allocate a brand new data block of `size' bytes for `buf' and charge it
+ * to the ARC: metadata comes from zio_buf_alloc() and is accounted through
+ * arc_space_consume(); data comes from zio_data_buf_alloc() and is added
+ * directly to arc_size.
+ */
+static void
+arc_get_data_buf_alloc(arc_buf_t *buf, uint64_t size, arc_buf_contents_t type)
+{
+	if (type != ARC_BUFC_METADATA) {
+		ASSERT(type == ARC_BUFC_DATA);
+		buf->b_data = zio_data_buf_alloc(size);
+		atomic_add_64(&arc_size, size);
+		return;
+	}
+	buf->b_data = zio_buf_alloc(size);
+	arc_space_consume(size);
+}
+
+/*
+ * Give `buf' a data block.  If no eviction is needed the block is simply
+ * allocated.  Otherwise a victim state is chosen and a block is recycled
+ * from it, falling back to a fresh allocation when nothing could be
+ * recycled:
+ *
+ * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru):
+ *    the MRU side is still under its target, so take from the MFU
+ *    (provided the MFU has evictable buffers of this type).
+ *
+ * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru):
+ *    the MRU side has used up its share, so it evicts from itself.
+ *
+ * 3. Insert for MFU, (c - p) > sizeof(arc_mfu):
+ *    c - p is the MFU's share; it is not used up, so take from the MRU
+ *    (provided the MRU has evictable buffers of this type).
+ *
+ * 4. Insert for MFU, (c - p) <= sizeof(arc_mfu):
+ *    the MFU has outgrown its share and evicts from itself.
+ *
+ * Afterwards the size of the header's (non-ghost) state is updated.
+ */
+static void
+arc_get_data_buf(arc_buf_t *buf)
+{
+	arc_buf_hdr_t		*hdr = buf->b_hdr;
+	arc_state_t		*victim = hdr->b_state;
+	uint64_t		size = hdr->b_size;
+	arc_buf_contents_t	type = hdr->b_type;
+
+	arc_adapt(size, victim);
+
+	if (!arc_evict_needed(type)) {
+		/* cache is below its limits: just allocate */
+		arc_get_data_buf_alloc(buf, size, type);
+	} else {
+		/*
+		 * A ghost header is about to become resident; steal from
+		 * the list it will land on.  A prefetch out of the MFU
+		 * ghost list ends up on the MRU list.
+		 */
+		if (victim == arc_mru_ghost) {
+			victim = arc_mru;
+		} else if (victim == arc_mfu_ghost) {
+			if (hdr->b_flags & ARC_PREFETCH)
+				victim = arc_mru;
+			else
+				victim = arc_mfu;
+		}
+
+		if (victim == arc_mru || victim == arc_anon) {
+			/* cases 1 and 2 */
+			uint64_t mru_used =
+			    arc_anon->arcs_size + arc_mru->arcs_size;
+
+			if (arc_mfu->arcs_lsize[type] > 0 && arc_p > mru_used)
+				victim = arc_mfu;
+			else
+				victim = arc_mru;
+		} else {
+			/* cases 3 and 4 */
+			uint64_t mfu_space = arc_c - arc_p;
+
+			if (arc_mru->arcs_lsize[type] > 0 &&
+			    mfu_space > arc_mfu->arcs_size)
+				victim = arc_mru;
+			else
+				victim = arc_mfu;
+		}
+
+		buf->b_data = arc_evict(victim, NULL, size, TRUE, type);
+		if (buf->b_data == NULL) {
+			/* nothing to recycle: allocate after all */
+			arc_get_data_buf_alloc(buf, size, type);
+			ARCSTAT_BUMP(arcstat_recycle_miss);
+		}
+		ASSERT(buf->b_data != NULL);
+	}
+
+	/*
+	 * Ghost states track a "ghost size" instead, so only a resident
+	 * state needs its size adjusted.
+	 */
+	if (GHOST_STATE(hdr->b_state))
+		return;
+
+	atomic_add_64(&hdr->b_state->arcs_size, size);
+	if (list_link_active(&hdr->b_arc_node)) {
+		ASSERT(refcount_is_zero(&hdr->b_refcnt));
+		atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+	}
+
+	/*
+	 * While the cache is still growing, anonymous data that pushes
+	 * anon + MRU beyond arc_p drags arc_p along with it.
+	 */
+	if (arc_size < arc_c && hdr->b_state == arc_anon &&
+	    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+		arc_p = MIN(arc_c, arc_p + size);
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.  It stamps the
+ * header with the current lbolt and moves it between ARC states depending
+ * on the state it is found in (see the per-state comments below).
+ *
+ *	buf		header being accessed (an arc_buf_hdr_t, despite
+ *			the parameter name)
+ *	hash_lock	hash lock covering the header; must be held on
+ *			entry (asserted)
+ *
+ * NOTE(review): this comment used to state that the hash lock is dropped
+ * in this function.  Nothing in the body below releases it, and arc_read(),
+ * arc_read_done() and arc_write_done() all call mutex_exit() on it
+ * themselves after arc_access() returns -- confirm against
+ * arc_change_state() before relying on either reading.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if (buf->b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT(buf->b_arc_access == 0);
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mru, buf, hash_lock);
+
+ } else if (buf->b_state == arc_mru) {
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ *
+ * Either way the access time is refreshed and we return
+ * without changing state.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ if (refcount_count(&buf->b_refcnt) == 0) {
+ ASSERT(list_link_active(&buf->b_arc_node));
+ } else {
+ buf->b_flags &= ~ARC_PREFETCH;
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ }
+ buf->b_arc_access = lbolt;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
+ }
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (buf->b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state, unless this access is a prefetch, in
+ * which case it goes back to the MRU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ new_state = arc_mru;
+ if (refcount_count(&buf->b_refcnt) > 0)
+ buf->b_flags &= ~ARC_PREFETCH;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ }
+
+ buf->b_arc_access = lbolt;
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (buf->b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ ASSERT(refcount_count(&buf->b_refcnt) == 0);
+ ASSERT(list_link_active(&buf->b_arc_node));
+ }
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ buf->b_arc_access = lbolt;
+ } else if (buf->b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ new_state = arc_mru;
+ }
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); /* NOTE(review):
+ * fires as new_state__mfu even when new_state is arc_mru */
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (buf->b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ * Move it to the MFU state.
+ */
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+}
+
+/*
+ * A generic arc_done_func_t which you can use: copies the contents of
+ * `buf' into the caller's memory at `arg' and then drops the reference on
+ * `buf' that was taken with `arg' as its tag.
+ *
+ *	zio	the read that filled buf, or NULL when the block was found
+ *		in the cache and no I/O was issued
+ *	buf	buffer handed to the callback
+ *	arg	destination of at least buf->b_hdr->b_size bytes
+ *
+ * The copy is skipped when the read failed (zio->io_error != 0), since
+ * the buffer then holds no valid data; the reference is dropped either way.
+ */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	if (zio == NULL || zio->io_error == 0)
+		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+}
+
+/*
+ * A generic arc_done_func_t that hands the buffer back to the caller.
+ * `arg' points at an arc_buf_t pointer: on success it receives `buf'; if
+ * the read failed the reference on `buf' (tagged with `arg') is dropped
+ * and it receives NULL instead.  A NULL zio (no I/O was needed) counts as
+ * success.
+ */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	arc_buf_t **result = arg;
+	int failed = (zio != NULL && zio->io_error != 0);
+
+	if (!failed) {
+		*result = buf;
+		return;
+	}
+
+	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+	*result = NULL;
+}
+/*
+ * Completion callback for the read zio issued by arc_read() (zio->io_private
+ * is the arc_buf_t the data was read into).  In order, it:
+ *  - looks the header up again in the hash table (taking its hash lock);
+ *  - clears the L2ARC read flags and byteswaps the data if required;
+ *  - computes the buffer checksum;
+ *  - assigns a buffer to every callback that has a "done" func (the first
+ *    gets the original buf, the rest get clones);
+ *  - on error, marks the header ARC_IO_ERROR, makes it anonymous and
+ *    removes it from the hash table;
+ *  - wakes up waiters on the header's cv, then drops the hash lock;
+ *  - runs and frees every callback record;
+ *  - destroys the header if it ended up unreferenced.
+ */
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr, *found;
+ arc_buf_t *buf;
+ arc_buf_t *abuf; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock;
+ arc_callback_t *callback_list, *acb;
+ int freeable = FALSE; /* set when hdr may be destroyed at the end */
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+ &hash_lock);
+
+ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+
+ hdr->b_flags &= ~(ARC_L2_READING|ARC_L2_EVICTED);
+ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+ hdr->b_flags |= ARC_DONT_L2CACHE;
+
+ /* byteswap if necessary */
+ callback_list = hdr->b_acb;
+ ASSERT(callback_list != NULL);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
+ callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+
+ arc_cksum_compute(buf, B_FALSE);
+
+ /*
+ * create copies of the data buffer for the callers: the first
+ * callback with a "done" func receives buf itself, every further
+ * one receives a clone
+ */
+ abuf = buf;
+ for (acb = callback_list; acb; acb = acb->acb_next) {
+ if (acb->acb_done) {
+ if (abuf == NULL)
+ abuf = arc_buf_clone(buf);
+ acb->acb_buf = abuf;
+ abuf = NULL;
+ }
+ }
+ hdr->b_acb = NULL;
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ ASSERT(!HDR_BUF_AVAILABLE(hdr));
+ if (abuf == buf) /* no callback took the original buffer */
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+ if (zio->io_error != 0) {
+ hdr->b_flags |= ARC_IO_ERROR;
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ /* convert checksum errors into IO errors */
+ if (zio->io_error == ECKSUM)
+ zio->io_error = EIO;
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_cv);
+
+ if (hash_lock) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ if (zio->io_error == 0 && hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ /* pass the result on to the placeholder zio, if one was created */
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ *
+ * Parameters:
+ *  pio        parent zio handed to the zio that is created (may be NULL)
+ *  spa, bp    pool and block pointer identifying the block
+ *  swap       byteswap function recorded with the callback
+ *  done       completion callback, or NULL
+ *  private    argument for `done'; also used as the reference tag
+ *  priority   passed through to the read zio
+ *  flags      passed through to the zio that is created
+ *  arc_flags  in: ARC_WAIT or ARC_NOWAIT, optionally ARC_PREFETCH;
+ *             out: ARC_CACHED is or'ed in on a cache hit
+ *  zb         bookmark passed to zio_read() and copied for L2ARC reads
+ *
+ * Returns 0, or the result of zio_wait() when an I/O was issued with
+ * ARC_WAIT set.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t *arc_flags, zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ zio_t *rzio;
+
+top:
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (hdr && hdr->b_datacnt > 0) {
+
+ *arc_flags |= ARC_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+
+ if (*arc_flags & ARC_WAIT) {
+ /* sleep until arc_read_done() broadcasts, then retry */
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ /* queue a callback record on the read in progress */
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, flags);
+
+ ASSERT(acb->acb_done != NULL);
+ acb->acb_next = hdr->b_acb;
+ hdr->b_acb = acb;
+ add_reference(hdr, hash_lock, private);
+ mutex_exit(hash_lock);
+ return (0);
+ }
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ if (done) {
+ add_reference(hdr, hash_lock, private);
+ /*
+ * If this block is already in use, create a new
+ * copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
+ buf = hdr->b_buf;
+ ASSERT(buf);
+ ASSERT(buf->b_data);
+ if (HDR_BUF_AVAILABLE(hdr)) {
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ } else {
+ buf = arc_buf_clone(buf);
+ }
+ } else if (*arc_flags & ARC_PREFETCH &&
+ refcount_count(&hdr->b_refcnt) == 0) {
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, buf, private); /* zio is NULL: no I/O was needed */
+ } else {
+ uint64_t size = BP_GET_LSIZE(bp);
+ arc_callback_t *acb;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ buf = arc_buf_alloc(spa, size, private, type);
+ hdr = buf->b_hdr;
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = bp->blk_birth;
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ (void) arc_buf_remove_ref(buf, private);
+ goto top; /* restart the IO request */
+ }
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ if (BP_GET_LEVEL(bp) > 0)
+ hdr->b_flags |= ARC_INDIRECT;
+ } else {
+ /* this block is in the ghost cache */
+ ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT(hdr->b_buf == NULL);
+
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ else
+ add_reference(hdr, hash_lock, private);
+ /* attach a fresh buf with a data block to the ghost header */
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ ASSERT(hdr->b_datacnt == 0);
+ hdr->b_datacnt = 1;
+
+ }
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+
+ ASSERT(hdr->b_acb == NULL);
+ hdr->b_acb = acb;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+ /*
+ * If the buffer has been evicted, migrate it to a present state
+ * before issuing the I/O. Once we drop the hash-table lock,
+ * the header will be marked as I/O in progress and have an
+ * attached buffer. At this point, anybody who finds this
+ * buffer ought to notice that it's legit but has a pending I/O.
+ */
+
+ if (GHOST_STATE(hdr->b_state))
+ arc_access(hdr, hash_lock);
+
+ ASSERT3U(hdr->b_size, ==, size);
+ DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
+ zbookmark_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, misses);
+
+ if (l2arc_ndev != 0) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. This buffer has L2ARC metadata.
+ * 2. This buffer isn't currently writing to the L2ARC.
+ */
+ if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr)) {
+ vdev_t *vd = hdr->b_l2hdr->b_dev->l2ad_vdev;
+ daddr_t addr = hdr->b_l2hdr->b_daddr;
+ l2arc_read_callback_t *cb;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+
+ hdr->b_flags |= ARC_L2_READING;
+ mutex_exit(hash_lock);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_buf = buf;
+ cb->l2rcb_spa = spa;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb; /* NOTE(review): zb is dereferenced
+ * unconditionally here -- callers must pass a non-NULL bookmark */
+ cb->l2rcb_flags = flags;
+
+ /*
+ * l2arc read.
+ */
+ rzio = zio_read_phys(pio, vd, addr, size,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ flags | ZIO_FLAG_DONT_CACHE, B_FALSE);
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+
+ if (*arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ return (0);
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ }
+ }
+ mutex_exit(hash_lock);
+
+ /* regular read from the pool; arc_read_done() completes it */
+ rzio = zio_read(pio, spa, bp, buf->b_data, size,
+ arc_read_done, buf, priority, flags, zb);
+
+ if (*arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
+
+/*
+ * Variant of arc_read() for pool traversal: never issues I/O.  If the
+ * block is cached and no I/O is in progress on it, its contents are copied
+ * into `data' and 0 is returned; otherwise ENOENT is returned and the
+ * caller is expected to do the read itself.  This keeps traversal from
+ * filling the ARC while still saving the I/O for blocks that are cached
+ * anyway.
+ */
+int
+arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+{
+	kmutex_t *hash_mtx;
+	arc_buf_hdr_t *hdr;
+	int rc = ENOENT;
+
+	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+
+	if (hdr != NULL && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+		arc_buf_t *src;
+
+		/* find the first buf on the header that carries data */
+		for (src = hdr->b_buf; ; src = src->b_next) {
+			ASSERT(src);
+			if (src->b_data != NULL)
+				break;
+		}
+		bcopy(src->b_data, data, hdr->b_size);
+		rc = 0;
+	}
+
+	if (hash_mtx != NULL)
+		mutex_exit(hash_mtx);
+
+	return (rc);
+}
+
+/*
+ * Install (or, with func == NULL, clear) the eviction callback of `buf',
+ * together with the private pointer stored next to it.  The buf must
+ * belong to a header that is not anonymous, and a callback may only be
+ * installed while the header is referenced (both are asserted).
+ */
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+	ASSERT(buf->b_hdr != NULL);
+	ASSERT(buf->b_hdr->b_state != arc_anon);
+	ASSERT(func == NULL || !refcount_is_zero(&buf->b_hdr->b_refcnt));
+
+	buf->b_efunc = func;
+	buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up. If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ *
+ * Returns 0 if the buf has already been taken over by
+ * arc_do_user_evicts() (buf->b_hdr is NULL); otherwise the buf's
+ * eviction callback is invoked from here and 1 is returned.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ arc_buf_t **bufp;
+
+ /* b_hdr is sampled under arc_eviction_mtx */
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts().
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_exit(&arc_eviction_mtx);
+
+ mutex_enter(hash_lock);
+
+ if (buf->b_data == NULL) {
+ /*
+ * We are on the eviction list.
+ */
+ mutex_exit(hash_lock);
+ mutex_enter(&arc_eviction_mtx);
+ if (buf->b_hdr == NULL) {
+ /*
+ * We are already in arc_do_user_evicts().
+ */
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ } else {
+ arc_buf_t copy = *buf; /* structure assignment */
+ /*
+ * Process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
+ * The callback runs on the stack copy, after
+ * arc_eviction_mtx has been dropped.
+ */
+ buf->b_efunc = NULL;
+ mutex_exit(&arc_eviction_mtx);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
+ }
+ }
+
+ ASSERT(buf->b_hdr == hdr);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ arc_buf_destroy(buf, FALSE, FALSE);
+
+ /* that was the last buf: move the header to the matching ghost state */
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags |= ARC_IN_HASH_TABLE; /* NOTE(review): redundant
+ * whenever the ASSERT above holds */
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ mutex_exit(hash_lock);
+
+ /* run the eviction callback without the hash lock, then free the buf */
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must
+ * make a new hdr for the buffer.
+ *
+ *	buf	buffer to release; its header must hold a reference
+ *	tag	reference tag, moved over to the new header when one
+ *		has to be created
+ *
+ * On return the buf belongs to an anonymous header, has no eviction
+ * callback, and any L2ARC header of the original hdr has been freed.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ l2arc_buf_hdr_t *l2hdr = NULL;
+ uint64_t buf_size; /* only set (and used) when l2hdr is non-NULL */
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc_anon) {
+ /* this buffer is already released */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(BUF_EMPTY(hdr));
+ ASSERT(buf->b_efunc == NULL);
+ arc_buf_thaw(buf);
+ return;
+ }
+
+ mutex_enter(hash_lock);
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_buf != buf || buf->b_next != NULL) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ spa_t *spa = hdr->b_spa;
+ arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
+
+ ASSERT(hdr->b_datacnt > 1);
+ /*
+ * Pull the data off of this buf and attach it to
+ * a new anonymous buf.
+ */
+ (void) remove_reference(hdr, hash_lock, tag);
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = (*bufp)->b_next;
+ buf->b_next = NULL;
+
+ /* take this buf's share out of the old state's accounting */
+ ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_refcnt)) {
+ uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3U(*size, >=, hdr->b_size);
+ atomic_add_64(size, -hdr->b_size);
+ }
+ hdr->b_datacnt -= 1;
+ if (hdr->b_l2hdr != NULL) {
+ /* l2arc_buflist_mtx stays held until the end of the function */
+ mutex_enter(&l2arc_buflist_mtx);
+ l2hdr = hdr->b_l2hdr;
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+ arc_cksum_verify(buf);
+
+ mutex_exit(hash_lock);
+
+ /* build the new anonymous header around buf */
+ nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ nhdr->b_size = blksz;
+ nhdr->b_spa = spa;
+ nhdr->b_type = type;
+ nhdr->b_buf = buf;
+ nhdr->b_state = arc_anon;
+ nhdr->b_arc_access = 0;
+ nhdr->b_flags = flags & ARC_L2_WRITING;
+ nhdr->b_l2hdr = NULL;
+ nhdr->b_datacnt = 1;
+ nhdr->b_freeze_cksum = NULL;
+ (void) refcount_add(&nhdr->b_refcnt, tag);
+ buf->b_hdr = nhdr;
+ atomic_add_64(&arc_anon->arcs_size, blksz);
+ } else {
+ /* sole buf: turn the existing header anonymous in place */
+ ASSERT(refcount_count(&hdr->b_refcnt) == 1);
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_arc_access = 0;
+ if (hdr->b_l2hdr != NULL) {
+ mutex_enter(&l2arc_buflist_mtx);
+ l2hdr = hdr->b_l2hdr;
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+ mutex_exit(hash_lock);
+
+ /* wipe the block identity */
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ arc_buf_thaw(buf);
+ }
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ /* drop the original header from its L2ARC device list */
+ if (l2hdr) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -buf_size);
+ }
+ /* released only if one of the branches above entered it */
+ if (MUTEX_HELD(&l2arc_buflist_mtx))
+ mutex_exit(&l2arc_buflist_mtx);
+}
+
+/*
+ * Returns non-zero if `buf' carries data and belongs to an anonymous
+ * header, zero otherwise.
+ */
+int
+arc_released(arc_buf_t *buf)
+{
+	if (buf->b_data == NULL)
+		return (0);
+
+	return (buf->b_hdr->b_state == arc_anon);
+}
+
+/*
+ * Returns non-zero if an eviction callback is set on `buf'.
+ */
+int
+arc_has_callback(arc_buf_t *buf)
+{
+	return (buf->b_efunc == NULL ? 0 : 1);
+}
+
+#ifdef ZFS_DEBUG
+/*
+ * Debug-only helper: returns the reference count of the header that
+ * `buf' belongs to.
+ */
+int
+arc_referenced(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	return (refcount_count(&hdr->b_refcnt));
+}
+#endif
+/*
+ * "Ready" callback that arc_write() hands to zio_write(); zio->io_private
+ * is the arc_write_callback_t set up there.  Invokes the caller's ready
+ * callback (only if the zio carries no error), discards a stale frozen
+ * checksum on a re-write attempt, computes the buffer checksum and marks
+ * the header ARC_IO_IN_PROGRESS.
+ */
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (zio->io_error == 0 && callback->awcb_ready) {
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+ }
+ /*
+ * If the IO is already in progress, then this is a re-write
+ * attempt, so we need to thaw and re-compute the cksum. It is
+ * the responsibility of the callback to handle the freeing
+ * and accounting for any re-write attempt. If we don't have a
+ * callback registered then simply free the block here.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ if (!BP_IS_HOLE(&zio->io_bp_orig) &&
+ callback->awcb_ready == NULL) {
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ &zio->io_bp_orig, NULL, NULL));
+ }
+ /* "thaw": drop the frozen checksum so it gets recomputed below */
+ mutex_enter(&hdr->b_freeze_lock);
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_freeze_lock);
+ }
+ arc_cksum_compute(buf, B_FALSE);
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+}
+/*
+ * "Done" callback that arc_write() hands to zio_write(); zio->io_private
+ * is the arc_write_callback_t set up there.  Records the identity of the
+ * written block (dva, birth, first checksum word) in the header.  If a
+ * block was really written, the header is inserted into the hash table and
+ * run through arc_access(); otherwise it stays anonymous and is destroyed
+ * when nobody references it and no user callback exists.  Finally the
+ * caller's done callback is invoked and the callback record is freed.
+ */
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ hdr->b_acb = NULL;
+
+ /* this buffer is on no lists and is not in the hash table */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = zio->io_bp->blk_birth;
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ /*
+ * If the block to be written was all-zero, we may have
+ * compressed it away. In this case no write was performed
+ * so there will be no dva/birth-date/checksum. The buffer
+ * must therefore remain anonymous (and uncached).
+ */
+ if (!BUF_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ * Evict the stale header and insert ours instead.
+ */
+ ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+ BP_IDENTITY(zio->io_bp)));
+ ASSERT3U(zio->io_bp_orig.blk_birth, ==,
+ zio->io_bp->blk_birth);
+
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ }
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else if (callback->awcb_done == NULL) {
+ int destroy_hdr;
+ /*
+ * This is an anonymous buffer with no user callback,
+ * destroy it if there are no active references.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ }
+
+ if (callback->awcb_done) {
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+ }
+
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+/*
+ * Create (but do not issue) a write zio for an anonymous ARC buffer.
+ *
+ * The header of `buf' must be anonymous, carry no block identity, have no
+ * I/O error or I/O in progress and no pending callbacks (all asserted).
+ * `ready' and `done' are recorded, together with `private' and `buf', in a
+ * callback record that arc_write_ready() and arc_write_done() pick up from
+ * the zio; arc_write_done() frees that record.  All remaining arguments
+ * are passed straight through to zio_write().
+ *
+ * Returns the zio produced by zio_write().
+ */
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+    int flags, zbookmark_t *zb)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	arc_write_callback_t *awcb;
+
+	/* this is a private buffer - no locking required */
+	ASSERT3P(hdr->b_state, ==, arc_anon);
+	ASSERT(BUF_EMPTY(hdr));
+	ASSERT(!HDR_IO_ERROR(hdr));
+	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
+	ASSERT(hdr->b_acb == 0);
+
+	awcb = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+	awcb->awcb_buf = buf;
+	awcb->awcb_private = private;
+	awcb->awcb_ready = ready;
+	awcb->awcb_done = done;
+
+	return (zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
+	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, awcb,
+	    priority, flags, zb));
+}
+/*
+ * Free the block described by bp, first dropping it from the cache if it
+ * is there.  A cached header is made anonymous; depending on its condition
+ * it is then flagged as freed during a read in progress, destroyed (no
+ * references), or merely stripped of its block identity (still referenced).
+ * Afterwards a zio_free() is created for the block.
+ *
+ * Returns the result of zio_wait() when arc_flags contains ARC_WAIT;
+ * otherwise the free zio is issued with zio_nowait() and 0 is returned.
+ */
+int
+arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+ arc_buf_hdr_t *ab;
+ kmutex_t *hash_lock;
+ zio_t *zio;
+
+ /*
+ * If this buffer is in the cache, release it, so it
+ * can be re-used.
+ */
+ ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (ab != NULL) {
+ /*
+ * The checksum of blocks to free is not always
+ * preserved (eg. on the deadlist). However, if it is
+ * nonzero, it should match what we have in the cache.
+ */
+ ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
+ ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ if (ab->b_state != arc_anon)
+ arc_change_state(arc_anon, ab, hash_lock);
+ if (HDR_IO_IN_PROGRESS(ab)) {
+ /*
+ * This should only happen when we prefetch.
+ * Flag the header so arc_read_done() knows the
+ * block was freed underneath the read.
+ */
+ ASSERT(ab->b_flags & ARC_PREFETCH);
+ ASSERT3U(ab->b_datacnt, ==, 1);
+ ab->b_flags |= ARC_FREED_IN_READ;
+ if (HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ } else if (refcount_is_zero(&ab->b_refcnt)) {
+ /* unreferenced: the header can go away entirely */
+ ab->b_flags |= ARC_FREE_IN_PROGRESS;
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ ARCSTAT_BUMP(arcstat_deleted);
+ } else {
+ /*
+ * We still have an active reference on this
+ * buffer. This can happen, e.g., from
+ * dbuf_unoverride().
+ */
+ ASSERT(!HDR_IN_HASH_TABLE(ab));
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ }
+ }
+
+ zio = zio_free(pio, spa, txg, bp, done, private);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(zio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(zio);
+
+ return (0);
+}
+
+/*
+ * Decide whether a write reservation of 'reserve' bytes for 'txg' has to
+ * be held back because memory is short.
+ *
+ * Returns 0 when the reservation may proceed, ERESTART when the caller is
+ * throttled (either the pageout process has accumulated too much page
+ * load, or anonymous ARC data exceeds a quarter of the memory considered
+ * available), and EAGAIN when page load is pending and
+ * arc_reclaim_needed() reports true.
+ *
+ * All checks are compiled only under _KERNEL; in other builds the
+ * function ignores its arguments and returns 0.
+ *
+ * NOTE(review): page_load and last_txg are function-local statics that
+ * are updated with no locking visible in this function -- confirm that
+ * racing callers are acceptable.
+ */
+static int
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
+{
+#ifdef _KERNEL
+ uint64_t inflight_data = arc_anon->arcs_size;
+ uint64_t available_memory = ptob(freemem);
+ static uint64_t page_load = 0;
+ static uint64_t last_txg = 0;
+
+#if defined(__i386)
+ available_memory =
+ MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+#endif
+ /* Plenty of free memory: no throttling at all. */
+ if (available_memory >= zfs_write_limit_max)
+ return (0);
+
+ /* page_load is accounted per txg; start over when a newer txg shows up. */
+ if (txg > last_txg) {
+ last_txg = txg;
+ page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight,
+ * the arc is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (curproc == proc_pageout) {
+ if (page_load > MAX(ptob(minfree), available_memory) / 4)
+ return (ERESTART);
+ /* Note: reserve is inflated, so we deflate */
+ page_load += reserve / 8;
+ return (0);
+ } else if (page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (EAGAIN);
+ }
+ page_load = 0;
+
+ /*
+ * Count evictable MRU/MFU data and metadata as available too, but
+ * never more than the amount by which the ARC exceeds arc_c_min.
+ */
+ if (arc_size > arc_c_min) {
+ uint64_t evictable_memory =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+ }
+
+ if (inflight_data > available_memory / 4) {
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (ERESTART);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * Subtract 'reserve' bytes from arc_tempreserve.  The counter is asserted
+ * to remain non-negative when viewed as a signed value.
+ */
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+	int64_t delta = -reserve;
+
+	atomic_add_64(&arc_tempreserve, delta);
+	ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+/*
+ * Try to reserve 'reserve' bytes of ARC space for data being dirtied in
+ * 'txg'.
+ *
+ * Returns 0 on success, in which case arc_tempreserve has been increased
+ * by 'reserve'.  Otherwise returns:
+ *   ENOMEM    - 'reserve' is larger than arc_c, even after arc_c has been
+ *               given the chance to grow;
+ *   ERESTART  - the dirty-data throttle tripped (or, under ZFS_DEBUG, a
+ *               deliberate random failure was injected);
+ *   any non-zero value returned by arc_memory_throttle().
+ */
+int
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
+{
+	int error;
+
+#ifdef ZFS_DEBUG
+	/*
+	 * Once in a while, fail for no reason.  Everything should cope.
+	 */
+	if (spa_get_random(10000) == 0) {
+		dprintf("forcing random failure\n");
+		return (ERESTART);
+	}
+#endif
+	/* A large request may raise the cache target, unless growth is off. */
+	if (reserve > arc_c/4 && !arc_no_grow)
+		arc_c = MIN(arc_c_max, reserve * 4);
+	if (reserve > arc_c)
+		return (ENOMEM);
+
+	/*
+	 * Writes will, almost always, require additional memory allocations
+	 * in order to compress/encrypt/etc the data.  We therefore need to
+	 * make sure that there is sufficient available memory for this.
+	 */
+	if ((error = arc_memory_throttle(reserve, txg)) != 0)
+		return (error);
+
+	/*
+	 * Throttle writes when the amount of dirty data in the cache
+	 * gets too large.  We try to keep the cache less than half full
+	 * of dirty blocks so that our sync times don't grow too large.
+	 * Note: if two requests come in concurrently, we might let them
+	 * both succeed, when one of them should fail.  Not a huge deal.
+	 */
+	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+	    arc_anon->arcs_size > arc_c / 4) {
+		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+		    arc_tempreserve>>10,
+		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+		    reserve>>10, arc_c>>10);
+		return (ERESTART);
+	}
+	atomic_add_64(&arc_tempreserve, reserve);
+	return (0);
+}
+
+/*
+ * One-time ARC setup.  In order, this:
+ *   - initializes the reclaim thread's mutex and condition variable;
+ *   - sizes the cache targets (arc_c, arc_c_min, arc_c_max, arc_p and
+ *     arc_meta_limit) from physical memory, letting the zfs_arc_max,
+ *     zfs_arc_min and zfs_arc_meta_limit tunables override the computed
+ *     values when they pass the checks below;
+ *   - points the arc_* state pointers at their static state structures
+ *     and initializes each state's mutex and its data/metadata lists;
+ *   - calls buf_init(), resets the eviction list, and creates and
+ *     installs the "arcstats" kstat;
+ *   - starts arc_reclaim_thread();
+ *   - derives zfs_write_limit_max from physical memory when it is zero.
+ *
+ * NOTE(review): the value returned by thread_create() is discarded.
+ */
+void
+arc_init(void)
+{
+ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Convert seconds to clock ticks */
+ arc_min_prefetch_lifespan = 1 * hz;
+
+ /* Start out with 1/8 of all memory */
+ arc_c = physmem * PAGESIZE / 8;
+
+#ifdef _KERNEL
+ /*
+ * On architectures where the physical memory can be larger
+ * than the addressable space (intel in 32-bit mode), we may
+ * need to limit the cache to 1/8 of VM size.
+ */
+ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+
+ /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
+ arc_c_min = MAX(arc_c / 4, 64<<20);
+ /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
+ if (arc_c * 8 >= 1<<30)
+ arc_c_max = (arc_c * 8) - (1<<30);
+ else
+ arc_c_max = arc_c_min;
+ arc_c_max = MAX(arc_c * 6, arc_c_max);
+
+ /*
+ * Allow the tunables to override our calculations if they are
+ * reasonable (ie. over 64MB)
+ */
+ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
+ arc_c_max = zfs_arc_max;
+ if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+ arc_c_min = zfs_arc_min;
+
+ /* Start with the target at its maximum, split evenly by arc_p. */
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+
+ /* Allow the tunable to override if it is reasonable */
+ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ /*
+ * Unless zfs_arc_min was set, raise the minimum to at least half of
+ * the metadata limit.
+ */
+ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
+ arc_c_min = arc_meta_limit / 2;
+
+ /* if kmem_flags are set, lets try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+ arc_size = 0;
+
+ mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Every state except arc_anon gets a metadata list and a data list,
+ * each linking headers through b_arc_node.
+ */
+ list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+
+ buf_init();
+
+ arc_thread_exit = 0;
+ arc_eviction_list = NULL;
+ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
+ bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ /* The kstat is optional: without it the ARC still runs. */
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ kstat_install(arc_ksp);
+ }
+
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+ arc_dead = FALSE;
+
+ /*
+ * A zero zfs_write_limit_max means it was not tuned: derive it from
+ * physical memory.  An explicit value is kept and the shift is
+ * cleared instead.
+ */
+ if (zfs_write_limit_max == 0)
+ zfs_write_limit_max = physmem * PAGESIZE >>
+ zfs_write_limit_shift;
+ else
+ zfs_write_limit_shift = 0;
+}
+
+/*
+ * Tear down what arc_init() set up.
+ *
+ * The reclaim thread is asked to exit by setting arc_thread_exit and this
+ * function then waits on arc_reclaim_thr_cv until the flag reads zero
+ * again.  After that the cache is flushed with arc_flush(NULL), the kstat
+ * is deleted, and every mutex, condition variable and list created by
+ * arc_init() is destroyed before buf_fini() is called.
+ *
+ * The arc_l2c_only lists and mutex are created in arc_init() just like
+ * those of the other states, so they are destroyed here as well.
+ * NOTE(review): this presumes no L2ARC-only headers remain at this point,
+ * i.e. that all cache devices have already been removed -- confirm the
+ * shutdown ordering against the callers.
+ */
+void
+arc_fini(void)
+{
+	mutex_enter(&arc_reclaim_thr_lock);
+	arc_thread_exit = 1;
+	while (arc_thread_exit != 0)
+		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+	mutex_exit(&arc_reclaim_thr_lock);
+
+	arc_flush(NULL);
+
+	arc_dead = TRUE;
+
+	if (arc_ksp != NULL) {
+		kstat_delete(arc_ksp);
+		arc_ksp = NULL;
+	}
+
+	mutex_destroy(&arc_eviction_mtx);
+	mutex_destroy(&arc_reclaim_thr_lock);
+	cv_destroy(&arc_reclaim_thr_cv);
+
+	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+	list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
+	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+	list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+
+	mutex_destroy(&arc_anon->arcs_mtx);
+	mutex_destroy(&arc_mru->arcs_mtx);
+	mutex_destroy(&arc_mru_ghost->arcs_mtx);
+	mutex_destroy(&arc_mfu->arcs_mtx);
+	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+	mutex_destroy(&arc_l2c_only->arcs_mtx);
+
+	buf_fini();
+}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ *         +-----------------------+
+ *         |          ARC          |
+ *         +-----------------------+
+ *            |         ^       ^
+ *            |         |       |
+ *   l2arc_feed_thread()    arc_read()
+ *            |         |       |
+ *            |    l2arc read   |
+ *            V         |       |
+ *         +---------------+    |
+ *         |     L2ARC     |    |
+ *         +---------------+    |
+ *             |    ^           |
+ *    l2arc_write() |           |
+ *             |    |           |
+ *             V    |           |
+ *           +-------+      +-------+
+ *           | vdev  |      | vdev  |
+ *           | cache |      | cache |
+ *           +-------+      +-------+
+ *           +=========+     .-----.
+ *           :  L2ARC  :    |-_____-|
+ *           : devices :    | Disks |
+ *           +=========+    `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate for this there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ *                  head -->                       tail
+ *                   +---------------------+----------+
+ *           ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
+ *                   +---------------------+----------+   |   o L2ARC eligible
+ *           ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
+ *                   +---------------------+----------+   |
+ *                         15.9 Gbytes     ^ 32 Mbytes    |
+ *                                      headroom          |
+ *                                               l2arc_feed_thread()
+ *                                                        |
+ *                            l2arc write hand <--[oooo]--'
+ *                                    |          8 Mbyte
+ *                                    |         write max
+ *                                    V
+ *                   +==============================+
+ *         L2ARC dev |####|#|###|###|    |####| ... |
+ *                   +==============================+
+ *                              32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 6. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 7. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ */
+
+/*
+ * Move one header's size accounting from arcstat_hdr_size to
+ * arcstat_l2_hdr_size, adding L2HDR_SIZE on the L2 side.
+ */
+static void
+l2arc_hdr_stat_add(void)
+{
+	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+}
+
+/*
+ * Inverse of l2arc_hdr_stat_add(): give HDR_SIZE back to arcstat_hdr_size
+ * and take HDR_SIZE + L2HDR_SIZE out of arcstat_l2_hdr_size.
+ */
+static void
+l2arc_hdr_stat_remove(void)
+{
+	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+}
+
+/*
+ * Round-robin over the L2ARC device list; this is how the L2ARC spreads
+ * its writes across devices.  Returns the device after the one handed out
+ * last time, wrapping to the head of the list, or NULL when the list is
+ * empty.  The choice is remembered in l2arc_dev_last.
+ *
+ * This is called with l2arc_dev_mtx held, which also locks out spa removal.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+	l2arc_dev_t *dev = NULL;
+
+	/* Continue from the previous pick when there is one ... */
+	if (l2arc_dev_last != NULL)
+		dev = list_next(l2arc_dev_list, l2arc_dev_last);
+
+	/* ... and start (or wrap) at the head otherwise. */
+	if (dev == NULL)
+		dev = list_head(l2arc_dev_list);
+
+	l2arc_dev_last = dev;
+	return (dev);
+}
+
+/*
+ * A write to a cache device has completed.  Update all headers to allow
+ * reads from these buffers to begin.
+ *
+ * zio->io_private carries the l2arc_write_callback_t naming the device
+ * and the dummy "write head" header that marks where this batch begins on
+ * the device's buflist.  Every header in front of that marker is visited:
+ *   - if its hash lock cannot be taken without blocking it is skipped and
+ *     keeps ARC_L2_WRITING set, so it stays unreadable from the L2ARC;
+ *   - if the write failed, the header is dropped from the L2ARC: it is
+ *     unlinked from the buflist, its L2 header is freed and
+ *     arcstat_l2_size is reduced by its size (the same steps l2arc_evict()
+ *     takes), so no L2 header is leaked and no header with a NULL b_l2hdr
+ *     is left linked on the buflist;
+ *   - in all locked cases ARC_L2_WRITING is cleared.
+ *
+ * Afterwards the marker is unlinked and freed, everything queued on
+ * l2arc_free_on_write is released through its own free function, and the
+ * callback structure is freed.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+	l2arc_write_callback_t *cb;
+	l2arc_dev_t *dev;
+	list_t *buflist;
+	l2arc_data_free_t *df, *df_prev;
+	arc_buf_hdr_t *head, *ab, *ab_prev;
+	l2arc_buf_hdr_t *abl2;
+	kmutex_t *hash_lock;
+
+	cb = zio->io_private;
+	ASSERT(cb != NULL);
+	dev = cb->l2wcb_dev;
+	ASSERT(dev != NULL);
+	head = cb->l2wcb_head;
+	ASSERT(head != NULL);
+	buflist = dev->l2ad_buflist;
+	ASSERT(buflist != NULL);
+	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+	    l2arc_write_callback_t *, cb);
+
+	if (zio->io_error != 0)
+		ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+	mutex_enter(&l2arc_buflist_mtx);
+
+	/*
+	 * All writes completed, or an error was hit.
+	 */
+	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
+		/* Fetch the predecessor first: ab may be unlinked below. */
+		ab_prev = list_prev(buflist, ab);
+
+		hash_lock = HDR_LOCK(ab);
+		if (!mutex_tryenter(hash_lock)) {
+			/*
+			 * This buffer misses out.  It may be in a stage
+			 * of eviction.  Its ARC_L2_WRITING flag will be
+			 * left set, denying reads to this buffer.
+			 */
+			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
+			continue;
+		}
+
+		if (zio->io_error != 0) {
+			/*
+			 * Error - drop the L2ARC entry entirely rather
+			 * than just forgetting the pointer to it.
+			 */
+			list_remove(buflist, ab);
+			abl2 = ab->b_l2hdr;
+			ab->b_l2hdr = NULL;
+			if (abl2 != NULL) {
+				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+			}
+		}
+
+		/*
+		 * Allow ARC to begin reads to this L2ARC entry.
+		 */
+		ab->b_flags &= ~ARC_L2_WRITING;
+
+		mutex_exit(hash_lock);
+	}
+
+	atomic_inc_64(&l2arc_writes_done);
+	list_remove(buflist, head);
+	kmem_cache_free(hdr_cache, head);
+	mutex_exit(&l2arc_buflist_mtx);
+
+	/*
+	 * Free buffers that were tagged for destruction.
+	 */
+	mutex_enter(&l2arc_free_on_write_mtx);
+	buflist = l2arc_free_on_write;
+	for (df = list_tail(buflist); df; df = df_prev) {
+		df_prev = list_prev(buflist, df);
+		ASSERT(df->l2df_data != NULL);
+		ASSERT(df->l2df_func != NULL);
+		df->l2df_func(df->l2df_data, df->l2df_size);
+		list_remove(buflist, df);
+		kmem_free(df, sizeof (l2arc_data_free_t));
+	}
+	mutex_exit(&l2arc_free_on_write_mtx);
+
+	kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ *
+ * zio->io_private carries the l2arc_read_callback_t for this read.  When
+ * the checksum matches, the zio reports no error and the header was not
+ * flagged as evicted from the L2ARC, io_private is switched to the buffer
+ * and arc_read_done() is called directly.  Otherwise the read is reissued
+ * with zio_read() using the spa, block pointer, flags and bookmark saved
+ * in the callback, and waited for here; its status becomes this zio's
+ * io_error.  The callback structure is freed in both cases.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ zio_t *rzio;
+ kmutex_t *hash_lock;
+ int equal, err = 0;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ buf = cb->l2rcb_buf;
+ ASSERT(buf != NULL);
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ equal = arc_cksum_equal(buf);
+ if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = buf;
+ arc_read_done(zio);
+ } else {
+ mutex_exit(hash_lock);
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ if (!equal)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ zio->io_flags &= ~ZIO_FLAG_DONT_CACHE;
+ rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp,
+ buf->b_data, zio->io_size, arc_read_done, buf,
+ zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
+
+ /*
+ * Since this is a separate thread, we can wait on this
+ * I/O whether there is an io_waiter or not.
+ */
+ err = zio_wait(rzio);
+
+ /*
+ * Let the resent I/O call arc_read_done() instead.
+ * io_error is set to the reissued I/O error status.
+ */
+ zio->io_done = NULL;
+ zio->io_waiter = NULL;
+ zio->io_error = err;
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache.  This is used within loops (0..3) to cycle through lists in the
+ * desired order.  This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists:
+ *
+ *	0  MFU metadata		2  MFU data
+ *	1  MRU metadata		3  MRU data
+ *
+ * This function returns a locked list, and also returns the lock pointer
+ * through 'lock'; the caller is responsible for dropping that mutex.
+ *
+ * list_num outside 0..3 is a caller bug and trips the ASSERT on debug
+ * builds.  Where ASSERT is compiled out such a value is treated like 3,
+ * so that 'list' and '*lock' are never used uninitialized.
+ */
+static list_t *
+l2arc_list_locked(int list_num, kmutex_t **lock)
+{
+	list_t *list;
+
+	ASSERT(list_num >= 0 && list_num <= 3);
+
+	switch (list_num) {
+	case 0:
+		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+		*lock = &arc_mfu->arcs_mtx;
+		break;
+	case 1:
+		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+		*lock = &arc_mru->arcs_mtx;
+		break;
+	case 2:
+		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+		*lock = &arc_mfu->arcs_mtx;
+		break;
+	case 3:
+	default:
+		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
+		*lock = &arc_mru->arcs_mtx;
+		break;
+	}
+
+	ASSERT(!(MUTEX_HELD(*lock)));
+	mutex_enter(*lock);
+	return (list);
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ *
+ * The caller must hold l2arc_dev_mtx (asserted).  The device's buflist is
+ * walked from its tail under l2arc_buflist_mtx.  Hash locks are only
+ * try-locked while that mutex is held; on a miss the buflist mutex is
+ * dropped, the hash lock is waited for and released again, and the walk
+ * restarts from the tail.
+ *
+ * Nothing is done when the device has no buflist, or when this is still
+ * the first sweep over the device and 'all' is not set.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ l2arc_buf_hdr_t *abl2;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
+
+ buflist = dev->l2ad_buflist;
+
+ if (buflist == NULL)
+ return;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * dev->l2ad_write))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&l2arc_buflist_mtx);
+ for (ab = list_tail(buflist); ab; ab = ab_prev) {
+ /* Fetch the predecessor first: ab may be unlinked below. */
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&l2arc_buflist_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ if (HDR_L2_WRITE_HEAD(ab)) {
+ /*
+ * We hit a write head node. Leave it for
+ * l2arc_write_done().
+ */
+ list_remove(buflist, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ /*
+ * Headers whose device address lies beyond the target or before
+ * the write hand end the walk (unless everything is evicted).
+ */
+ if (!all && ab->b_l2hdr != NULL &&
+ (ab->b_l2hdr->b_daddr > taddr ||
+ ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (HDR_FREE_IN_PROGRESS(ab)) {
+ /*
+ * Already on the path to destruction.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_state == arc_l2c_only) {
+ ASSERT(!HDR_L2_READING(ab));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_size.
+ */
+ arc_change_state(arc_anon, ab, hash_lock);
+ arc_hdr_destroy(ab);
+ } else {
+ /*
+ * Tell ARC this no longer exists in L2ARC.
+ */
+ if (ab->b_l2hdr != NULL) {
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+ list_remove(buflist, ab);
+
+ /*
+ * This may have been leftover after a
+ * failed write.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(ab)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ ab->b_flags |= ARC_L2_EVICTED;
+ }
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ /*
+ * Report the span between the old evict position and the target to
+ * the spa as a negative delta, then record the new evict position.
+ */
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ dev->l2ad_evict = taddr;
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ *
+ * The caller must hold l2arc_dev_mtx (asserted).  The four lists handed
+ * out by l2arc_list_locked() are scanned in order, each from its tail.
+ * A list scan ends once more than 'headroom' bytes of headers have been
+ * passed; the whole search ends once the next buffer would push the batch
+ * past the device's l2ad_write size.  Headers are skipped when their hash
+ * lock cannot be taken without blocking, when they belong to another spa,
+ * when they already have an L2 header, when they have I/O in progress or
+ * are flagged not to be L2-cached, or when they have no data buffer.
+ *
+ * Each selected buffer gets a new L2 header, is linked onto the device
+ * buflist and is written with zio_write_phys() under a root zio whose
+ * completion callback is l2arc_write_done().  The function waits for that
+ * root zio before returning.  If nothing was selected, the dummy head
+ * header is freed and the function returns without issuing any I/O.
+ *
+ * NOTE(review): hash_lock is released unconditionally on every path, even
+ * when have_lock reported it as already held on entry -- confirm that the
+ * calling thread can never hold a hash lock here.
+ */
+static void
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *ab, *ab_prev, *head;
+ l2arc_buf_hdr_t *hdrl2;
+ list_t *list;
+ uint64_t passed_sz, write_sz, buf_sz;
+ uint64_t target_sz = dev->l2ad_write;
+ uint64_t headroom = dev->l2ad_write * l2arc_headroom;
+ void *buf_data;
+ kmutex_t *hash_lock, *list_lock;
+ boolean_t have_lock, full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+
+ ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
+ ASSERT(dev->l2ad_vdev != NULL);
+
+ pio = NULL;
+ write_sz = 0;
+ full = B_FALSE;
+ /* Dummy header marking where this batch starts on the buflist. */
+ head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ head->b_flags |= ARC_L2_WRITE_HEAD;
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ for (int try = 0; try <= 3; try++) {
+ list = l2arc_list_locked(try, &list_lock);
+ passed_sz = 0;
+
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (!have_lock && !mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += ab->b_size;
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_spa != spa) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * Already in L2ARC.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (HDR_IO_IN_PROGRESS(ab) || HDR_DONT_L2CACHE(ab)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if ((write_sz + ab->b_size) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_buf == NULL) {
+ DTRACE_PROBE1(l2arc__buf__null, void *, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ /* First buffer of the batch: set up the root zio lazily. */
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ list_insert_head(dev->l2ad_buflist, head);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ /*
+ * Create and add a new L2ARC header.
+ */
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2->b_dev = dev;
+ hdrl2->b_daddr = dev->l2ad_hand;
+
+ ab->b_flags |= ARC_L2_WRITING;
+ ab->b_l2hdr = hdrl2;
+ list_insert_head(dev->l2ad_buflist, ab);
+ /* Capture data pointer and size before the hash lock is dropped. */
+ buf_data = ab->b_buf->b_data;
+ buf_sz = ab->b_size;
+
+ /*
+ * Compute and store the buffer cksum before
+ * writing. On debug the cksum is verified first.
+ */
+ arc_cksum_verify(ab->b_buf);
+ arc_cksum_compute(ab->b_buf, B_TRUE);
+
+ mutex_exit(hash_lock);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ write_sz += buf_sz;
+ dev->l2ad_hand += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ /* Nothing qualified: the marker was never linked, just free it. */
+ if (pio == NULL) {
+ ASSERT3U(write_sz, ==, 0);
+ kmem_cache_free(hdr_cache, head);
+ return;
+ }
+
+ ASSERT3U(write_sz, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_size, write_sz);
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+
+ /*
+ * Bump device hand to the device start if it is approaching the end.
+ * l2arc_evict() will already have evicted ahead for this case.
+ */
+ if (dev->l2ad_hand >= (dev->l2ad_end - dev->l2ad_write)) {
+ spa_l2cache_space_update(dev->l2ad_vdev, 0,
+ dev->l2ad_end - dev->l2ad_hand);
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ }
+
+ (void) zio_wait(pio);
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ *
+ * Each iteration sleeps on l2arc_feed_thr_cv (L2ARC_FEED_DELAY seconds
+ * the first time, l2arc_feed_secs seconds afterwards), then, under
+ * l2arc_dev_mtx, picks the next device with l2arc_dev_get_next(), evicts
+ * the region about to be overwritten and writes a new batch of buffers.
+ * An iteration is skipped when there are no devices, when
+ * arc_reclaim_needed() reports true, or when no device is returned.
+ *
+ * The loop runs with l2arc_feed_thr_lock held and ends once
+ * l2arc_thread_exit is non-zero; the thread then clears the flag and
+ * broadcasts on the condition variable before exiting, which is what
+ * l2arc_fini() waits for.
+ */
+static void
+l2arc_feed_thread(void)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ int interval;
+ boolean_t startup = B_TRUE;
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ while (l2arc_thread_exit == 0) {
+ /*
+ * Initially pause for L2ARC_FEED_DELAY seconds as a grace
+ * interval during boot, followed by l2arc_feed_secs seconds
+ * thereafter.
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ if (startup) {
+ interval = L2ARC_FEED_DELAY;
+ startup = B_FALSE;
+ } else {
+ interval = l2arc_feed_secs;
+ }
+ (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+ lbolt + (hz * interval));
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+
+ /*
+ * Do nothing until L2ARC devices exist.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ spa = dev->l2ad_spa;
+ ASSERT(spa != NULL);
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, dev->l2ad_write, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ l2arc_write_buffers(spa, dev);
+ mutex_exit(&l2arc_dev_mtx);
+ }
+
+ /* Acknowledge the exit request so the waiter can proceed. */
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Register a cache vdev with the L2ARC.  The spa is expected to have
+ * validated and opened the vdev already.
+ *
+ * A zeroed l2arc_dev_t is allocated for the device covering the byte
+ * range [start, end), with both the write hand and the evict pointer at
+ * 'start' and the per-interval write size taken from l2arc_write_max.
+ * The device receives an empty buflist, its size is reported through
+ * spa_l2cache_space_update(), and it is then linked onto the global
+ * device list under l2arc_dev_mtx.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+{
+	l2arc_dev_t *adddev;
+
+	/* Build the device entry. */
+	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+	adddev->l2ad_vdev = vd;
+	adddev->l2ad_spa = spa;
+	adddev->l2ad_start = start;
+	adddev->l2ad_end = end;
+	adddev->l2ad_evict = adddev->l2ad_start;
+	adddev->l2ad_hand = adddev->l2ad_start;
+	adddev->l2ad_first = B_TRUE;
+	adddev->l2ad_write = l2arc_write_max;
+	ASSERT3U(adddev->l2ad_write, >, 0);
+
+	/* Headers of all ARC buffers still valid on the device go here. */
+	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l2node));
+
+	spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+
+	/* Publish the device on the global list. */
+	mutex_enter(&l2arc_dev_mtx);
+	list_insert_head(l2arc_dev_list, adddev);
+	atomic_inc_64(&l2arc_ndev);
+	mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Unregister a cache vdev from the L2ARC.
+ *
+ * With l2arc_dev_mtx held, the device entry whose vdev is 'vd' is looked
+ * up (it is asserted to exist), unlinked from the global list, flushed
+ * with l2arc_evict(..., B_TRUE) and freed together with its buflist.
+ * l2arc_dev_last is reset because it may have pointed at this device.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+	l2arc_dev_t *remdev;
+
+	/*
+	 * We can only grab the spa config lock when cache device writes
+	 * complete.
+	 */
+	ASSERT3U(l2arc_writes_sent, ==, l2arc_writes_done);
+
+	mutex_enter(&l2arc_dev_mtx);
+
+	/* Walk the device list until the entry for this vdev turns up. */
+	remdev = list_head(l2arc_dev_list);
+	while (remdev != NULL && remdev->l2ad_vdev != vd)
+		remdev = list_next(l2arc_dev_list, remdev);
+	ASSERT(remdev != NULL);
+
+	/* Take it off the global list. */
+	list_remove(l2arc_dev_list, remdev);
+	l2arc_dev_last = NULL;		/* may have been invalidated */
+
+	/* Flush the device: drop every buflist entry and ARC reference. */
+	l2arc_evict(remdev, 0, B_TRUE);
+	list_destroy(remdev->l2ad_buflist);
+	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+	kmem_free(remdev, sizeof (l2arc_dev_t));
+
+	atomic_dec_64(&l2arc_ndev);
+	mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Initialize global L2ARC state: reset the exit flag, the device count
+ * and the write counters, initialize the L2ARC mutexes and the feed
+ * thread's condition variable, create the (empty) device list and
+ * free-on-write list, and start l2arc_feed_thread().
+ *
+ * NOTE(review): the value returned by thread_create() is discarded.
+ */
+void
+l2arc_init(void)
+{
+	l2arc_thread_exit = 0;
+	l2arc_ndev = 0;
+	l2arc_writes_sent = 0;
+	l2arc_writes_done = 0;
+
+	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	l2arc_dev_list = &L2ARC_dev_list;
+	l2arc_free_on_write = &L2ARC_free_on_write;
+	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+	    offsetof(l2arc_dev_t, l2ad_node));
+	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+	    offsetof(l2arc_data_free_t, l2df_list_node));
+
+	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+	    TS_RUN, minclsyspri);
+}
+
+/*
+ * Shut the L2ARC down: wake the feed thread, ask it to exit by setting
+ * l2arc_thread_exit, and wait on l2arc_feed_thr_cv until the thread has
+ * cleared the flag again.  Then destroy the L2ARC mutexes, the condition
+ * variable, and the device and free-on-write lists.
+ */
+void
+l2arc_fini(void)
+{
+	mutex_enter(&l2arc_feed_thr_lock);
+	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
+	l2arc_thread_exit = 1;
+	while (l2arc_thread_exit != 0)
+		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+	mutex_exit(&l2arc_feed_thr_lock);
+
+	mutex_destroy(&l2arc_feed_thr_lock);
+	cv_destroy(&l2arc_feed_thr_cv);
+	mutex_destroy(&l2arc_dev_mtx);
+	mutex_destroy(&l2arc_buflist_mtx);
+	mutex_destroy(&l2arc_free_on_write_mtx);
+
+	list_destroy(l2arc_dev_list);
+	list_destroy(l2arc_free_on_write);
+}
diff --git a/zfs/lib/libzpool/bplist.c b/zfs/lib/libzpool/bplist.c
new file mode 100644
index 000000000..099d499c0
--- /dev/null
+++ b/zfs/lib/libzpool/bplist.c
@@ -0,0 +1,313 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)bplist.c 1.5 07/10/29 SMI"
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+static int
+bplist_hold(bplist_t *bpl)
+{
+ ASSERT(MUTEX_HELD(&bpl->bpl_lock));
+ if (bpl->bpl_dbuf == NULL) {
+ int err = dmu_bonus_hold(bpl->bpl_mos,
+ bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+ if (err)
+ return (err);
+ bpl->bpl_phys = bpl->bpl_dbuf->db_data;
+ }
+ return (0);
+}
+
+uint64_t
+bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
+ BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
+
+ return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
+ DMU_OT_BPLIST_HDR, size, tx));
+}
+
+void
+bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY(dmu_object_free(mos, object, tx) == 0);
+}
+
+int
+bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(mos, object, &doi);
+ if (err)
+ return (err);
+
+ mutex_enter(&bpl->bpl_lock);
+
+ ASSERT(bpl->bpl_dbuf == NULL);
+ ASSERT(bpl->bpl_phys == NULL);
+ ASSERT(bpl->bpl_cached_dbuf == NULL);
+ ASSERT(bpl->bpl_queue == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
+
+ bpl->bpl_mos = mos;
+ bpl->bpl_object = object;
+ bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
+ bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
+ bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
+
+ mutex_exit(&bpl->bpl_lock);
+ return (0);
+}
+
+void
+bplist_close(bplist_t *bpl)
+{
+ mutex_enter(&bpl->bpl_lock);
+
+ ASSERT(bpl->bpl_queue == NULL);
+
+ if (bpl->bpl_cached_dbuf) {
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ bpl->bpl_cached_dbuf = NULL;
+ }
+ if (bpl->bpl_dbuf) {
+ dmu_buf_rele(bpl->bpl_dbuf, bpl);
+ bpl->bpl_dbuf = NULL;
+ bpl->bpl_phys = NULL;
+ }
+
+ mutex_exit(&bpl->bpl_lock);
+}
+
+boolean_t
+bplist_empty(bplist_t *bpl)
+{
+ boolean_t rv;
+
+ if (bpl->bpl_object == 0)
+ return (B_TRUE);
+
+ mutex_enter(&bpl->bpl_lock);
+ VERIFY(0 == bplist_hold(bpl)); /* XXX */
+ rv = (bpl->bpl_phys->bpl_entries == 0);
+ mutex_exit(&bpl->bpl_lock);
+
+ return (rv);
+}
+
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+ int err = 0;
+
+ if (bpl->bpl_cached_dbuf == NULL ||
+ bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+ if (bpl->bpl_cached_dbuf != NULL)
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ err = dmu_buf_hold(bpl->bpl_mos,
+ bpl->bpl_object, blkid << bpl->bpl_blockshift,
+ bpl, &bpl->bpl_cached_dbuf);
+ ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+ 1ULL << bpl->bpl_blockshift);
+ }
+ return (err);
+}
+
+int
+bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
+{
+ uint64_t blk, off;
+ blkptr_t *bparray;
+ int err;
+
+ mutex_enter(&bpl->bpl_lock);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ if (*itorp >= bpl->bpl_phys->bpl_entries) {
+ mutex_exit(&bpl->bpl_lock);
+ return (ENOENT);
+ }
+
+ blk = *itorp >> bpl->bpl_bpshift;
+ off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ bparray = bpl->bpl_cached_dbuf->db_data;
+ *bp = bparray[off];
+ (*itorp)++;
+ mutex_exit(&bpl->bpl_lock);
+ return (0);
+}
+
+/*
+ * Append a copy of *bp to the bplist object as part of tx.  The stored
+ * copy has its fill count and checksum zeroed, and the header's entry
+ * count and space totals are updated to match.
+ *
+ * Returns 0 on success, or the error from bplist_hold() or
+ * bplist_cache().  bpl_lock is dropped on every return path.
+ */
+int
+bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	int err;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	mutex_enter(&bpl->bpl_lock);
+	err = bplist_hold(bpl);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
+	}
+
+	/* the new entry goes in slot 'off' of data block 'blk' */
+	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
+	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
+
+	err = bplist_cache(bpl, blk);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
+	}
+
+	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+	bparray = bpl->bpl_cached_dbuf->db_data;
+	bparray[off] = *bp;
+
+	/* We never need the fill count. */
+	bparray[off].blk_fill = 0;
+
+	/* The bplist will compress better if we can leave off the checksum */
+	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	bpl->bpl_phys->bpl_entries++;
+	bpl->bpl_phys->bpl_bytes +=
+	    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
+	if (bpl->bpl_havecomp) {
+		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
+		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
+	}
+	mutex_exit(&bpl->bpl_lock);
+
+	return (0);
+}
+
+/*
+ * Deferred entry; will be written later by bplist_sync().
+ */
+void
+bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+{
+ bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
+
+ ASSERT(!BP_IS_HOLE(bp));
+ mutex_enter(&bpl->bpl_lock);
+ bpq->bpq_blk = *bp;
+ bpq->bpq_next = bpl->bpl_queue;
+ bpl->bpl_queue = bpq;
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+{
+ bplist_q_t *bpq;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpq = bpl->bpl_queue) != NULL) {
+ bpl->bpl_queue = bpq->bpq_next;
+ mutex_exit(&bpl->bpl_lock);
+ VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
+ kmem_free(bpq, sizeof (*bpq));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
+{
+ mutex_enter(&bpl->bpl_lock);
+ ASSERT3P(bpl->bpl_queue, ==, NULL);
+ VERIFY(0 == bplist_hold(bpl));
+ dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+ VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+ bpl->bpl_object, 0, -1ULL, tx));
+ bpl->bpl_phys->bpl_entries = 0;
+ bpl->bpl_phys->bpl_bytes = 0;
+ if (bpl->bpl_havecomp) {
+ bpl->bpl_phys->bpl_comp = 0;
+ bpl->bpl_phys->bpl_uncomp = 0;
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
+
+int
+bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err;
+
+ mutex_enter(&bpl->bpl_lock);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ *usedp = bpl->bpl_phys->bpl_bytes;
+ if (bpl->bpl_havecomp) {
+ *compp = bpl->bpl_phys->bpl_comp;
+ *uncompp = bpl->bpl_phys->bpl_uncomp;
+ }
+ mutex_exit(&bpl->bpl_lock);
+
+ if (!bpl->bpl_havecomp) {
+ uint64_t itor = 0, comp = 0, uncomp = 0;
+ blkptr_t bp;
+
+ while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
+ comp += BP_GET_PSIZE(&bp);
+ uncomp += BP_GET_UCSIZE(&bp);
+ }
+ if (err == ENOENT)
+ err = 0;
+ *compp = comp;
+ *uncompp = uncomp;
+ }
+
+ return (err);
+}
diff --git a/zfs/lib/libzpool/dbuf.c b/zfs/lib/libzpool/dbuf.c
new file mode 100644
index 000000000..08d17fb58
--- /dev/null
+++ b/zfs/lib/libzpool/dbuf.c
@@ -0,0 +1,2251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dbuf.c 1.32 08/03/20 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx);
+static arc_done_func_t dbuf_write_ready;
+static arc_done_func_t dbuf_write_done;
+
+int zfs_mdcomp_disable = 0;
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ refcount_create(&db->db_holds);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ cv_destroy(&db->db_changed);
+ refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
+ crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+ return (crc);
+}
+
+/* Expression macro: the caller supplies the terminating semicolon. */
+#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t obj = dn->dn_object;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *db;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_impl_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid = db->db_blkid;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, 1);
+
+ return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table.  This operation will
+ * fail (assert) if there are any existing holds on the db, or if
+ * the db has not been marked DB_EVICTING.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf, **dbp;
+
+	/*
+	 * We mustn't hold db_mtx to maintain lock ordering:
+	 * DBUF_HASH_MUTEX > db_mtx.
+	 */
+	ASSERT(refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_state == DB_EVICTING);
+	ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	dbp = &h->hash_table[idx];
+	while ((dbf = *dbp) != db) {
+		/* db must be on this chain; check before dbf is used */
+		ASSERT(dbf != NULL);
+		dbp = &dbf->db_hash_next;
+	}
+	/* unlink db by pointing its predecessor's link at its successor */
+	*dbp = db->db_hash_next;
+	db->db_hash_next = NULL;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static arc_evict_func_t dbuf_do_evict;
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level != 0 || db->db_evict_func == NULL)
+ return;
+
+ if (db->db_user_data_ptr_ptr)
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ db->db_evict_func(&db->db, db->db_user_ptr);
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+}
+
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ dbuf_clear(db);
+ dbuf_destroy(db);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+ */
+ while (hsize * 4096 < physmem * PAGESIZE)
+ hsize <<= 1;
+
+retry:
+ h->hash_table_mask = hsize - 1;
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+ kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ list_head(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ if (db->db_level == 0) {
+ /* we can be momentarily larger in dnode_set_blksz() */
+ if (db->db_blkid != DB_BONUS_BLKID && dn) {
+ ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
+ }
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ /*
+ * it should only be modified in syncing
+ * context, so make sure we only have
+ * one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the struct_rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+ * If the blkptr isn't set but they have nonzero data,
+ * it had better be dirty, otherwise we'll lose that
+ * data when we evict this buffer.
+ */
+ if (db->db_dirtycnt == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ }
+ }
+}
+#endif
+
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ }
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+ db->db_buf = buf;
+ if (buf != NULL) {
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+ if (!arc_released(buf))
+ arc_set_callback(buf, dbuf_do_evict, db);
+ dbuf_update_data(db);
+ } else {
+ dbuf_evict_user(db);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+}
+
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+ if (dn->dn_datablkshift) {
+ return (offset >> dn->dn_datablkshift);
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
+
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+ * All reads are synchronous, so we must have a hold on the dbuf
+ */
+ ASSERT(refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* we were freed in flight; disregard any error */
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else if (zio == NULL || zio->io_error == 0) {
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ db->db_state = DB_UNCACHED;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, NULL);
+}
+
+static void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+{
+ blkptr_t *bp;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_NOWAIT;
+
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ /* We need the struct_rwlock to prevent db_blkptr from changing. */
+ ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ int bonuslen = db->db_dnode->dn_bonuslen;
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
+ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN);
+ if (bonuslen < DN_MAX_BONUSLEN)
+ bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
+ bonuslen);
+ dbuf_update_data(db);
+ db->db_state = DB_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
+ bp = NULL;
+ else
+ bp = db->db_blkptr;
+
+ if (bp == NULL)
+ dprintf_dbuf(db, "blkptr: %s\n", "NULL");
+ else
+ dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
+
+ if (bp == NULL || BP_IS_HOLE(bp)) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(bp == NULL || BP_IS_HOLE(bp));
+ dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_state = DB_CACHED;
+ *flags |= DB_RF_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ db->db_state = DB_READ;
+ mutex_exit(&db->db_mtx);
+
+ zb.zb_objset = db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ dbuf_add_ref(db, NULL);
+ /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+ ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
+ (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
+ db->db_level > 0 ? byteswap_uint64_array :
+ dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+ (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+ if (aflags & ARC_CACHED)
+ *flags |= DB_RF_CACHED;
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ int havepzio = (zio != NULL);
+ int prefetch;
+
+ /*
+ * We don't have to hold the mutex to check db_state because it
+ * can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ } else if (db->db_state == DB_UNCACHED) {
+ if (zio == NULL) {
+ zio = zio_root(db->db_dnode->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ }
+ dbuf_read_impl(db, zio, &flags);
+
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, flags & DB_RF_CACHED);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ if (!havepzio)
+ err = zio_wait(zio);
+ } else {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ mutex_enter(&db->db_mtx);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ }
+ mutex_exit(&db->db_mtx);
+ }
+
+ ASSERT(err || havepzio || db->db_state == DB_CACHED);
+ return (err);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ db->db_state = DB_FILL;
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of
+ * buffers that have been modified in a previous transaction
+ * group before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer, we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and it is referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dr->dt.dl.dr_data = arc_buf_alloc(
+ db->db_dnode->dn_objset->os_spa, size, db, type);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ dbuf_set_data(db, NULL);
+ }
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ /* free this block */
+ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
+ /* XXX can get silent EIO here */
+ (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
+ }
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
+void
+dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ if (db->db_level != 0)
+ continue;
+ dprintf_dbuf(db, "found buf %s\n", "");
+ if (db->db_blkid < blkid ||
+ db->db_blkid >= blkid+nblks)
+ continue;
+
+ /* found a level 0 buffer in the range */
+ if (dbuf_undirty(db, tx))
+ continue;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_clear(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ if (db->db_last_dirty != NULL) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use", re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+ * Either uncache it (if it's not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+ /* clear the contents if it's cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ bzero(db->db.db_data, db->db.db_size);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+static int
+dbuf_block_freeable(dmu_buf_impl_t *db)
+{
+ dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+ uint64_t birth_txg = 0;
+
+ /*
+ * We don't need any locking to protect db_blkptr:
+ * If it's syncing, then db_last_dirty will be set
+ * so we'll ignore db_blkptr.
+ */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ if (db->db_last_dirty)
+ birth_txg = db->db_last_dirty->dr_txg;
+ else if (db->db_blkptr)
+ birth_txg = db->db_blkptr->blk_birth;
+
+ /* If we don't exist or are in a snapshot, we can't be freed */
+ if (birth_txg)
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, birth_txg));
+ else
+ return (FALSE);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *obuf;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ /* XXX does *this* func really need the lock? */
+ ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+
+ /*
+ * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+ * is OK, because there can be no other references to the db
+ * when we are changing its size, so no concurrent DB_FILL can
+ * be happening.
+ */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dbuf_will_dirty(db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+
+ /* copy old block data to the new block */
+ obuf = db->db_buf;
+ bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+ db->db.db_size = size;
+
+ if (db->db_level == 0) {
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ db->db_last_dirty->dt.dl.dr_data = buf;
+ }
+ mutex_exit(&db->db_mtx);
+
+ dnode_willuse_space(db->db_dnode, size-osize, tx);
+}
+
+/*
+ * Mark 'db' dirty in the transaction group of 'tx' and return the dirty
+ * record for that txg.
+ *
+ * If a record for this txg already exists it is returned as is; for level-0
+ * non-bonus buffers dbuf_unoverride() is called first and, unless the
+ * object is the meta-dnode, the ARC buffer is thawed.
+ *
+ * Otherwise a new record is allocated and linked into the db_last_dirty
+ * chain, a hold tagged with the txg is added to the dbuf, and the record is
+ * attached either to the dirty record of the parent indirect block (which
+ * is dirtied by a recursive call) or to dn_dirty_records[] of the dnode.
+ * Finally the dnode itself is marked dirty with dnode_setdirty().
+ */
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dbuf_dirty_record_t **drp, *dr;
+ int drop_struct_lock = FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ * XXX We may want to prohibit dirtying in syncing context even
+ * if they did pre-dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_objset->os_dsl_dataset == NULL ||
+ dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
+
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL);
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+ !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx =
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If this buffer is already dirty, we're done.
+ * The loop skips records with a newer txg, so on exit drp is the
+ * link at which a record for this txg lives or must be inserted.
+ */
+ drp = &db->db_last_dirty;
+ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+ drp = &dr->dr_next;
+ if (dr && dr->dr_txg == tx->tx_txg) {
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ arc_buf_thaw(db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ return (dr);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos, a spa os, or we're initializing the os. However, we are
+ * allowed to dirty in syncing context provided we already
+ * dirtied it in open context. Hence we must make this
+ * assertion only if we're not already dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ os->os_dsl_dataset == NULL ||
+ !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
+ !BP_IS_HOLE(os->os_rootbp));
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ */
+ if (dbuf_block_freeable(db)) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn, -willfree, tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so that we
+ * can modify it without impacting possible other users
+ * of this cached data block. Note that indirect
+ * blocks and private objects are not released until the
+ * syncing state (since they are only modified then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ /* indirect record: set up the list that collects dirty children */
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ dr->dr_next = *drp;
+ *drp = dr;
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ /* the bonus buffer's record goes straight onto the dnode's dirty list */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ return (dr);
+ }
+
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ /* take dn_struct_rwlock as reader unless it is already held as writer */
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ /* no usable indirect parent recorded yet: hold it explicitly */
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ parent_held = TRUE;
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level+1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /* possible race with dbuf_undirty() */
+ if (db->db_last_dirty == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ /* top-level block: the record hangs off the dnode itself */
+ ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL ||
+ db->db_parent == db->db_dnode->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ return (dr);
+}
+
+/*
+ * Remove the dirty record that 'db' has for the txg of 'tx', if any.
+ *
+ * Nothing is undone when there is no record for exactly this txg, or when
+ * the dbuf has more holds than dirty records (in that case only the free
+ * range for the block is cleared on the dnode).
+ *
+ * Returns 1 when removing the record's hold dropped the last hold and
+ * dbuf_evict() was called; returns 0 in every other case.
+ */
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ uint64_t txg = tx->tx_txg;
+ dbuf_dirty_record_t *dr, **drp;
+
+ ASSERT(txg != 0);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ mutex_enter(&db->db_mtx);
+
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+ if (dr->dr_txg <= txg)
+ break;
+ if (dr == NULL || dr->dr_txg < txg) {
+ mutex_exit(&db->db_mtx);
+ return (0);
+ }
+ ASSERT(dr->dr_txg == txg);
+
+ /*
+ * If this buffer is currently held, we cannot undirty
+ * it, since one of the current holders may be in the
+ * middle of an update. Note that users of dbuf_undirty()
+ * should not place a hold on the dbuf before the call.
+ */
+ if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ /* Make sure we don't toss this buffer at sync phase */
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ /* XXX would be nice to fix up dn_towrite_space[] */
+
+ /* unlink the record from the dbuf's dirty chain */
+ *drp = dr->dr_next;
+
+ /* and from whichever list it was queued on for syncing */
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_level+1 == dn->dn_nlevels) {
+ ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (db->db_level == 0) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ /* drop the record's private copy, if it has one */
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ } else {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ /* release the hold that dbuf_dirty() took with the txg as tag */
+ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ arc_buf_t *buf = db->db_buf;
+
+ ASSERT(arc_released(buf));
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ /* note: no mutex_exit() on this path; db_mtx is still held here */
+ dbuf_evict(db);
+ return (1);
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+/*
+ * Read the dbuf's current contents (the read must succeed and does not
+ * trigger prefetch) and then mark it dirty in tx's transaction group.
+ * DB_RF_HAVESTRUCT is added to the read flags when the caller already
+ * holds dn_struct_rwlock as writer.
+ */
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	int read_flags;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	read_flags = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
+	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+		read_flags |= DB_RF_HAVESTRUCT;
+	}
+
+	(void) dbuf_read(db, NULL, read_flags);
+	(void) dbuf_dirty(db, tx);
+}
+
+/*
+ * Announce that the caller is about to overwrite the whole of a level-0,
+ * non-bonus dbuf: the buffer is prepared with dbuf_noread() rather than
+ * being read, and is then marked dirty in tx's transaction group.
+ * The caller must already hold the dbuf (asserted).
+ */
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ /* meta-dnode buffers may only be filled when dmu_tx_private_ok(tx) */
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/*
+ * Finish a fill: if the dbuf is in DB_FILL state, move it to DB_CACHED and
+ * wake everyone waiting on db_changed. A level-0 buffer that was freed
+ * while it was being filled has its contents zeroed first. A dbuf in any
+ * other state is left untouched. 'tx' is not used.
+ */
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	mutex_enter(&db->db_mtx);
+	DBUF_VERIFY(db);
+
+	if (db->db_state != DB_FILL) {
+		/* no fill in progress, nothing to complete */
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	if (db->db_level == 0 && db->db_freed_in_flight) {
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		/* we were freed while filling */
+		/* XXX dbuf_undirty? */
+		bzero(db->db.db_data, db->db.db_size);
+		db->db_freed_in_flight = FALSE;
+	}
+
+	db->db_state = DB_CACHED;
+	cv_broadcast(&db->db_changed);
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * "Clear" the contents of this dbuf. This will mark the dbuf
+ * EVICTING and clear *most* of its references. Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
+ * in this case. For callers from the DMU we will usually see:
+ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ * DMU: dbuf_clear()->arc_buf_evict()
+ * ARC: dbuf_do_evict()->dbuf_destroy()
+ *
+ * Must be called with db_mtx held and no holds on the dbuf (both
+ * asserted). db_mtx is released here unless arc_buf_evict() reports
+ * that the dbuf is already gone.
+ */
+void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+ dnode_t *dn = db->db_dnode;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ int dbuf_gone = FALSE;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ dbuf_evict_user(db);
+
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ /* bonus data is a zio buffer owned by the dbuf; free it here */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_data_pending == NULL);
+
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ /* only possible when the caller already holds dn_dbufs_mtx */
+ if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ list_remove(&dn->dn_dbufs, db);
+ dnode_rele(dn, db);
+ db->db_dnode = NULL;
+ }
+
+ if (db->db_buf)
+ dbuf_gone = arc_buf_evict(db->db_buf);
+
+ if (!dbuf_gone)
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb)
+ dbuf_rele(parent, db);
+}
+
+/*
+ * Locate the block pointer that references block 'blkid' at 'level' of
+ * dnode 'dn'.
+ *
+ * On success 0 is returned and *bpp points at the block pointer; *parentp
+ * is the held dbuf that contains it (the parent indirect block, or
+ * dn_dbuf when the pointer lives in the dnode and dn_dbuf is set).
+ * ENOENT is returned when the block has no parent yet; errors from
+ * holding or reading the parent indirect block are passed through.
+ * On every failure *parentp and *bpp are NULL.
+ * dn_struct_rwlock must be held (asserted).
+ */
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+    dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+	int nlevels, epbs;
+	int err;
+	blkptr_t *bparray;
+
+	*parentp = NULL;
+	*bpp = NULL;
+
+	ASSERT(blkid != DB_BONUS_BLKID);
+
+	/* an on-disk level count of zero is treated as a single level */
+	nlevels = (dn->dn_phys->dn_nlevels == 0) ?
+	    1 : dn->dn_phys->dn_nlevels;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	ASSERT3U(level * epbs, <, 64);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+	if (level >= nlevels ||
+	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+		/* the buffer has no parent yet */
+		return (ENOENT);
+	}
+
+	if (level < nlevels-1) {
+		/* this block is referenced from an indirect block */
+		err = dbuf_hold_impl(dn, level+1,
+		    blkid >> epbs, fail_sparse, NULL, parentp);
+		if (err)
+			return (err);
+
+		err = dbuf_read(*parentp, NULL,
+		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+		if (err) {
+			dbuf_rele(*parentp, NULL);
+			*parentp = NULL;
+			return (err);
+		}
+
+		/* index into the parent's array of block pointers */
+		bparray = (blkptr_t *)(*parentp)->db.db_data;
+		*bpp = bparray + (blkid & ((1ULL << epbs) - 1));
+		return (0);
+	}
+
+	/* the block is referenced from the dnode */
+	ASSERT3U(level, ==, nlevels-1);
+	ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+	    blkid < dn->dn_phys->dn_nblkptr);
+	if (dn->dn_dbuf) {
+		dbuf_add_ref(dn->dn_dbuf, NULL);
+		*parentp = dn->dn_dbuf;
+	}
+	*bpp = &dn->dn_phys->dn_blkptr[blkid];
+	return (0);
+}
+
+/*
+ * Allocate and initialise a dbuf for block 'blkid' at 'level' of 'dn'.
+ *
+ * The bonus dbuf is returned right after initialisation and is not put in
+ * the hash table. Any other dbuf is inserted into the hash table and the
+ * dnode's dn_dbufs list; if dbuf_hash_insert() finds an existing dbuf, the
+ * new one is freed and the existing one is returned instead. For a newly
+ * inserted dbuf a hold is added on the parent (unless the parent is
+ * dn_dbuf) and on the dnode.
+ *
+ * NOTE(review): dbuf_hold_impl() calls mutex_exit(&db->db_mtx) on the value
+ * returned from here, so dbuf_hash_insert() presumably leaves db_mtx held
+ * on the returned dbuf -- confirm against dbuf_hash_insert().
+ */
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt = 0;
+ db->db_dnode = dn;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+ db->db_immediate_evict = 0;
+ db->db_freed_in_flight = 0;
+
+ if (blkid == DB_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ /* bonus size: DN_MAX_BONUSLEN less the space of extra block pointers */
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ db->db.db_offset = DB_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ /* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t));
+ return (db);
+ } else {
+ /* indirect blocks use the indirect block size, data blocks dn_datablksz */
+ int blocksize =
+ db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+ * Hold the dn_dbufs_mtx while we get the new dbuf
+ * in the hash table *and* added to the dbufs list.
+ * This prevents a possible deadlock with someone
+ * trying to look up this dbuf before its added to the
+ * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ return (odb);
+ }
+ list_insert_head(&dn->dn_dbufs, db);
+ db->db_state = DB_UNCACHED;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t));
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ refcount_count(&dn->dn_holds) > 0);
+ (void) refcount_add(&dn->dn_holds, db);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
+
+/*
+ * Eviction callback: 'private' is the arc_buf_t whose b_private field
+ * points at the dbuf to evict. db_mtx is taken unless the calling thread
+ * already holds it. A dbuf already marked DB_EVICTING only needs to be
+ * destroyed; otherwise it must be DB_CACHED and goes through dbuf_evict()
+ * after its ARC buffer pointer is cleared. Always returns 0.
+ */
+static int
+dbuf_do_evict(void *private)
+{
+	arc_buf_t *abuf = private;
+	dmu_buf_impl_t *db = abuf->b_private;
+
+	if (!MUTEX_HELD(&db->db_mtx))
+		mutex_enter(&db->db_mtx);
+
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_state == DB_EVICTING) {
+		/* already cleared (see dbuf_clear()); finish the teardown */
+		mutex_exit(&db->db_mtx);
+		dbuf_destroy(db);
+		return (0);
+	}
+
+	ASSERT(db->db_state == DB_CACHED);
+	DBUF_VERIFY(db);
+	db->db_buf = NULL;
+	dbuf_evict(db);
+	return (0);
+}
+
+/*
+ * Final teardown of a dbuf that has no holds. A non-bonus dbuf is taken
+ * off its dnode's dn_dbufs list (if it still references a dnode, whose hold
+ * is then released) and removed from the hash table. The structure is then
+ * returned to dbuf_cache and its size handed back with arc_space_return().
+ */
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+	dnode_t *dn;
+
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_blkid != DB_BONUS_BLKID) {
+		dn = db->db_dnode;
+		if (dn != NULL) {
+			/* still linked on dn_dbufs: unlink it now */
+			mutex_enter(&dn->dn_dbufs_mtx);
+			list_remove(&dn->dn_dbufs, db);
+			mutex_exit(&dn->dn_dbufs_mtx);
+
+			dnode_rele(dn, db);
+			db->db_dnode = NULL;
+		}
+		dbuf_hash_remove(db);
+	}
+
+	db->db_parent = NULL;
+	db->db_buf = NULL;
+
+	/* everything else must already have been cleared by now */
+	ASSERT(!list_link_active(&db->db_link));
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	kmem_cache_free(dbuf_cache, db);
+	arc_space_return(sizeof (dmu_buf_impl_t));
+}
+
+/*
+ * Start an asynchronous, speculative read of level-0 block 'blkid' of 'dn'.
+ * Nothing is issued when the block has been freed, when a dbuf for it
+ * exists and is held, when its block pointer cannot be found, or when the
+ * block pointer is a hole. dn_struct_rwlock must be held (asserted).
+ */
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db = NULL;
+	blkptr_t *bp = NULL;
+	int active;
+
+	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+	if (dnode_block_freed(dn, blkid))
+		return;
+
+	/* dbuf_find() returns with db_mtx held */
+	db = dbuf_find(dn, 0, blkid);
+	if (db != NULL) {
+		/*
+		 * A held dbuf is assumed to be CACHED already, or about
+		 * to be read or filled, so there is nothing to prefetch.
+		 */
+		active = (refcount_count(&db->db_holds) > 0);
+		mutex_exit(&db->db_mtx);
+		if (active)
+			return;
+		db = NULL;
+	}
+
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) != 0)
+		return;
+
+	if (bp && !BP_IS_HOLE(bp)) {
+		uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+		zbookmark_t zb;
+
+		zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+		    dn->dn_objset->os_dsl_dataset->ds_object : 0;
+		zb.zb_object = dn->dn_object;
+		zb.zb_level = 0;
+		zb.zb_blkid = blkid;
+
+		(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
+		    dmu_ot[dn->dn_type].ot_byteswap,
+		    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &aflags, &zb);
+	}
+
+	/* drop the hold dbuf_findbp() took on the parent, if any */
+	if (db)
+		dbuf_rele(db, NULL);
+}
+
+/*
+ * Find or create the dbuf for block 'blkid' at 'level' of 'dn' and add a
+ * hold tagged 'tag' to it. On success 0 is returned and *dbp is set.
+ * With 'fail_sparse' set, ENOENT is returned when the block pointer is a
+ * hole or cannot be found; without it, an ENOENT from dbuf_findbp() is
+ * tolerated and the dbuf is created without a block pointer.
+ *
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ dmu_buf_impl_t *db, *parent = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(dn->dn_nlevels, >, level);
+
+ *dbp = NULL;
+top:
+ /* dbuf_find() returns with db_mtx held */
+ db = dbuf_find(dn, level, blkid);
+
+ if (db == NULL) {
+ blkptr_t *bp = NULL;
+ int err;
+
+ ASSERT3P(parent, ==, NULL);
+ err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ if (fail_sparse) {
+ if (err == 0 && bp && BP_IS_HOLE(bp))
+ err = ENOENT;
+ if (err) {
+ if (parent)
+ dbuf_rele(parent, NULL);
+ return (err);
+ }
+ }
+ if (err && err != ENOENT)
+ return (err);
+ db = dbuf_create(dn, level, blkid, parent, bp);
+ }
+
+ /*
+ * An unheld dbuf with an ARC buffer: take a reference on the buffer.
+ * If its data is gone, clear the dbuf and start over.
+ */
+ if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ arc_buf_add_ref(db->db_buf, db);
+ if (db->db_buf->b_data == NULL) {
+ dbuf_clear(db);
+ if (parent) {
+ dbuf_rele(parent, NULL);
+ parent = NULL;
+ }
+ goto top;
+ }
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
+ /*
+ * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ dbuf_set_data(db,
+ arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
+ db->db.db_size);
+ }
+ }
+
+ (void) refcount_add(&db->db_holds, tag);
+ dbuf_update_data(db);
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (parent)
+ dbuf_rele(parent, NULL);
+
+ ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3U(db->db_blkid, ==, blkid);
+ ASSERT3U(db->db_level, ==, level);
+ *dbp = db;
+
+ return (0);
+}
+
+/*
+ * Hold the level-0 dbuf for block 'blkid' of 'dn' with tag 'tag'.
+ * Returns the held dbuf, or NULL if dbuf_hold_impl() reports an error.
+ */
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *held;
+
+	if (dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &held) != 0)
+		return (NULL);
+	return (held);
+}
+
+/*
+ * Hold the dbuf for block 'blkid' at the given 'level' of 'dn' with tag
+ * 'tag'. Returns the held dbuf, or NULL if dbuf_hold_impl() reports an
+ * error.
+ */
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *held;
+
+	if (dbuf_hold_impl(dn, level, blkid, FALSE, tag, &held) != 0)
+		return (NULL);
+	return (held);
+}
+
+/*
+ * Create the bonus dbuf for 'dn' and store it in dn_bonus. The dnode must
+ * not have a bonus dbuf yet and the caller must hold dn_struct_rwlock as
+ * writer (both asserted).
+ */
+void
+dbuf_create_bonus(dnode_t *dn)
+{
+	dmu_buf_impl_t *bonus;
+
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	ASSERT(dn->dn_bonus == NULL);
+	bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+	dn->dn_bonus = bonus;
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+/*
+ * Add another hold, tagged 'tag', to a dbuf that is already held: the
+ * assertion requires the hold count after the add to be greater than one.
+ */
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = refcount_add(&db->db_holds, tag);
+ ASSERT(holds > 1);
+}
+
+#pragma weak dmu_buf_rele = dbuf_rele
+/*
+ * Drop the hold identified by 'tag' on 'db'.
+ *
+ * When the last hold goes away: for the bonus dbuf the dnode is released
+ * with the dbuf as tag; a dbuf without an ARC buffer, or whose ARC buffer
+ * has been released, is evicted; otherwise only the dbuf's reference on
+ * the ARC buffer is dropped.
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds;
+
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ holds = refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+ arc_buf_freeze(db->db_buf);
+
+ /* only dirty-record holds remain: run the user eviction if requested */
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_exit(&db->db_mtx);
+ dnode_rele(db->db_dnode, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ dbuf_evict(db);
+ } else if (arc_released(db->db_buf)) {
+ arc_buf_t *buf = db->db_buf;
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ } else {
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+ mutex_exit(&db->db_mtx);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+/*
+ * Return the current number of holds on 'db'.
+ */
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+	uint64_t nholds = refcount_count(&db->db_holds);
+
+	return (nholds);
+}
+
+/*
+ * Attach user data to a dbuf that is expected to have none: this is
+ * dmu_buf_update_user() with an old user pointer of NULL, and its return
+ * value is passed straight back to the caller.
+ */
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	void *prev;
+
+	prev = dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func);
+	return (prev);
+}
+
+/*
+ * Same as dmu_buf_set_user(), but first sets db_immediate_evict on the
+ * dbuf. The flag is set before, and regardless of the outcome of, the
+ * dmu_buf_update_user() call.
+ */
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	void *prev;
+
+	((dmu_buf_impl_t *)db_fake)->db_immediate_evict = TRUE;
+
+	prev = dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func);
+	return (prev);
+}
+
+/*
+ * Replace the user data of a level-0 dbuf, but only if its current user
+ * pointer equals 'old_user_ptr'. The user pointer that was in place when
+ * the function was entered is returned: a return value equal to
+ * 'old_user_ptr' means the update was applied, anything else means it was
+ * not. 'user_ptr' and 'evict_func' must both be NULL or both be non-NULL
+ * (asserted).
+ */
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	void *current;
+
+	ASSERT(db->db_level == 0);
+
+	ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+	mutex_enter(&db->db_mtx);
+
+	current = db->db_user_ptr;
+	if (current == old_user_ptr) {
+		db->db_user_ptr = user_ptr;
+		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
+		db->db_evict_func = evict_func;
+
+		dbuf_update_data(db);
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (current);
+}
+
+/*
+ * Return the user pointer currently attached to the dbuf. The caller must
+ * hold the dbuf (asserted); db_mtx is not taken here.
+ */
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ return (db->db_user_ptr);
+}
+
+/*
+ * Make sure db_blkptr is set. If it is still NULL it is pointed either at
+ * the matching entry of the dnode's dn_blkptr[] array (when the dbuf is at
+ * the top level of the on-disk tree) or at the matching entry inside the
+ * parent indirect block, which is held first if db_parent is not set.
+ * db_mtx must be held on entry; it is dropped and re-taken while the
+ * parent is being held.
+ */
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+ /* ASSERT(dmu_tx_is_syncing(tx) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+ * This buffer was allocated at a time when there was
+ * no available blkptrs from the dnode, or it was
+ * inappropriate to hook it in (i.e., nlevels mis-match).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ /*
+ * NOTE(review): the return value of dbuf_hold_impl() is
+ * ignored; if it failed, 'parent' would be NULL when it is
+ * dereferenced below -- confirm it cannot fail here.
+ */
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ (void) dbuf_hold_impl(dn, db->db_level+1,
+ db->db_blkid >> epbs, FALSE, db, &parent);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ /* this block's slot within the parent's array of block pointers */
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
+
+/*
+ * Sync one dirty indirect (level > 0) dbuf in syncing context: read the
+ * block in if it has no buffer yet, make sure its block pointer is set,
+ * mark the record as pending, release the ARC buffer, and create the write
+ * with dbuf_write(). The dirty children recorded on the record are synced
+ * next, and only then is this block's zio started with zio_nowait().
+ */
+static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ zio_t *zio;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ /* db_mtx is dropped around the read and re-taken afterwards */
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ ASSERT(db->db_buf != NULL);
+
+ dbuf_check_blkptr(dn, db);
+
+ db->db_data_pending = dr;
+
+ arc_release(db->db_buf, db);
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
+ zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+
+ /* sync the children first, then start this block's own zio */
+ zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+/*
+ * Sync one dirty level-0 dbuf in syncing context. Three cases:
+ *
+ * - Bonus buffer: the data is copied into the dnode's bonus area, the
+ * dirty record is unlinked and freed, and the txg hold is released.
+ * - Record already written through an override (DR_OVERRIDDEN): the new
+ * block pointer is copied in and the write callbacks are invoked
+ * directly with a zio that lives on this function's stack.
+ * - Everything else: a write is created with dbuf_write(). It is started
+ * with zio_nowait(), except for meta-dnode blocks, whose record is put
+ * back on the dnode's dirty list instead.
+ */
+static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+ int checksum, compress;
+ int blksz;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * To be synced, we must be dirtied. But we
+ * might have been freed after the dirty.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ DBUF_VERIFY(db);
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_dirty_record_t **drp;
+
+ ASSERT(*datap != NULL);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ /* free the record's data only if it is a separate copy */
+ if (*datap != db->db.db_data) {
+ zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
+ db->db_data_pending = NULL;
+ /* find the link that points at this record and unlink it */
+ drp = &db->db_last_dirty;
+ while (*drp != dr)
+ drp = &(*drp)->dr_next;
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
+
+ /*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ /*
+ * If this dbuf has already been written out via an immediate write,
+ * just complete the write by copying over the new block pointer and
+ * updating the accounting via the write-completion functions.
+ */
+ if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ zio_t zio_fake;
+
+ zio_fake.io_private = &db;
+ zio_fake.io_error = 0;
+ zio_fake.io_bp = db->db_blkptr;
+ zio_fake.io_bp_orig = *db->db_blkptr;
+ zio_fake.io_txg = txg;
+
+ *db->db_blkptr = dr->dt.dl.dr_overridden_by;
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ db->db_data_pending = dr;
+ dr->dr_zio = &zio_fake;
+ mutex_exit(&db->db_mtx);
+
+ if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio_fake.io_bp_orig, dn->dn_zio, tx);
+
+ dbuf_write_ready(&zio_fake, db->db_buf, db);
+ dbuf_write_done(&zio_fake, db->db_buf, db);
+
+ return;
+ }
+
+ blksz = arc_buf_size(*datap);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * If this buffer is currently "in use" (i.e., there are
+ * active holds and db_data still references it), then make
+ * a copy before we start the write so that any modifications
+ * from the open txg will not leak into this write.
+ *
+ * NOTE: this copy does not need to be made for objects only
+ * modified in the syncing context (e.g. DNONE_DNODE blocks).
+ */
+ if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ }
+ } else {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(db->db_buf, db);
+ }
+
+ ASSERT(*datap != NULL);
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * Allow dnode settings to override objset settings,
+ * except for metadata checksums.
+ */
+ if (dmu_ot[dn->dn_type].ot_metadata) {
+ checksum = os->os_md_checksum;
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_md_compress);
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum,
+ os->os_checksum);
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_compress);
+ }
+
+ dbuf_write(dr, *datap, checksum, compress, tx);
+
+ /* meta-dnode records are re-queued; see dbuf_sync_list() */
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+ else
+ zio_nowait(dr->dr_zio);
+}
+
+/*
+ * Sync every dirty record on 'list': each record is removed from the list
+ * and handed to dbuf_sync_indirect() or dbuf_sync_leaf() depending on the
+ * level of its dbuf. Processing stops at the first record that already
+ * has a zio.
+ */
+void
+dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+{
+	dbuf_dirty_record_t *dr;
+
+	for (dr = list_head(list); dr != NULL; dr = list_head(list)) {
+		if (dr->dr_zio != NULL) {
+			/*
+			 * If we find an already initialized zio then we
+			 * are processing the meta-dnode, and we have finished.
+			 * The dbufs for all dnodes are put back on the list
+			 * during processing, so that we can zio_wait()
+			 * these IOs after initiating all child IOs.
+			 */
+			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+			    DMU_META_DNODE_OBJECT);
+			break;
+		}
+
+		list_remove(list, dr);
+		if (dr->dr_dbuf->db_level == 0) {
+			dbuf_sync_leaf(dr, tx);
+		} else {
+			dbuf_sync_indirect(dr, tx);
+		}
+	}
+}
+
+/*
+ * Build the write zio for dirty record `dr` and store it in dr->dr_zio.
+ *
+ * dr:       dirty record whose dbuf is being written
+ * data:     ARC buffer holding the contents to write
+ * checksum: checksum setting passed through to arc_write()
+ * compress: compression setting passed through to arc_write()
+ * tx:       transaction; tx->tx_txg is the txg being written
+ *
+ * The zio is only created here (via arc_write()); it is not issued by this
+ * function.
+ */
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_t *zio;
+ int zio_flags;
+
+ /*
+ * Pick the zio handed to arc_write() as its first argument: the pending
+ * zio of the parent dbuf, or dn->dn_zio when the parent is the dnode's
+ * own dbuf.
+ */
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ /* Bookmark identifying this block; objset is 0 when there is no dataset. */
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ /* Metadata object types and all indirect levels get the METADATA flag. */
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ /* Kill the existing block pointer if BP_IS_OLDER() says it predates txg. */
+ if (BP_IS_OLDER(db->db_blkptr, txg))
+ dsl_dataset_block_kill(
+ os->os_dsl_dataset, db->db_blkptr, zio, tx);
+
+ dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
+ dmu_get_replication_level(os, &zb, dn->dn_type), txg,
+ db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+}
+
+/*
+ * "Ready" callback registered with arc_write() by dbuf_write().
+ *
+ * Charges the dnode for the change in allocated size between the original
+ * and new block pointers, computes the fill count for the block, stamps
+ * the block pointer with the dnode type and dbuf level, and updates the
+ * dataset's born/killed accounting when the block identity changed.
+ *
+ * vdb is the dmu_buf_impl_t that was passed to arc_write(); buf is unused.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint64_t fill = 0;
+ int old_size, new_size, i;
+
+ dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
+
+ old_size = bp_get_dasize(os->os_spa, bp_orig);
+ new_size = bp_get_dasize(os->os_spa, zio->io_bp);
+
+ /* Account for the size delta (may be negative) against the dnode. */
+ dnode_diduse_space(dn, new_size-old_size);
+
+ /* The new block pointer is a hole: nothing is born, fill stays 0. */
+ if (BP_IS_HOLE(zio->io_bp)) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ return;
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_level == 0) {
+ /* Raise dn_maxblkid if this block lies beyond it. */
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ /* Fill = number of dnodes in this block whose type is not NONE. */
+ dnode_phys_t *dnp = db->db.db_data;
+ for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+ i--, dnp++) {
+ if (dnp->dn_type != DMU_OT_NONE)
+ fill++;
+ }
+ } else {
+ fill = 1;
+ }
+ } else {
+ /* Indirect block: fill = sum of blk_fill over non-hole children. */
+ blkptr_t *bp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ ASSERT3U(BP_GET_LSIZE(bp), ==,
+ db->db_level == 1 ? dn->dn_datablksz :
+ (1<<dn->dn_phys->dn_indblkshift));
+ fill += bp->blk_fill;
+ }
+ }
+
+ db->db_blkptr->blk_fill = fill;
+ BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+ BP_SET_LEVEL(db->db_blkptr, db->db_level);
+
+ mutex_exit(&db->db_mtx);
+
+ /* We must do this after we've set the bp's type and level */
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+}
+
+/*
+ * "Done" callback registered with arc_write() by dbuf_write().
+ *
+ * Unlinks the dirty record referenced by db->db_data_pending from the
+ * dbuf's db_last_dirty chain, releases or re-registers the associated ARC
+ * buffer, frees the record, decrements db_dirtycnt, wakes waiters on
+ * db_changed, and drops the dbuf hold tagged with the txg.
+ *
+ * vdb is the dmu_buf_impl_t that was passed to arc_write(); buf is unused.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ uint64_t txg = zio->io_txg;
+ dbuf_dirty_record_t **drp, *dr;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ mutex_enter(&db->db_mtx);
+
+ /* Find the link that points at the pending record and unlink it. */
+ drp = &db->db_last_dirty;
+ while ((dr = *drp) != db->db_data_pending)
+ drp = &dr->dr_next;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ /*
+ * If the record wrote a buffer other than db_buf, drop that
+ * buffer's reference; otherwise register the eviction callback on
+ * db_buf unless the block pointer is a hole.
+ */
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
+ } else {
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs =
+ dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ ASSERT3U(dn->dn_phys->dn_maxblkid
+ >> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ /* Indirect records own a mutex and a child list; tear them down. */
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ mutex_exit(&db->db_mtx);
+
+ dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+
+ /* Release the hold whose tag is the txg value cast to a pointer. */
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+}
diff --git a/zfs/lib/libzpool/dmu.c b/zfs/lib/libzpool/dmu.c
new file mode 100644
index 000000000..8e1278eb1
--- /dev/null
+++ b/zfs/lib/libzpool/dmu.c
@@ -0,0 +1,1049 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dmu.c 1.30 07/11/09 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#endif
+
+/*
+ * Table of per-object-type information, with DMU_OT_NUMTYPES slots.
+ * Code in this file indexes it by a dnode's type (dmu_ot[dn->dn_type]).
+ *
+ * Each initializer lists a byteswap routine, a TRUE/FALSE flag and a
+ * human-readable name. NOTE(review): the flag is presumably the
+ * ot_metadata field tested elsewhere in this file -- confirm the field
+ * order against the dmu_object_type_info_t declaration.
+ */
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ { byteswap_uint8_array, TRUE, "unallocated" },
+ { zap_byteswap, TRUE, "object directory" },
+ { byteswap_uint64_array, TRUE, "object array" },
+ { byteswap_uint8_array, TRUE, "packed nvlist" },
+ { byteswap_uint64_array, TRUE, "packed nvlist size" },
+ { byteswap_uint64_array, TRUE, "bplist" },
+ { byteswap_uint64_array, TRUE, "bplist header" },
+ { byteswap_uint64_array, TRUE, "SPA space map header" },
+ { byteswap_uint64_array, TRUE, "SPA space map" },
+ { byteswap_uint64_array, TRUE, "ZIL intent log" },
+ { dnode_buf_byteswap, TRUE, "DMU dnode" },
+ { dmu_objset_byteswap, TRUE, "DMU objset" },
+ { byteswap_uint64_array, TRUE, "DSL directory" },
+ { zap_byteswap, TRUE, "DSL directory child map"},
+ { zap_byteswap, TRUE, "DSL dataset snap map" },
+ { zap_byteswap, TRUE, "DSL props" },
+ { byteswap_uint64_array, TRUE, "DSL dataset" },
+ { zfs_znode_byteswap, TRUE, "ZFS znode" },
+ { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" },
+ { byteswap_uint8_array, FALSE, "ZFS plain file" },
+ { zap_byteswap, TRUE, "ZFS directory" },
+ { zap_byteswap, TRUE, "ZFS master node" },
+ { zap_byteswap, TRUE, "ZFS delete queue" },
+ { byteswap_uint8_array, FALSE, "zvol object" },
+ { zap_byteswap, TRUE, "zvol prop" },
+ { byteswap_uint8_array, FALSE, "other uint8[]" },
+ { byteswap_uint64_array, FALSE, "other uint64[]" },
+ { zap_byteswap, TRUE, "other ZAP" },
+ { zap_byteswap, TRUE, "persistent error log" },
+ { byteswap_uint8_array, TRUE, "SPA history" },
+ { byteswap_uint64_array, TRUE, "SPA history offsets" },
+ { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "DSL permissions" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, TRUE, "ZFS SYSACL" },
+ { byteswap_uint8_array, TRUE, "FUID table" },
+ { byteswap_uint64_array, TRUE, "FUID table size" },
+};
+
+/*
+ * Hold, and read in, the dbuf of `object` that contains byte `offset`.
+ *
+ * os:     objset containing the object
+ * object: object number
+ * offset: byte offset within the object
+ * tag:    hold tag; the caller later releases the dbuf with the same tag
+ * dbp:    out: the held buffer on success, NULL on failure
+ *
+ * Returns 0 on success, the dnode_hold() or dbuf_read() error, or EIO if
+ * dbuf_hold() returned no buffer.
+ */
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+	int err;
+
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	blkid = dbuf_whichblock(dn, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid, tag);
+	rw_exit(&dn->dn_struct_rwlock);
+	if (db == NULL) {
+		err = EIO;
+	} else {
+		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+		if (err) {
+			/* The read failed: give the hold back. */
+			dbuf_rele(db, tag);
+			db = NULL;
+		}
+	}
+
+	dnode_rele(dn, FTAG);
+	/*
+	 * Do not form &db->db from a NULL db; hand back NULL explicitly
+	 * instead of relying on the embedded dmu_buf_t sitting at offset 0.
+	 */
+	*dbp = (db == NULL) ? NULL : &db->db;
+	return (err);
+}
+
+/*
+ * Report the largest bonus buffer length, DN_MAX_BONUSLEN.
+ */
+int
+dmu_bonus_max(void)
+{
+	int max_bonus_len = DN_MAX_BONUSLEN;
+
+	return (max_bonus_len);
+}
+
+/*
+ * Change the bonus length of the dnode that owns bonus buffer `db`.
+ *
+ * Returns EINVAL if `db` is not the dnode's bonus buffer, or if `newsize`
+ * is negative or larger than db->db_size; otherwise calls
+ * dnode_setbonuslen() and returns 0.
+ */
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
+	dnode_t *dn = impl->db_dnode;
+
+	/* Checks are evaluated in the same order as separate tests would be. */
+	if (dn->dn_bonus != impl ||
+	    newsize < 0 || newsize > db->db_size) {
+		return (EINVAL);
+	}
+
+	dnode_setbonuslen(dn, newsize, tx);
+	return (0);
+}
+
+/*
+ * Hold the bonus buffer of `object`, creating it if the dnode does not
+ * have one yet, and read it in.
+ *
+ * On success *dbp is set to the bonus buffer, held with `tag`.
+ *
+ * returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db;
+ int error;
+
+ error = dnode_hold(os->os, object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ /*
+ * Check for an existing bonus dbuf under the reader lock. If there is
+ * none, drop the lock, retake it as writer, and re-check before
+ * creating one, since the lock was not held in between.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1)
+ VERIFY(dnode_add_ref(dn, db));
+
+ dnode_rele(dn, FTAG);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+ *dbp = &db->db;
+ return (0);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+/*
+ * Hold every dbuf of `dn` that overlaps [offset, offset + length) and,
+ * when `read` is set, read them in.
+ *
+ * dn:       held dnode
+ * offset:   starting byte offset
+ * length:   byte count; must not exceed DMU_MAX_ACCESS
+ * read:     nonzero to start reads on the held buffers and wait for them
+ * tag:      hold tag for each dbuf
+ * numbufsp: out: number of buffers in the array
+ * dbpp:     out: array of held buffers, to be released with
+ *           dmu_buf_rele_array()
+ *
+ * Returns 0 on success. On failure returns EIO or the zio_wait() error;
+ * no holds are left behind and the outputs are not written.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dmu_buf_t **dbp;
+	uint64_t blkid, nblks, i;
+	uint32_t flags;
+	int err;
+	zio_t *zio;
+
+	ASSERT(length <= DMU_MAX_ACCESS);
+
+	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+	if (length > zfetch_array_rd_sz)
+		flags |= DB_RF_NOPREFETCH;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		/* Power-of-two block size: count the blocks the range spans. */
+		int blkshift = dn->dn_datablkshift;
+		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	} else {
+		/* No block shift: the range must fit in the single block. */
+		if (offset + length > dn->dn_datablksz) {
+			zfs_panic_recover("zfs: accessing past end of object "
+			    "%llx/%llx (size=%u access=%llu+%llu)",
+			    (longlong_t)dn->dn_objset->
+			    os_dsl_dataset->ds_object,
+			    (longlong_t)dn->dn_object, dn->dn_datablksz,
+			    (longlong_t)offset, (longlong_t)length);
+			/*
+			 * Drop the reader lock taken above before bailing
+			 * out; returning with it held would leak the lock.
+			 */
+			rw_exit(&dn->dn_struct_rwlock);
+			return (EIO);
+		}
+		nblks = 1;
+	}
+	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
+	blkid = dbuf_whichblock(dn, offset);
+	for (i = 0; i < nblks; i++) {
+		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+		if (db == NULL) {
+			rw_exit(&dn->dn_struct_rwlock);
+			/* Unfilled slots are NULL and skipped by the release. */
+			dmu_buf_rele_array(dbp, nblks, tag);
+			zio_nowait(zio);
+			return (EIO);
+		}
+		/* initiate async i/o */
+		if (read) {
+			/* dbuf_read() is called without the struct lock held. */
+			rw_exit(&dn->dn_struct_rwlock);
+			(void) dbuf_read(db, zio, flags);
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		}
+		dbp[i] = &db->db;
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* wait for async i/o */
+	err = zio_wait(zio);
+	if (err) {
+		dmu_buf_rele_array(dbp, nblks, tag);
+		return (err);
+	}
+
+	/* wait for other io to complete */
+	if (read) {
+		for (i = 0; i < nblks; i++) {
+			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+			mutex_enter(&db->db_mtx);
+			while (db->db_state == DB_READ ||
+			    db->db_state == DB_FILL)
+				cv_wait(&db->db_changed, &db->db_mtx);
+			/* A buffer that ended up uncached counts as failed. */
+			if (db->db_state == DB_UNCACHED)
+				err = EIO;
+			mutex_exit(&db->db_mtx);
+			if (err) {
+				dmu_buf_rele_array(dbp, nblks, tag);
+				return (err);
+			}
+		}
+	}
+
+	*numbufsp = nblks;
+	*dbpp = dbp;
+	return (0);
+}
+
+/*
+ * Same as dmu_buf_hold_array_by_dnode(), but looks the dnode up from
+ * <os, object> first and releases it again before returning.
+ * Returns the dnode_hold() error if the lookup fails.
+ */
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os->os, object, FTAG, &dn);
+
+	if (err == 0) {
+		err = dmu_buf_hold_array_by_dnode(dn, offset, length, read,
+		    tag, numbufsp, dbpp);
+		dnode_rele(dn, FTAG);
+	}
+
+	return (err);
+}
+
+/*
+ * Same as dmu_buf_hold_array_by_dnode(), using the dnode that `db`
+ * already points at (db_dnode) instead of looking one up.
+ */
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
+
+	return (dmu_buf_hold_array_by_dnode(impl->db_dnode, offset, length,
+	    read, tag, numbufsp, dbpp));
+}
+
+/*
+ * Release every non-NULL buffer in the array with `tag`, then free the
+ * array itself. Does nothing when numbufs is 0.
+ */
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+	dmu_buf_impl_t **held = (dmu_buf_impl_t **)dbp_fake;
+	int idx;
+
+	if (numbufs != 0) {
+		for (idx = 0; idx < numbufs; idx++) {
+			/* Slots may be NULL when the array was only partly filled. */
+			if (held[idx] != NULL)
+				dbuf_rele(held[idx], tag);
+		}
+
+		kmem_free(held, sizeof (dmu_buf_t *) * numbufs);
+	}
+}
+
+/*
+ * Start prefetching the blocks of `object` covering [offset, offset + len).
+ * A `len` of 0 instead prefetches the meta-dnode block that holds the
+ * object's dnode. Does nothing when zfs_prefetch_disable is set; errors
+ * are not reported to the caller.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, i, err;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = os->os->os_meta_dnode;
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ /* Prefetch the meta-dnode block at the object's dnode_phys_t offset. */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, blkid);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
+ nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
+ P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+ } else {
+ /* No block shift: one block if offset falls inside it, else none. */
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, blkid+i);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Free the byte range [offset, offset + size) of `object` in `tx` by
+ * handing it to dnode_free_range(). A size of -1ULL is accepted by the
+ * range assertion as a special value.
+ *
+ * Returns the dnode_hold() error if the object cannot be held, else 0.
+ */
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err != 0)
+		return (err);
+
+	ASSERT(offset < UINT64_MAX);
+	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+
+	dnode_free_range(dn, offset, size, tx);
+	dnode_rele(dn, FTAG);
+
+	return (0);
+}
+
+/*
+ * Copy `size` bytes starting at `offset` of `object` into `buf`.
+ *
+ * The read is done in chunks of at most DMU_MAX_ACCESS / 2 bytes.
+ * Returns 0 on success, or the error from dnode_hold() or
+ * dmu_buf_hold_array_by_dnode(); on error `buf` may be partly filled.
+ */
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf)
+{
+ dnode_t *dn;
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_datablkshift == 0) {
+ /* Clamp the read to the block and zero the rest of the caller's buffer. */
+ int newsz = offset > dn->dn_datablksz ? 0 :
+ MIN(size, dn->dn_datablksz - offset);
+ bzero((char *)buf + newsz, size - newsz);
+ size = newsz;
+ }
+
+ while (size > 0) {
+ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp);
+ if (err)
+ break;
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ /* Offset within this buffer, and how much of it to copy. */
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ bcopy((char *)db->db_data + bufoff, buf, tocpy);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+/*
+ * Copy `size` bytes from `buf` into `object` starting at `offset`, as part
+ * of transaction `tx`. Does nothing when size is 0. Failure to hold the
+ * buffers trips the VERIFY rather than being returned.
+ */
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ /* Hold the buffers without reading them (read argument is FALSE). */
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ /* Only the first and last buffers may be partially written. */
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ /* Whole-buffer writes use will_fill; partial writes use will_dirty. */
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+#ifdef _KERNEL
+/*
+ * Read `size` bytes of `object`, starting at uio->uio_loffset, into the
+ * destination described by `uio` using uiomove().
+ *
+ * Returns 0 on success, or the error from dmu_buf_hold_array() or
+ * uiomove().
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
+ &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ /* The offset is re-read from the uio on every iteration. */
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+/*
+ * Write `size` bytes from the source described by `uio` into `object`,
+ * starting at uio->uio_loffset, as part of transaction `tx`.
+ *
+ * Returns 0 on success (including size == 0), or the error from
+ * dmu_buf_hold_array() or uiomove().
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err = 0;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ /* Whole-buffer writes use will_fill; partial writes use will_dirty. */
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ /*
+ * XXX uiomove could block forever (eg. nfs-backed
+ * pages). There needs to be a uiolockdown() function
+ * to lock the pages in memory, so that uiomove won't
+ * block.
+ */
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_WRITE, uio);
+
+ /* fill_done is called even when uiomove() failed, before bailing out. */
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+/*
+ * Write `size` bytes into `object` at `offset` from the page list starting
+ * at `pp`, as part of transaction `tx`. Each page is mapped with
+ * ppmapin(), copied into the dbuf, and unmapped again.
+ *
+ * Returns 0 when size is 0, the dmu_buf_hold_array() error if the hold
+ * fails, and otherwise 0.
+ */
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ page_t *pp, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ /* Copy one page at a time, advancing along the pp->p_next chain. */
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ ppmapout(va);
+ pp = pp->p_next;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ /*
+ * NOTE(review): err was last assigned by dmu_buf_hold_array() and is
+ * 0 here; nothing in this loop sets it, so this check never fires.
+ */
+ if (err)
+ break;
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+#endif
+
+/*
+ * Context handed from dmu_sync() to its completion callback
+ * dmu_sync_done(); allocated in the former and freed in the latter.
+ */
+typedef struct {
+ dbuf_dirty_record_t *dr; /* dirty record being synced */
+ dmu_sync_cb_t *done; /* caller's completion callback, may be NULL */
+ void *arg; /* opaque argument passed to done */
+} dmu_sync_arg_t;
+
+/*
+ * Completion callback for the write issued by dmu_sync().
+ *
+ * Records the written block pointer in the dirty record as an override,
+ * wakes waiters on db_changed, invokes the caller's callback if one was
+ * supplied, and frees the dmu_sync_arg_t. buf is unused.
+ */
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dmu_sync_cb_t *done = in->done;
+
+ /* For a real block, stamp fill count, type and level 0 into the bp. */
+ if (!BP_IS_HOLE(zio->io_bp)) {
+ zio->io_bp->blk_fill = 1;
+ BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
+ BP_SET_LEVEL(zio->io_bp, 0);
+ }
+
+ /* Move the record from DR_IN_DMU_SYNC to DR_OVERRIDDEN under db_mtx. */
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ if (done)
+ done(&(db->db), in->arg);
+
+ kmem_free(in, sizeof (dmu_sync_arg_t));
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Parameters:
+ *
+ * pio: zio passed as the first argument to arc_write(); when it is NULL
+ * the write is waited for before returning.
+ * db_fake: the buffer to sync.
+ * bp: block pointer to fill in; must be a hole on entry.
+ * txg: the txg the data was dirtied in; must be nonzero.
+ * done/arg: optional callback (and its argument) run by dmu_sync_done().
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
+ *
+ * EINPROGRESS: the IO has been initiated.
+ * The caller should log this blkptr in the callback.
+ *
+ * 0: completed. Sets *bp to the blkptr just written.
+ * The caller should log this blkptr immediately.
+ */
+int
+dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
+ blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ objset_impl_t *os = db->db_objset;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+ tx_state_t *tx = &dp->dp_tx;
+ dbuf_dirty_record_t *dr;
+ dmu_sync_arg_t *in;
+ zbookmark_t zb;
+ zio_t *zio;
+ int zio_flags;
+ int err;
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT(txg != 0);
+
+
+ dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
+ txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+
+ /*
+ * XXX - would be nice if we could do this without suspending...
+ */
+ txg_suspend(dp);
+
+ /*
+ * If this txg already synced, there's nothing to do.
+ */
+ if (txg <= tx->tx_synced_txg) {
+ txg_resume(dp);
+ /*
+ * If we're running ziltest, we need the blkptr regardless.
+ */
+ if (txg > spa_freeze_txg(dp->dp_spa)) {
+ /* if db_blkptr == NULL, this was an empty write */
+ if (db->db_blkptr)
+ *bp = *db->db_blkptr; /* structure assignment */
+ return (0);
+ }
+ return (EEXIST);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (txg == tx->tx_syncing_txg) {
+ while (db->db_data_pending) {
+ /*
+ * IO is in-progress. Wait for it to finish.
+ * XXX - would be nice to be able to somehow "attach"
+ * this zio to the parent zio passed in.
+ */
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (!db->db_data_pending &&
+ db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
+ /*
+ * IO was compressed away
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ ASSERT(db->db_data_pending ||
+ (db->db_blkptr && db->db_blkptr->blk_birth == txg));
+ }
+
+ if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
+ /*
+ * IO is already completed.
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ }
+
+ /* Find the dirty record for this txg; the loop skips records with a larger dr_txg. */
+ dr = db->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ if (dr == NULL || dr->dr_txg < txg) {
+ /*
+ * This dbuf isn't dirty, must have been free_range'd.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (ENOENT);
+ }
+
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ /*
+ * We have already issued a sync write for this buffer.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (EALREADY);
+ } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * This buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+
+ /* Claim the record for this sync and package the callback context. */
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ in->dr = dr;
+ in->done = done;
+ in->arg = arg;
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+
+ zb.zb_objset = os->os_dsl_dataset->ds_object;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ zio = arc_write(pio, os->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
+ dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
+ txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
+
+ /* With a pio the write is issued without waiting; otherwise wait here. */
+ if (pio) {
+ zio_nowait(zio);
+ err = EINPROGRESS;
+ } else {
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ }
+ return (err);
+}
+
+/*
+ * Set the data block size (`size`) and indirect block shift (`ibs`) of
+ * `object` through dnode_set_blksz().
+ *
+ * Returns the dnode_hold() error if the object cannot be held, otherwise
+ * whatever dnode_set_blksz() returns.
+ */
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os->os, object, FTAG, &dn);
+
+	if (err == 0) {
+		err = dnode_set_blksz(dn, size, ibs, tx);
+		dnode_rele(dn, FTAG);
+	}
+
+	return (err);
+}
+
+/*
+ * Set the checksum function of `object` and mark its dnode dirty in `tx`.
+ *
+ * `checksum` must be below ZIO_CHECKSUM_FUNCTIONS. The function returns
+ * no status, so a failure to hold the dnode trips VERIFY.
+ */
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	/*
+	 * There is no way to report an error to the caller. Fail loudly if
+	 * the hold fails rather than dereferencing an unset dn below.
+	 */
+	VERIFY(0 == dnode_hold(os->os, object, FTAG, &dn));
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	dn->dn_checksum = checksum;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Set the compression function of `object` and mark its dnode dirty in
+ * `tx`.
+ *
+ * `compress` must be below ZIO_COMPRESS_FUNCTIONS. The function returns
+ * no status, so a failure to hold the dnode trips VERIFY.
+ */
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	/*
+	 * There is no way to report an error to the caller. Fail loudly if
+	 * the hold fails rather than dereferencing an unset dn below.
+	 */
+	VERIFY(0 == dnode_hold(os->os, object, FTAG, &dn));
+	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+	dn->dn_compress = compress;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Compute how many copies of a block to write: the objset's os_copies,
+ * plus one for metadata object types and for any indirect level, capped
+ * at spa_max_replication().
+ */
+int
+dmu_get_replication_level(objset_impl_t *os,
+    zbookmark_t *zb, dmu_object_type_t ot)
+{
+	int ncopies = os->os_copies;
+	boolean_t wants_extra;
+
+	/* If it's the mos, it should have max copies set. */
+	ASSERT(zb->zb_objset != 0 ||
+	    ncopies == spa_max_replication(os->os_spa));
+
+	wants_extra = (dmu_ot[ot].ot_metadata || zb->zb_level != 0);
+	if (wants_extra)
+		ncopies += 1;
+
+	return (MIN(ncopies, spa_max_replication(os->os_spa)));
+}
+
+/*
+ * Advance *off via dnode_next_offset(), with `hole` passed through to
+ * select what is being searched for.
+ *
+ * If the dnode is on any per-txg dirty list, waits for the pool to sync
+ * first and then re-holds the dnode.
+ *
+ * Returns the dnode_hold() error, or the dnode_next_offset() result.
+ */
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ /*
+ * Sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (list_link_active(&dn->dn_dirty_link[i]))
+ break;
+ }
+ /* i != TXG_SIZE means the loop found an active dirty link. */
+ if (i != TXG_SIZE) {
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ }
+
+ err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+/*
+ * Fill `doi` from the fields of held dnode `dn`. The copy is made with
+ * dn_struct_rwlock held as reader and dn_mtx held.
+ */
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ /* Reported as 0 when the dnode has no indirect block shift set. */
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ /* Used bytes converted to SPA_MINBLOCKSIZE units, rounded to nearest. */
+ doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
+ SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
+ doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_bonus_type = dn->dn_bonustype;
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Look up a DMU object and, when `doi` is non-NULL, fill it in.
+ * With a NULL `doi` the return value alone tells whether the object
+ * could be held. Returns the dnode_hold() error, or 0.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err == 0) {
+		if (doi != NULL)
+			dmu_object_info_from_dnode(dn, doi);
+		dnode_rele(dn, FTAG);
+	}
+
+	return (err);
+}
+
+/*
+ * Variant of dmu_object_info() for callers that already hold a dbuf:
+ * uses the dbuf's dnode directly, skipping the object lookup.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+{
+	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
+
+	dmu_object_info_from_dnode(impl->db_dnode, doi);
+}
+
+/*
+ * Cheapest variant, for callers that only need sizes (written with
+ * zfs_getattr() in mind). Stores the data block size in *blksize and the
+ * space used, in SPA_MINBLOCKSIZE units rounded to nearest, in *nblk512.
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+{
+	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
+	dnode_t *dn = impl->db_dnode;
+
+	*blksize = dn->dn_datablksz;
+
+	/* add 1 for dnode space */
+	*nblk512 = 1 + ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+	    SPA_MINBLOCKSHIFT);
+}
+
+/*
+ * Byte-swap, in place, each 64-bit word of the buffer.
+ *
+ * vbuf: buffer to swap
+ * size: buffer length in bytes; must be a multiple of 8
+ */
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+	uint64_t *buf = vbuf;
+	size_t count = size >> 3;
+	size_t i;	/* same type as count: no signed/unsigned mismatch */
+
+	ASSERT((size & 7) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_64(buf[i]);
+}
+
+/*
+ * Byte-swap, in place, each 32-bit word of the buffer.
+ *
+ * vbuf: buffer to swap
+ * size: buffer length in bytes; must be a multiple of 4
+ */
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+	uint32_t *buf = vbuf;
+	size_t count = size >> 2;
+	size_t i;	/* same type as count: no signed/unsigned mismatch */
+
+	ASSERT((size & 3) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_32(buf[i]);
+}
+
+/*
+ * Byte-swap, in place, each 16-bit word of the buffer.
+ *
+ * vbuf: buffer to swap
+ * size: buffer length in bytes; must be a multiple of 2
+ */
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+	uint16_t *buf = vbuf;
+	size_t count = size >> 1;
+	size_t i;	/* same type as count: no signed/unsigned mismatch */
+
+	ASSERT((size & 1) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_16(buf[i]);
+}
+
+/*
+ * Byteswap routine for byte arrays: intentionally does nothing, since
+ * single bytes have no byte order. Both arguments are unused.
+ */
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+/*
+ * Initialize the subsystems the DMU depends on, by calling the dbuf,
+ * dnode, ARC and L2ARC init routines in that order.
+ */
+void
+dmu_init(void)
+{
+ dbuf_init();
+ dnode_init();
+ arc_init();
+ l2arc_init();
+}
+
+/*
+ * Tear down what dmu_init() set up, by calling the ARC, dnode, dbuf and
+ * L2ARC fini routines in that order.
+ */
+void
+dmu_fini(void)
+{
+ arc_fini();
+ dnode_fini();
+ dbuf_fini();
+ l2arc_fini();
+}
diff --git a/zfs/lib/libzpool/dmu_object.c b/zfs/lib/libzpool/dmu_object.c
new file mode 100644
index 000000000..7a2c9e356
--- /dev/null
+++ b/zfs/lib/libzpool/dmu_object.c
@@ -0,0 +1,160 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dmu_object.c 1.3 06/10/31 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+
+/*
+ * Allocate a new object in objset os and return its object number.
+ *
+ * Under os_obj_lock, candidate object numbers are tried starting after
+ * os_obj_next until a dnode can be held with DNODE_MUST_BE_FREE.  That
+ * dnode is then allocated with the requested type, block size and bonus
+ * parameters, and the object is recorded in tx as newly created.  The
+ * search loop has no failure exit: it runs until a free dnode is held.
+ */
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ objset_impl_t *osi = os->os;
+ uint64_t object;
+ /* number of dnodes covered by one L2 block pointer of the meta-dnode */
+ uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+ (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ /* set once the first L2-boundary search (from offset 0) has been done */
+ int restarted = B_FALSE;
+
+ mutex_enter(&osi->os_obj_lock);
+ for (;;) {
+ object = osi->os_obj_next;
+ /*
+ * Each time we polish off an L2 bp worth of dnodes
+ * (2^13 objects), move to another L2 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning once, but after that keep looking from here.
+ * If we can't find one, just keep going from here.
+ */
+ if (P2PHASE(object, L2_dnode_count) == 0) {
+ uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
+ int error = dnode_next_offset(osi->os_meta_dnode,
+ B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ restarted = B_TRUE;
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
+ }
+ /* Step to the next candidate and remember it for later calls. */
+ osi->os_obj_next = ++object;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ FTAG, &dn);
+ if (dn)
+ break;
+
+ /*
+ * The candidate could not be held as free.  Ask
+ * dmu_object_next() for the next hole; the "- 1" offsets
+ * the ++object done at the top of the next iteration.
+ */
+ if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
+ osi->os_obj_next = object - 1;
+ }
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ mutex_exit(&osi->os_obj_lock);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (object);
+}
+
+/*
+ * Allocate a specific, caller-chosen object number.
+ *
+ * Returns EBADF when the meta-dnode object is requested by a transaction
+ * for which dmu_tx_private_ok() is false, or the error from holding the
+ * dnode with DNODE_MUST_BE_FREE.  On success the dnode is allocated, the
+ * object is recorded in tx as newly created, and 0 is returned.
+ */
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dnode_t *dnode;
+	int error;
+
+	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+		return (EBADF);
+
+	error = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG,
+	    &dnode);
+	if (error != 0)
+		return (error);
+
+	dnode_allocate(dnode, ot, blocksize, 0, bonustype, bonuslen, tx);
+	dnode_rele(dnode, FTAG);
+	dmu_tx_add_new_object(tx, os, object);
+
+	return (0);
+}
+
+/*
+ * Re-type an object that is already allocated.
+ *
+ * Returns EBADF when the meta-dnode object is requested by a transaction
+ * for which dmu_tx_private_ok() is false, or the error from holding the
+ * dnode with DNODE_MUST_BE_ALLOCATED.  On success the dnode is passed to
+ * dnode_reallocate() with the new parameters and 0 is returned.
+ */
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dnode_t *dnode;
+	int error;
+
+	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+		return (EBADF);
+
+	error = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG,
+	    &dnode);
+	if (error != 0)
+		return (error);
+
+	dnode_reallocate(dnode, ot, blocksize, bonustype, bonuslen, tx);
+	dnode_rele(dnode, FTAG);
+
+	return (0);
+}
+
+/*
+ * Free an allocated object.
+ *
+ * Returns the error from holding the dnode with DNODE_MUST_BE_ALLOCATED,
+ * or 0 once the dnode has been handed to dnode_free().  Freeing the
+ * meta-dnode object is only asserted against, not rejected at run time.
+ */
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	dnode_t *dnode;
+	int error;
+
+	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+
+	error = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG,
+	    &dnode);
+	if (error != 0)
+		return (error);
+
+	ASSERT(dnode->dn_type != DMU_OT_NONE);
+	dnode_free(dnode, tx);
+	dnode_rele(dnode, FTAG);
+
+	return (0);
+}
+
+/*
+ * Search the meta-dnode with dnode_next_offset(), starting at the dnode
+ * just past *objectp; hole and txg are passed through to that search.
+ * *objectp is always rewritten from the resulting offset, and the
+ * search's return value is returned unchanged.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
+{
+	dnode_t *mdn = os->os->os_meta_dnode;
+	uint64_t offset;
+	int error;
+
+	/* Begin at the byte offset of the dnode following *objectp. */
+	offset = (*objectp + 1) << DNODE_SHIFT;
+	error = dnode_next_offset(mdn, hole, &offset, 0, DNODES_PER_BLOCK,
+	    txg);
+
+	/* Stored unconditionally, even when the search reports an error. */
+	*objectp = offset >> DNODE_SHIFT;
+
+	return (error);
+}
diff --git a/zfs/lib/libzpool/dmu_objset.c b/zfs/lib/libzpool/dmu_objset.c
new file mode 100644
index 000000000..829df52a4
--- /dev/null
+++ b/zfs/lib/libzpool/dmu_objset.c
@@ -0,0 +1,1149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dmu_objset.c 1.37 08/04/27 SMI"
+
+#include <sys/cred.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio_checksum.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+#include <sys/zfs_ioctl.h>
+
+/*
+ * Return the spa_t recorded in the objset's implementation (os_spa).
+ */
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_spa);
+}
+
+/*
+ * Return the zilog_t recorded in the objset's implementation (os_zil).
+ */
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_zil);
+}
+
+/*
+ * Return the DSL pool for the objset: taken from the dataset's dsl_dir
+ * when the objset has a dataset with a non-NULL ds_dir, otherwise
+ * looked up from the spa with spa_get_dsl().
+ */
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	if (ds == NULL || !ds->ds_dir)
+		return (spa_get_dsl(os->os->os_spa));
+
+	return (ds->ds_dir->dd_pool);
+}
+
+/*
+ * Return the objset's dataset pointer (os_dsl_dataset) as stored; other
+ * code in this file treats a NULL value as the meta-objset.
+ */
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_dsl_dataset);
+}
+
+/*
+ * Return the objset type stored in the objset's physical header
+ * (os_phys->os_type).
+ */
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_phys->os_type);
+}
+
+/*
+ * Write the name of the objset's dataset into buf by delegating to
+ * dsl_dataset_name().  The required size of buf is not checked here.
+ */
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	dsl_dataset_name(ds, buf);
+}
+
+/*
+ * Return the object number of the objset's dataset (ds_object), or 0
+ * when the objset has no dataset.
+ */
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	if (!ds)
+		return (0);
+
+	return (ds->ds_object);
+}
+
+/*
+ * Callback registered for the "checksum" property: stores the checksum
+ * chosen by zio_checksum_select() for the new value in the objset.
+ * arg is the objset_impl_t supplied at registration.
+ */
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+	objset_impl_t *impl = arg;
+
+	/* The value must already be resolved; "inherit" is not expected. */
+	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+	impl->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+/*
+ * Callback registered for the "compression" property: stores the
+ * algorithm chosen by zio_compress_select() for the new value in the
+ * objset.  arg is the objset_impl_t supplied at registration.
+ */
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+	objset_impl_t *impl = arg;
+
+	/* Inheritance and range checks are expected to be done already. */
+	ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+	impl->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+}
+
+/*
+ * Callback registered for the "copies" property: records the new number
+ * of copies in the objset.  arg is the objset_impl_t supplied at
+ * registration.
+ */
+static void
+copies_changed_cb(void *arg, uint64_t newval)
+{
+	objset_impl_t *impl = arg;
+
+	/*
+	 * Inheritance and range checks are expected to be done already:
+	 * the value must lie in 1 .. spa_max_replication().
+	 */
+	ASSERT(newval > 0);
+	ASSERT(newval <= spa_max_replication(osi->os_spa));
+
+	impl->os_copies = newval;
+}
+
+/*
+ * Byte-swap an objset_phys_t in place: the embedded meta-dnode, the ZIL
+ * header (as an array of 64-bit words) and the objset type.  size must
+ * equal sizeof (objset_phys_t).
+ */
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+	objset_phys_t *phys = buf;
+
+	ASSERT(size == sizeof (objset_phys_t));
+
+	dnode_byteswap(&phys->os_meta_dnode);
+	byteswap_uint64_array(&phys->os_zil_header, sizeof (zil_header_t));
+	phys->os_type = BSWAP_64(phys->os_type);
+}
+
+/*
+ * Build the in-core objset_impl_t for the objset rooted at bp.
+ *
+ * spa - pool the objset lives in
+ * ds - owning dataset, or NULL for the meta-objset; when non-NULL the
+ * caller must hold ds->ds_opening_lock
+ * bp - root block pointer; a hole means a fresh, zeroed objset_phys_t
+ * is allocated instead of being read
+ * osip - receives the new objset_impl_t on success
+ *
+ * Returns 0 on success, or the error from arc_read() or from
+ * dsl_prop_register(); on error the objset_impl_t is freed and *osip is
+ * not written.
+ */
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_impl_t **osip)
+{
+ objset_impl_t *osi;
+ int i, err, checksum;
+
+ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+
+ osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
+ /* The embedded objset_t handle points back at its implementation. */
+ osi->os.os = osi;
+ osi->os_dsl_dataset = ds;
+ osi->os_spa = spa;
+ osi->os_rootbp = bp;
+ if (!BP_IS_HOLE(osi->os_rootbp)) {
+ /* The objset exists on disk: read its objset_phys_t via the ARC. */
+ uint32_t aflags = ARC_WAIT;
+ zbookmark_t zb;
+ zb.zb_objset = ds ? ds->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+
+ dprintf_bp(osi->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, osi->os_rootbp,
+ dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ arc_getbuf_func, &osi->os_phys_buf,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
+ if (err) {
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
+ osi->os_phys = osi->os_phys_buf->b_data;
+ /* The buffer is released unless the dataset is a snapshot. */
+ if (ds == NULL || dsl_dataset_is_snapshot(ds) == 0)
+ arc_release(osi->os_phys_buf, &osi->os_phys_buf);
+ } else {
+ /* Nothing on disk yet: start from an all-zero objset_phys_t. */
+ osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+ &osi->os_phys_buf, ARC_BUFC_METADATA);
+ osi->os_phys = osi->os_phys_buf->b_data;
+ bzero(osi->os_phys, sizeof (objset_phys_t));
+ }
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off). Snapshots don't need to know, and
+ * registering would complicate clone promotion.
+ */
+ if (ds && ds->ds_phys->ds_num_children == 0) {
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "copies",
+ copies_changed_cb, osi);
+ if (err) {
+ /*
+ * NOTE(review): callbacks that registered successfully
+ * before the failing one are not unregistered here even
+ * though osi is freed below -- confirm this cannot leave
+ * a callback pointing at freed memory.
+ */
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
+ &osi->os_phys_buf) == 1);
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
+ } else if (ds == NULL) {
+ /* It's the meta-objset. */
+ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
+ osi->os_copies = spa_max_replication(spa);
+ }
+
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
+ /*
+ * Metadata always gets compressed and checksummed.
+ * If the data checksum is multi-bit correctable, and it's not
+ * a ZBT-style checksum, then it's suitable for metadata as well.
+ * Otherwise, the metadata checksum defaults to fletcher4.
+ */
+ checksum = osi->os_checksum;
+
+ if (zio_checksum_table[checksum].ci_correctable &&
+ !zio_checksum_table[checksum].ci_zbt)
+ osi->os_md_checksum = checksum;
+ else
+ osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
+
+ /*
+ * Per-txg dirty and free dnode lists; both are linked through the
+ * same dn_dirty_link[i] field of the dnode.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ }
+ list_create(&osi->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ osi->os_meta_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+
+ /*
+ * We should be the only thread trying to do this because we
+ * have ds_opening_lock
+ */
+ if (ds) {
+ /* Publish osi as the dataset's user pointer, evicted via dmu_objset_evict(). */
+ VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi,
+ dmu_objset_evict));
+ }
+
+ *osip = osi;
+ return (0);
+}
+
+/*
+ * Attach the objset implementation of dataset ds to the handle os,
+ * opening it first if the dataset has no objset_impl_t cached as its
+ * user pointer.  The lookup-or-open is serialized by ds_opening_lock,
+ * which is dropped again on every path, including a failed open.
+ *
+ * Returns the error from dmu_objset_open_impl(), EINVAL when type is not
+ * DMU_OST_ANY and differs from the objset's stored type, otherwise 0.
+ * The handle's mode is set to DS_MODE_NONE; callers that opened the
+ * dataset themselves overwrite it.
+ */
+static int
+dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type)
+{
+	objset_impl_t *osi;
+	int err = 0;
+
+	mutex_enter(&ds->ds_opening_lock);
+	osi = dsl_dataset_get_user_ptr(ds);
+	if (osi == NULL) {
+		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+		    ds, &ds->ds_phys->ds_bp, &osi);
+	}
+	/* Must not return with the lock held, even when the open failed. */
+	mutex_exit(&ds->ds_opening_lock);
+	if (err != 0)
+		return (err);
+
+	os->os = osi;
+	os->os_mode = DS_MODE_NONE;
+
+	if (type != DMU_OST_ANY && type != os->os->os_phys->os_type)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * Allocate an objset handle for an already-open dataset and attach the
+ * dataset's objset to it.  On success the handle is returned in *osp;
+ * on failure the handle is freed, *osp is left untouched and the error
+ * from dmu_objset_open_ds_os() is returned.
+ */
+int
+dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp)
+{
+	objset_t *handle;
+	int error;
+
+	handle = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+	error = dmu_objset_open_ds_os(ds, handle, type);
+	if (error != 0) {
+		kmem_free(handle, sizeof (objset_t));
+		return (error);
+	}
+
+	*osp = handle;
+	return (error);
+}
+
+/*
+ * Open the named dataset in the given mode and return an objset handle
+ * for it in *osp.  Called from the ZPL.
+ *
+ * mode must not be DS_MODE_NONE.  type is either DMU_OST_ANY or the type
+ * the objset is required to have.  On failure *osp is left untouched,
+ * the dataset (if it was opened) is closed again, the handle is freed
+ * and the error is returned.
+ */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+    objset_t **osp)
+{
+	objset_t *os;
+	dsl_dataset_t *ds;
+	int err;
+
+	ASSERT(mode != DS_MODE_NONE);
+
+	/* The handle's address doubles as the tag for the dataset open. */
+	os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+	err = dsl_dataset_open(name, mode, os, &ds);
+	if (err) {
+		kmem_free(os, sizeof (objset_t));
+		return (err);
+	}
+
+	err = dmu_objset_open_ds_os(ds, os, type);
+	if (err) {
+		/*
+		 * Close the dataset while the tag still names live memory
+		 * and only then free the handle, as dmu_objset_rollback()
+		 * does.
+		 */
+		dsl_dataset_close(ds, mode, os);
+		kmem_free(os, sizeof (objset_t));
+		return (err);
+	}
+
+	os->os_mode = mode;
+	*osp = os;
+	return (0);
+}
+
+/*
+ * Release an objset handle.  Unless the handle's mode is DS_MODE_NONE,
+ * the dataset is closed first with the handle's mode and the handle as
+ * tag; the handle itself is always freed.
+ */
+void
+dmu_objset_close(objset_t *os)
+{
+	if (os->os_mode != DS_MODE_NONE) {
+		dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+		dsl_dataset_close(ds, os->os_mode, os);
+	}
+
+	kmem_free(os, sizeof (objset_t));
+}
+
+/*
+ * Evict the dbufs of every dnode on the objset's os_dnodes list that
+ * currently has holds, handling the meta-dnode last.
+ *
+ * Returns nonzero if, afterwards, the head of os_dnodes is not the
+ * meta-dnode (i.e. some other dnode is still on the list), 0 otherwise.
+ */
+int
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ objset_impl_t *osi = os->os;
+ dnode_t *dn;
+
+ mutex_enter(&osi->os_lock);
+
+ /* process the mdn last, since the other dnodes have holds on it */
+ list_remove(&osi->os_dnodes, osi->os_meta_dnode);
+ list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
+
+ /*
+ * Find the first dnode with holds. We have to do this dance
+ * because dnode_add_ref() only works if you already have a
+ * hold. If there are no holds then it has no dbufs so OK to
+ * skip.
+ */
+ for (dn = list_head(&osi->os_dnodes);
+ dn && !dnode_add_ref(dn, FTAG);
+ dn = list_next(&osi->os_dnodes, dn))
+ continue;
+
+ while (dn) {
+ dnode_t *next_dn = dn;
+
+ /*
+ * Take a hold on the next dnode that has holds before
+ * dropping os_lock, so the walk can continue from it
+ * after the lock has been released and re-taken.
+ */
+ do {
+ next_dn = list_next(&osi->os_dnodes, next_dn);
+ } while (next_dn && !dnode_add_ref(next_dn, FTAG));
+
+ /* The eviction itself runs without os_lock held. */
+ mutex_exit(&osi->os_lock);
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+ mutex_enter(&osi->os_lock);
+ dn = next_dn;
+ }
+ mutex_exit(&osi->os_lock);
+ /* NOTE(review): the list head is read here after os_lock was dropped. */
+ return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
+}
+
+/*
+ * Destroy an in-core objset.  Installed by dmu_objset_open_impl() as the
+ * evict function of the dataset's user pointer; arg is the
+ * objset_impl_t to tear down and ds its dataset (may be NULL).
+ *
+ * Unregisters the property callbacks, evicts all dbufs, closes the
+ * meta-dnode, frees the ZIL, drops the objset_phys_t buffer and finally
+ * frees the objset_impl_t itself.
+ */
+void
+dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+{
+ objset_impl_t *osi = arg;
+ objset_t os;
+ int i;
+
+ /* Nothing may still be dirty or pending free in any txg. */
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
+ ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
+ }
+
+ /* Same condition under which dmu_objset_open_impl() registered them. */
+ if (ds && ds->ds_phys->ds_num_children == 0) {
+ VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "copies",
+ copies_changed_cb, osi));
+ }
+
+ /*
+ * We should need only a single pass over the dnode list, since
+ * nothing can be added to the list at this point.
+ */
+ /* A stack objset_t is enough: dmu_objset_evict_dbufs() only reads os->os. */
+ os.os = osi;
+ (void) dmu_objset_evict_dbufs(&os);
+
+ /* Only the meta-dnode may remain, and it must have no dbufs left. */
+ ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
+
+ dnode_special_close(osi->os_meta_dnode);
+ zil_free(osi->os_zil);
+
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
+ mutex_destroy(&osi->os_lock);
+ mutex_destroy(&osi->os_obj_lock);
+ mutex_destroy(&osi->os_user_ptr_lock);
+ kmem_free(osi, sizeof (objset_impl_t));
+}
+
+/*
+ * Create a new objset of the given type rooted at bp and return its
+ * objset_impl_t.  Must be called in syncing context.  Called from dsl
+ * for the meta-objset (ds == NULL) and from dmu_objset_create_sync()
+ * for ordinary datasets.
+ *
+ * The objset is opened with dmu_objset_open_impl() (which must succeed),
+ * its meta-dnode is allocated, and the type is stored in the physical
+ * header.
+ */
+objset_impl_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ objset_impl_t *osi;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* dmu_objset_open_impl() requires ds_opening_lock when ds is set. */
+ if (ds)
+ mutex_enter(&ds->ds_opening_lock);
+ VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
+ if (ds)
+ mutex_exit(&ds->ds_opening_lock);
+ mdn = osi->os_meta_dnode;
+
+ dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
+ DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes.
+ */
+ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT * sizeof (dnode_phys_t))
+ levels++;
+
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = levels;
+ }
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ osi->os_phys->os_type = type;
+
+ /*
+ * NOTE(review): ds may be NULL here (meta-objset); presumably
+ * dsl_dataset_dirty() tolerates a NULL dataset -- confirm.
+ */
+ dsl_dataset_dirty(ds, tx);
+
+ return (osi);
+}
+
+/*
+ * Arguments handed from dmu_objset_create() to its check and sync
+ * functions (dmu_objset_create_check() and dmu_objset_create_sync()).
+ */
+struct oscarg {
+ /* optional callback invoked on a newly created, non-clone objset */
+ void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+ /* opaque argument passed to userfunc */
+ void *userarg;
+ /* dataset to clone from, or NULL when not creating a clone */
+ dsl_dataset_t *clone_parent;
+ /* last component of the new dataset's name */
+ const char *lastname;
+ /* objset type to create */
+ dmu_objset_type_t type;
+ /* flags passed through to dsl_dataset_create_sync() */
+ uint64_t flags;
+};
+
+/*
+ * Check function for dmu_objset_create(); arg1 is the parent dsl_dir_t
+ * and arg2 the struct oscarg.
+ *
+ * Returns EEXIST if a child directory named oa->lastname already
+ * exists, any zap_lookup() error other than ENOENT, EXDEV when cloning
+ * across pools, EINVAL when the clone parent has no children recorded
+ * (i.e. is not a snapshot), and 0 otherwise.
+ */
+/*ARGSUSED*/
+static int
+dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	struct oscarg *oa = arg2;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_dataset_t *parent;
+	uint64_t ddobj;
+	int err;
+
+	/* The name must not be taken: only ENOENT lets us continue. */
+	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+	    oa->lastname, sizeof (uint64_t), 1, &ddobj);
+	if (err == 0)
+		return (EEXIST);
+	if (err != ENOENT)
+		return (err);
+
+	parent = oa->clone_parent;
+	if (parent == NULL)
+		return (0);
+
+	/* You can't clone across pools. */
+	if (parent->ds_dir->dd_pool != dd->dd_pool)
+		return (EXDEV);
+
+	/* You can only clone snapshots, not the head datasets. */
+	if (parent->ds_phys->ds_num_children == 0)
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * Sync function for dmu_objset_create(); arg1 is the parent dsl_dir_t
+ * and arg2 the struct oscarg.  Runs in syncing context.
+ *
+ * Creates the dataset, and if its block pointer is still a hole (an
+ * empty objset rather than a clone) also creates the objset itself and
+ * runs the caller's callback on it.  The creation is then logged to the
+ * pool history.
+ */
+static void
+dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct oscarg *oa = arg2;
+ dsl_dataset_t *ds;
+ blkptr_t *bp;
+ uint64_t dsobj;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsobj = dsl_dataset_create_sync(dd, oa->lastname,
+ oa->clone_parent, oa->flags, cr, tx);
+
+ /* Open the dataset just created; this must succeed. */
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
+ bp = dsl_dataset_get_blkptr(ds);
+ if (BP_IS_HOLE(bp)) {
+ objset_impl_t *osi;
+
+ /* This is an empty dmu_objset; not a clone. */
+ osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, bp, oa->type, tx);
+
+ if (oa->userfunc)
+ oa->userfunc(&osi->os, oa->userarg, cr, tx);
+ }
+
+ spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", dsobj);
+
+ dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+}
+
+/*
+ * Create a new objset (dataset) called name.
+ *
+ * name - full name of the new dataset; must not contain '@'
+ * type - objset type to create
+ * clone_parent - objset to clone from, or NULL; must have the same type
+ * flags - passed through to dsl_dataset_create_sync()
+ * func, arg - optional callback (and its argument) run in syncing
+ * context on a newly created, non-clone objset
+ *
+ * Returns 0 on success, the error from dsl_dir_open(), EEXIST if name
+ * resolves completely to an existing dsl_dir, EINVAL on a clone type
+ * mismatch, or the error from the sync task.
+ */
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent, uint64_t flags,
+ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
+{
+ dsl_dir_t *pdd;
+ const char *tail;
+ int err = 0;
+ struct oscarg oa = { 0 };
+
+ ASSERT(strchr(name, '@') == NULL);
+ /* Open the parent directory; tail receives the unresolved last component. */
+ err = dsl_dir_open(name, FTAG, &pdd, &tail);
+ if (err)
+ return (err);
+ /* No remaining component: the whole name already exists. */
+ if (tail == NULL) {
+ dsl_dir_close(pdd, FTAG);
+ return (EEXIST);
+ }
+
+ dprintf("name=%s\n", name);
+
+ oa.userfunc = func;
+ oa.userarg = arg;
+ oa.lastname = tail;
+ oa.type = type;
+ oa.flags = flags;
+
+ if (clone_parent != NULL) {
+ /*
+ * You can't clone to a different type.
+ */
+ if (clone_parent->os->os_phys->os_type != type) {
+ dsl_dir_close(pdd, FTAG);
+ return (EINVAL);
+ }
+ oa.clone_parent = clone_parent->os->os_dsl_dataset;
+ }
+ /* Run the check and sync functions as a sync task. */
+ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+ dmu_objset_create_sync, pdd, &oa, 5);
+ dsl_dir_close(pdd, FTAG);
+ return (err);
+}
+
+/*
+ * Destroy the dataset called name.
+ *
+ * The dataset is first opened exclusively and read-only.  If that works,
+ * its intent log is destroyed and the dataset is handed to
+ * dsl_dataset_destroy(), which also closes it.  Returns the error from
+ * the open, or the result of dsl_dataset_destroy().
+ */
+int
+dmu_objset_destroy(const char *name)
+{
+	objset_t *os;
+	dsl_dataset_t *ds;
+	int error;
+
+	/*
+	 * If it looks like we'll be able to destroy it, and there's
+	 * an unplayed replay log sitting around, destroy the log.
+	 * It would be nicer to do this in dsl_dataset_destroy_sync(),
+	 * but the replay log objset is modified in open context.
+	 */
+	error = dmu_objset_open(name, DMU_OST_ANY,
+	    DS_MODE_EXCLUSIVE|DS_MODE_READONLY, &os);
+	if (error != 0)
+		return (error);
+
+	ds = os->os->os_dsl_dataset;
+	zil_destroy(dmu_objset_zil(os), B_FALSE);
+
+	/*
+	 * dsl_dataset_destroy() closes the ds, using os purely as the
+	 * tag.  Keep the handle allocated until it returns, so the tag
+	 * never refers to freed (and possibly reused) memory, then free
+	 * it.
+	 */
+	error = dsl_dataset_destroy(ds, os);
+	kmem_free(os, sizeof (objset_t));
+
+	return (error);
+}
+
+/*
+ * Roll the objset's dataset back via dsl_dataset_rollback().
+ *
+ * This will close the objset: on every path, including failure, the
+ * handle os is released and must not be used by the caller afterwards.
+ * Returns EBUSY if the dataset could not be upgraded from
+ * DS_MODE_STANDARD to DS_MODE_EXCLUSIVE, otherwise the result of
+ * dsl_dataset_rollback().
+ */
+int
+dmu_objset_rollback(objset_t *os)
+{
+ int err;
+ dsl_dataset_t *ds;
+
+ ds = os->os->os_dsl_dataset;
+
+ if (!dsl_dataset_tryupgrade(ds, DS_MODE_STANDARD, DS_MODE_EXCLUSIVE)) {
+ dmu_objset_close(os);
+ return (EBUSY);
+ }
+
+ err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
+
+ /*
+ * NB: we close the objset manually because the rollback
+ * actually implicitly called dmu_objset_evict(), thus freeing
+ * the objset_impl_t.
+ */
+ /* os->os must not be dereferenced from here on; only os itself is used. */
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, os);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+}
+
+/*
+ * State shared between dmu_objset_snapshot() and its per-dataset helper
+ * dmu_objset_snapshot_one().
+ */
+struct snaparg {
+ /* sync task group collecting one snapshot task per dataset */
+ dsl_sync_task_group_t *dstg;
+ /* name to give each snapshot */
+ char *snapname;
+ /* name of the dataset being processed; reported back on failure */
+ char failed[MAXPATHLEN];
+ /* B_TRUE to run zfs_secpolicy_snapshot_perms() on each dataset */
+ boolean_t checkperms;
+ /* list of struct osnode: objsets opened (and ZIL-suspended) so far */
+ list_t objsets;
+};
+
+/*
+ * Element of snaparg.objsets: remembers one objset opened by
+ * dmu_objset_snapshot_one() so that dmu_objset_snapshot() can resume its
+ * ZIL and close it afterwards.
+ */
+struct osnode {
+ /* linkage on snaparg.objsets */
+ list_node_t node;
+ /* the open objset */
+ objset_t *os;
+};
+
+/*
+ * Per-dataset step of dmu_objset_snapshot(); arg is the struct snaparg.
+ *
+ * Opens the named objset, suspends its ZIL and queues a snapshot sync
+ * task for it in sn->dstg.  On success the objset stays open and is
+ * appended to sn->objsets for the caller to resume and close later.
+ * name is copied into sn->failed up front so that an error here is
+ * attributed to this dataset.
+ *
+ * Returns 0, a permission error, the open error, EBUSY for an
+ * inconsistent objset, or the error from zil_suspend().
+ */
+static int
+dmu_objset_snapshot_one(char *name, void *arg)
+{
+ struct snaparg *sn = arg;
+ objset_t *os;
+ dmu_objset_stats_t stat;
+ int err;
+
+ /* NOTE(review): unbounded copy; assumes name fits in MAXPATHLEN bytes. */
+ (void) strcpy(sn->failed, name);
+
+ /*
+ * Check permissions only when requested. This only applies when
+ * doing a recursive snapshot. The permission checks for the starting
+ * dataset have already been performed in zfs_secpolicy_snapshot()
+ */
+ if (sn->checkperms == B_TRUE &&
+ (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+ return (err);
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (err != 0)
+ return (err);
+
+ /*
+ * If the objset is in an inconsistent state, return busy.
+ */
+ dmu_objset_fast_stat(os, &stat);
+ if (stat.dds_inconsistent) {
+ dmu_objset_close(os);
+ return (EBUSY);
+ }
+
+ /*
+ * NB: we need to wait for all in-flight changes to get to disk,
+ * so that we snapshot those changes. zil_suspend does this as
+ * a side effect.
+ */
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0) {
+ struct osnode *osn;
+ dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, os->os->os_dsl_dataset,
+ sn->snapname, 3);
+ /* Keep the objset open; the caller resumes the ZIL and closes it. */
+ osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP);
+ osn->os = os;
+ list_insert_tail(&sn->objsets, osn);
+ } else {
+ dmu_objset_close(os);
+ }
+
+ return (err);
+}
+
+/*
+ * Snapshot fsname under the name snapname, and if recursive is set also
+ * every dataset found beneath it by dmu_objset_find(), all as a single
+ * sync task group.
+ *
+ * On error, fsname is overwritten with the name of a dataset that
+ * failed, so it must point to a writable buffer (presumably at least
+ * MAXPATHLEN bytes, like snaparg.failed -- confirm against callers).
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int
+dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
+{
+ dsl_sync_task_t *dst;
+ struct osnode *osn;
+ struct snaparg sn = { 0 };
+ spa_t *spa;
+ int err;
+
+ (void) strcpy(sn.failed, fsname);
+
+ err = spa_open(fsname, &spa, FTAG);
+ if (err)
+ return (err);
+
+ sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ sn.snapname = snapname;
+ list_create(&sn.objsets, sizeof (struct osnode),
+ offsetof(struct osnode, node));
+
+ /* Permissions are only re-checked per dataset in the recursive case. */
+ if (recursive) {
+ sn.checkperms = B_TRUE;
+ err = dmu_objset_find(fsname,
+ dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
+ } else {
+ sn.checkperms = B_FALSE;
+ err = dmu_objset_snapshot_one(fsname, &sn);
+ }
+
+ if (err)
+ goto out;
+
+ err = dsl_sync_task_group_wait(sn.dstg);
+
+ /* Record the name of a dataset whose task failed (the last one wins). */
+ for (dst = list_head(&sn.dstg->dstg_tasks); dst;
+ dst = list_next(&sn.dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+ if (dst->dst_err)
+ dsl_dataset_name(ds, sn.failed);
+ }
+
+out:
+ /* Undo dmu_objset_snapshot_one(): resume each ZIL and close each objset. */
+ while (osn = list_head(&sn.objsets)) {
+ list_remove(&sn.objsets, osn);
+ zil_resume(dmu_objset_zil(osn->os));
+ dmu_objset_close(osn->os);
+ kmem_free(osn, sizeof (struct osnode));
+ }
+ list_destroy(&sn.objsets);
+
+ if (err)
+ (void) strcpy(fsname, sn.failed);
+ dsl_sync_task_group_destroy(sn.dstg);
+ spa_close(spa, FTAG);
+ return (err);
+}
+
+/*
+ * Drain a list of dnodes: each dnode is removed from the list and
+ * synced with dnode_sync().  The meta-dnode must not be on the list.
+ */
+static void
+dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while (dn = list_head(list)) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync()
+ * to accommodate meta-dnode
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ /* Take the dnode off the list before syncing it. */
+ list_remove(list, dn);
+ dnode_sync(dn, tx);
+ }
+}
+
+/*
+ * Callback passed to arc_write() for the objset's root block by
+ * dmu_objset_sync(); arg is the objset_impl_t.
+ *
+ * Recomputes the root block pointer's fill count, sets its type and
+ * level, and -- if the block's identity differs from the original bp --
+ * accounts for the newly born block, first killing the original if it
+ * was born in the txg being synced.
+ */
+/* ARGSUSED */
+static void
+ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+ blkptr_t *bp = os->os_rootbp;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ int i;
+
+ ASSERT(bp == zio->io_bp);
+
+ /*
+ * Update rootbp fill count.
+ */
+ bp->blk_fill = 1; /* count the meta-dnode */
+ for (i = 0; i < dnp->dn_nblkptr; i++)
+ bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+
+ BP_SET_TYPE(bp, DMU_OT_OBJSET);
+ BP_SET_LEVEL(bp, 0);
+
+ /* We must do this after we've set the bp's type and level */
+ if (!DVA_EQUAL(BP_IDENTITY(bp),
+ BP_IDENTITY(&zio->io_bp_orig))) {
+ if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, NULL, os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ }
+}
+
+/*
+ * Second callback passed to arc_write() for the objset's root block by
+ * dmu_objset_sync() (presumably run when the write completes); arg is
+ * the objset_impl_t.  Asserts that the zio carries no error and calls
+ * arc_release() on the objset's objset_phys_t buffer.
+ */
+/* ARGSUSED */
+static void
+killer(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+
+ ASSERT3U(zio->io_error, ==, 0);
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+}
+
+/*
+ * Sync an objset for the txg of tx.  Called from dsl, in syncing
+ * context.
+ *
+ * Creates the write of the root block (objset_phys_t) as a child of
+ * pio, syncs the meta-dnode and then all free and dirty dnodes for this
+ * txg with the root write as their parent, issues the meta-dnode's
+ * level-0 dirty records, syncs the ZIL and finally issues the root
+ * write itself.
+ */
+void
+dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
+{
+ int txgoff;
+ zbookmark_t zb;
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ if (os->os_dsl_dataset == NULL) {
+ /*
+ * This is the MOS. If we have upgraded,
+ * spa_max_replication() could change, so reset
+ * os_copies here.
+ */
+ os->os_copies = spa_max_replication(os->os_spa);
+ }
+
+ /*
+ * Create the root block IO
+ */
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+ /* A root block from an earlier txg is killed before being rewritten. */
+ if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ os->os_rootbp, pio, tx);
+ }
+ zio = arc_write(pio, os->os_spa, os->os_md_checksum,
+ os->os_md_compress,
+ dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
+ tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_METADATA,
+ &zb);
+
+ /*
+ * Sync meta-dnode - the parent IO for the sync is the root block
+ */
+ os->os_meta_dnode->dn_zio = zio;
+ dnode_sync(os->os_meta_dnode, tx);
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+
+ /* Issue the writes of the meta-dnode's level-0 dirty records. */
+ list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ while (dr = list_head(list)) {
+ ASSERT(dr->dr_dbuf->db_level == 0);
+ list_remove(list, dr);
+ if (dr->dr_zio)
+ zio_nowait(dr->dr_zio);
+ }
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ /* Issue the root block write last, after its children were set up. */
+ zio_nowait(zio);
+}
+
+/*
+ * Report space and object usage for the objset by delegating to
+ * dsl_dataset_space() on its dataset; the four output pointers are
+ * passed through unchanged.
+ */
+void
+dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+    uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	dsl_dataset_space(ds, refdbytesp, availbytesp, usedobjsp, availobjsp);
+}
+
+/*
+ * Return the fsid guid of the objset's dataset, as computed by
+ * dsl_dataset_fsid_guid().
+ */
+uint64_t
+dmu_objset_fsid_guid(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	return (dsl_dataset_fsid_guid(ds));
+}
+
+/*
+ * Fill in stat for the objset: the type always comes from the objset's
+ * physical header; the remaining fields are filled by
+ * dsl_dataset_fast_stat() only when the objset has a dataset.
+ */
+void
+dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
+{
+	objset_impl_t *osi = os->os;
+
+	stat->dds_type = osi->os_phys->os_type;
+
+	if (osi->os_dsl_dataset)
+		dsl_dataset_fast_stat(osi->os_dsl_dataset, stat);
+}
+
+/*
+ * Add the objset's statistics to the nvlist nv: the dataset's stats
+ * when the objset has a dataset, followed by the objset type as
+ * ZFS_PROP_TYPE.  An objset without a dataset must be of type
+ * DMU_OST_META.
+ */
+void
+dmu_objset_stats(objset_t *os, nvlist_t *nv)
+{
+	objset_impl_t *osi = os->os;
+
+	ASSERT(os->os->os_dsl_dataset ||
+	    os->os->os_phys->os_type == DMU_OST_META);
+
+	if (osi->os_dsl_dataset != NULL)
+		dsl_dataset_stats(osi->os_dsl_dataset, nv);
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, osi->os_phys->os_type);
+}
+
+/*
+ * Return the result of dsl_dataset_is_snapshot() for the objset's
+ * dataset, or B_FALSE when the objset has no dataset.
+ */
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	if (ds == NULL)
+		return (B_FALSE);
+
+	return (dsl_dataset_is_snapshot(ds));
+}
+
+/*
+ * Look up snapshot name in the dataset's snapshot-names ZAP object with
+ * zap_lookup_norm() (MT_FIRST matching).  real, maxlen and conflict are
+ * passed through to that lookup as its output arguments.
+ *
+ * Returns ENOENT when the dataset has no snapshot-names object,
+ * otherwise the result of zap_lookup_norm().
+ */
+int
+dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
+    boolean_t *conflict)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	uint64_t snapnames_obj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t ignored;
+
+	if (snapnames_obj == 0)
+		return (ENOENT);
+
+	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
+	    snapnames_obj, name, 8, 1, &ignored, MT_FIRST,
+	    real, maxlen, conflict));
+}
+
+/*
+ * Return the snapshot entry at serialized ZAP cursor position *offp of
+ * the dataset's snapshot-names object and advance *offp past it.
+ *
+ * namelen - size in bytes of the buffer at name
+ * name - receives the NUL-terminated snapshot name
+ * idp - if non-NULL, receives the entry's first integer
+ * offp - in: cursor position to read; out: position of the next entry
+ * case_conflict - if non-NULL, receives the entry's normalization
+ * conflict flag
+ *
+ * Returns 0 on success, ENOENT when the dataset has no snapshot-names
+ * object or the cursor cannot retrieve an entry, and ENAMETOOLONG when
+ * the name (with its terminator) does not fit in namelen bytes.
+ */
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+
+	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+		return (ENOENT);
+
+	zap_cursor_init_serialized(&cursor,
+	    ds->ds_dir->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, *offp);
+
+	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+		zap_cursor_fini(&cursor);
+		return (ENOENT);
+	}
+
+	/*
+	 * namelen is signed, so test it for <= 0 explicitly: compared
+	 * directly against strlen() a negative value would be converted
+	 * to a huge unsigned one and let the strcpy() below overrun name.
+	 */
+	if (namelen <= 0 || strlen(attr.za_name) + 1 > (size_t)namelen) {
+		zap_cursor_fini(&cursor);
+		return (ENAMETOOLONG);
+	}
+
+	(void) strcpy(name, attr.za_name);
+	if (idp)
+		*idp = attr.za_first_integer;
+	if (case_conflict)
+		*case_conflict = attr.za_normalization_conflict;
+	zap_cursor_advance(&cursor);
+	*offp = zap_cursor_serialize(&cursor);
+	zap_cursor_fini(&cursor);
+
+	return (0);
+}
+
+/*
+ * Return the child directory entry at serialized ZAP cursor position
+ * *offp of the dataset's dsl_dir and advance *offp past it.
+ *
+ * namelen - size in bytes of the buffer at name
+ * name - receives the NUL-terminated child name
+ * idp - if non-NULL, receives the entry's first integer
+ * offp - in: cursor position to read; out: position of the next entry
+ *
+ * Returns 0 on success, ENOENT when os is not the head dataset of its
+ * directory (a snapshot has no child directories) or the cursor cannot
+ * retrieve an entry, and ENAMETOOLONG when the name (with its
+ * terminator) does not fit in namelen bytes.
+ */
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *idp, uint64_t *offp)
+{
+	dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+
+	/* there is no next dir on a snapshot! */
+	if (os->os->os_dsl_dataset->ds_object !=
+	    dd->dd_phys->dd_head_dataset_obj)
+		return (ENOENT);
+
+	zap_cursor_init_serialized(&cursor,
+	    dd->dd_pool->dp_meta_objset,
+	    dd->dd_phys->dd_child_dir_zapobj, *offp);
+
+	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+		zap_cursor_fini(&cursor);
+		return (ENOENT);
+	}
+
+	/*
+	 * namelen is signed, so test it for <= 0 explicitly: compared
+	 * directly against strlen() a negative value would be converted
+	 * to a huge unsigned one and let the strcpy() below overrun name.
+	 */
+	if (namelen <= 0 || strlen(attr.za_name) + 1 > (size_t)namelen) {
+		zap_cursor_fini(&cursor);
+		return (ENAMETOOLONG);
+	}
+
+	(void) strcpy(name, attr.za_name);
+	if (idp)
+		*idp = attr.za_first_integer;
+	zap_cursor_advance(&cursor);
+	*offp = zap_cursor_serialize(&cursor);
+	zap_cursor_fini(&cursor);
+
+	return (0);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ *
+ * With DS_FIND_CHILDREN the function first recurses into every child
+ * directory; with DS_FIND_SNAPSHOTS it then calls func on each snapshot
+ * of name; finally func is called on name itself when its directory has
+ * a head dataset.  The walk stops at the first nonzero return from func
+ * (or from a recursive call) and that value is returned.
+ */
+int
+dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ objset_t *os;
+ uint64_t snapobj;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ char *child;
+ int do_self, err;
+
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+
+ /* NB: the $MOS dir doesn't have a head dataset */
+ do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr->za_integer_length == sizeof (uint64_t));
+ ASSERT(attr->za_num_integers == 1);
+
+ /*
+ * Build the child's full name as
+ * "<this directory's name>/<child name>".
+ */
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "/");
+ (void) strcat(child, attr->za_name);
+ err = dmu_objset_find(child, func, arg, flags);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
+
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ return (err);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ /* If the objset cannot be opened, the snapshot pass is silently skipped. */
+ if ((flags & DS_FIND_SNAPSHOTS) &&
+ dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
+
+ /* Only the snapshot-names object number is needed; close right away. */
+ snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
+ dmu_objset_close(os);
+
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr->za_integer_length == sizeof (uint64_t));
+ ASSERT(attr->za_num_integers == 1);
+
+ /* Snapshot names have the form "<directory name>@<snapshot>". */
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr->za_name);
+ err = func(child, arg);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ dsl_dir_close(dd, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+
+ if (err)
+ return (err);
+
+ /*
+ * Apply to self if appropriate.
+ */
+ if (do_self)
+ err = func(name, arg);
+ return (err);
+}
+
+/*
+ * Store user_ptr as the objset's user pointer.  The caller must hold
+ * the objset's os_user_ptr_lock.
+ */
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+	objset_impl_t *osi = os->os;
+
+	ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+
+	osi->os_user_ptr = user_ptr;
+}
+
+/*
+ * Return the objset's user pointer.  The caller must hold the objset's
+ * os_user_ptr_lock.
+ */
+void *
+dmu_objset_get_user(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+
+	return (osi->os_user_ptr);
+}
diff --git a/zfs/lib/libzpool/dmu_traverse.c b/zfs/lib/libzpool/dmu_traverse.c
new file mode 100644
index 000000000..6d5723249
--- /dev/null
+++ b/zfs/lib/libzpool/dmu_traverse.c
@@ -0,0 +1,917 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dmu_traverse.c 1.7 08/04/01 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/zvol.h>
+/*
+ * Shift count that converts a block id at indirection 'level' into a
+ * level-0 block offset, given that each indirect block holds 2^width
+ * block pointers (callers pass dn_indblkshift - SPA_BLKPTRSHIFT).
+ */
+#define BP_SPAN_SHIFT(level, width) ((level) * (width))
+/*
+ * Two block pointers are treated as equal when DVA_EQUAL() holds for
+ * their BP_IDENTITY() and their blk_birth fields match.
+ */
+#define BP_EQUAL(b1, b2) \
+ (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
+ (b1)->blk_birth == (b2)->blk_birth)
+
+/*
+ * Compare two bookmarks.
+ *
+ * For ADVANCE_PRE, the visitation order is:
+ *
+ * objset 0, 1, 2, ..., ZB_MAXOBJSET.
+ * object 0, 1, 2, ..., ZB_MAXOBJECT.
+ * blkoff 0, 1, 2, ...
+ * level ZB_MAXLEVEL, ..., 2, 1, 0.
+ *
+ * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
+ * ordering vector is:
+ *
+ * < objset, object, blkoff, -level >
+ *
+ * For ADVANCE_POST, the starting offsets aren't sequential but ending
+ * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
+ * The visitation order is:
+ *
+ * objset 1, 2, ..., ZB_MAXOBJSET, 0.
+ * object 1, 2, ..., ZB_MAXOBJECT, 0.
+ * blkoff 1, 2, ...
+ * level 0, 1, 2, ..., ZB_MAXLEVEL.
+ *
+ * and thus a valid ordering vector is:
+ *
+ * < objset - 1, object - 1, blkoff, level >
+ *
+ * Both orderings can be expressed as:
+ *
+ * < objset + bias, object + bias, blkoff, level ^ bias >
+ *
+ * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
+ * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
+ *
+ * Special case: an objset's osphys is represented as level -1 of object 0.
+ * It is always either the very first or very last block we visit in an objset.
+ * Therefore, if either bookmark's level is -1, level alone determines order.
+ *
+ * Arguments:
+ * szb, ezb - the two bookmarks to compare.
+ * dnp - dnode supplying dn_indblkshift for the block-offset comparison;
+ * may be NULL, in which case the comparison stops after the
+ * object number and block offsets/levels are not examined.
+ * advance - traversal flags; only ADVANCE_PRE is examined here. Any
+ * value without ADVANCE_PRE (including 0) selects the
+ * ADVANCE_POST ordering.
+ *
+ * Returns a negative value if szb sorts before ezb, a positive value if
+ * it sorts after, and 0 if the two compare equal.
+ */
+static int
+compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
+ int advance)
+{
+ int bias = (advance & ADVANCE_PRE) ? 0 : -1; /* 0: pre-order, -1: post-order */
+ uint64_t sblkoff, eblkoff;
+ int slevel, elevel, wshift;
+
+ if (szb->zb_objset + bias < ezb->zb_objset + bias)
+ return (-1);
+
+ if (szb->zb_objset + bias > ezb->zb_objset + bias)
+ return (1);
+
+ slevel = szb->zb_level;
+ elevel = ezb->zb_level;
+
+ if ((slevel | elevel) < 0) /* true when either level is negative */
+ return ((slevel ^ bias) - (elevel ^ bias));
+
+ if (szb->zb_object + bias < ezb->zb_object + bias)
+ return (-1);
+
+ if (szb->zb_object + bias > ezb->zb_object + bias)
+ return (1);
+
+ if (dnp == NULL) /* no dnode geometry: cannot compare block offsets */
+ return (0);
+
+ wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
+ eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
+
+ if (sblkoff < eblkoff)
+ return (-1);
+
+ if (sblkoff > eblkoff)
+ return (1);
+
+ return ((elevel ^ bias) - (slevel ^ bias)); /* tie on offset: order by level */
+}
+
+/*
+ * Assign all four fields of the bookmark that 'zb' points to.
+ *
+ * The fields are stored in the order objset, object, level, blkid, so an
+ * argument may refer to a field of *zb that is assigned later (or to the
+ * field being assigned) and still see its old value.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single statement
+ * (e.g. as the unbraced body of an if/else); arguments are parenthesized
+ * so arbitrary expressions may be passed.
+ */
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)		\
+do {								\
+	(zb)->zb_objset = (objset);				\
+	(zb)->zb_object = (object);				\
+	(zb)->zb_level = (level);				\
+	(zb)->zb_blkid = (blkid);				\
+} while (0)
+
+/*
+ * Assign only the level and blkid fields of the bookmark that 'zb' points
+ * to, leaving objset and object untouched.
+ */
+#define	SET_BOOKMARK_LB(zb, level, blkid)			\
+do {								\
+	(zb)->zb_level = (level);				\
+	(zb)->zb_blkid = (blkid);				\
+} while (0)
+
+/*
+ * Move the segment's start bookmark to the first position of 'objset'.
+ *
+ * Pre-order: the start becomes <objset, 0, -1, 0>; an objset at or beyond
+ * ZB_MAXOBJSET yields ERANGE without touching the bookmark.
+ * Post-order: an objset at or beyond ZB_MAXOBJSET wraps to objset 0, and
+ * the start becomes <objset, 1, 0, 0>.
+ *
+ * Returns ERANGE if the new start lies past the segment end, else EAGAIN.
+ */
+static int
+advance_objset(zseg_t *zseg, uint64_t objset, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+	int preorder = (advance & ADVANCE_PRE) != 0;
+
+	if (preorder && objset >= ZB_MAXOBJSET)
+		return (ERANGE);
+
+	if (preorder) {
+		SET_BOOKMARK(start, objset, 0, -1, 0);
+	} else {
+		if (objset >= ZB_MAXOBJSET)
+			objset = 0;
+		SET_BOOKMARK(start, objset, 1, 0, 0);
+	}
+
+	return (compare_bookmark(start, &zseg->seg_end, NULL, advance) > 0 ?
+	    ERANGE : EAGAIN);
+}
+
+/*
+ * Move the segment's start bookmark to 'object' within the current objset.
+ *
+ * Pre-order: running off the end of the object space (object >=
+ * ZB_MAXOBJECT) steps to the osphys bookmark of the next objset;
+ * otherwise start at the top level (ZB_MAXLEVEL) of 'object'.
+ * Post-order: if the bookmark is currently on object 0, move to the
+ * osphys bookmark (level -1) of this objset; otherwise start at level 0
+ * of 'object', wrapping an out-of-range object number to 0.
+ *
+ * Returns ERANGE if the new start lies past the segment end, else EAGAIN.
+ */
+static int
+advance_object(zseg_t *zseg, uint64_t object, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+	int preorder = (advance & ADVANCE_PRE) != 0;
+
+	if (preorder && object >= ZB_MAXOBJECT) {
+		start->zb_objset++;
+		start->zb_object = 0;
+		SET_BOOKMARK_LB(start, -1, 0);
+	} else if (preorder) {
+		start->zb_object = object;
+		SET_BOOKMARK_LB(start, ZB_MAXLEVEL, 0);
+	} else if (start->zb_object == 0) {
+		SET_BOOKMARK_LB(start, -1, 0);
+	} else {
+		if (object >= ZB_MAXOBJECT)
+			object = 0;
+		start->zb_object = object;
+		SET_BOOKMARK_LB(start, 0, 0);
+	}
+
+	return (compare_bookmark(start, &zseg->seg_end, NULL, advance) > 0 ?
+	    ERANGE : EAGAIN);
+}
+
+/*
+ * Step the segment's start bookmark past an objset's osphys block
+ * (object 0, level -1, blkid 0 -- asserted on entry).
+ *
+ * Pre-order: continue with the top level of object 0 in the same objset.
+ * Post-order: objset 0 has nothing after its osphys (ERANGE); any other
+ * objset continues at <objset + 1, 1, 0, 0>.
+ *
+ * Returns ERANGE if the traversal is finished or the new start lies past
+ * the segment end, else EAGAIN.
+ */
+static int
+advance_from_osphys(zseg_t *zseg, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+
+	ASSERT(start->zb_object == 0);
+	ASSERT(start->zb_level == -1);
+	ASSERT(start->zb_blkid == 0);
+
+	if (!(advance & ADVANCE_PRE)) {
+		if (start->zb_objset == 0)
+			return (ERANGE);
+		start->zb_objset++;
+		start->zb_object = 1;
+		SET_BOOKMARK_LB(start, 0, 0);
+	} else {
+		SET_BOOKMARK_LB(start, ZB_MAXLEVEL, 0);
+	}
+
+	if (compare_bookmark(start, &zseg->seg_end, NULL, advance) > 0)
+		return (ERANGE);
+
+	return (EAGAIN);
+}
+
+/*
+ * Step the segment's start bookmark to the next <level, blkid> within the
+ * object described by 'dnp'.
+ *
+ * 'rc' is the result of visiting the current block: in pre-order mode a
+ * zero rc on an indirect block means "descend into its first child";
+ * anything else means "skip this subtree".
+ *
+ * Returns ERANGE when the object (dn_maxblkid) or the segment end has
+ * been passed, else EAGAIN.
+ */
+static int
+advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int toplevel = dnp->dn_nlevels - 1;
+	int lvl = start->zb_level;
+	uint64_t id = start->zb_blkid;
+
+	if (advance & ADVANCE_PRE) {
+		if (lvl > 0 && rc == 0) {
+			/* Descend to the first child of this indirect block. */
+			id <<= wshift;
+			lvl--;
+		} else {
+			/* Move to the next sibling at this level. */
+			id++;
+
+			if ((id << BP_SPAN_SHIFT(lvl, wshift)) >
+			    dnp->dn_maxblkid)
+				return (ERANGE);
+
+			/*
+			 * If the sibling starts a new parent, climb so the
+			 * parent itself is visited first.
+			 */
+			while (lvl < toplevel &&
+			    P2PHASE(id, 1ULL << wshift) == 0) {
+				id >>= wshift;
+				lvl++;
+			}
+		}
+	} else {
+		if (lvl < toplevel && P2PHASE(id + 1, 1ULL << wshift) == 0) {
+			/* Last child of its parent: visit the parent next. */
+			id >>= wshift;
+			lvl++;
+		} else {
+			/* Restart at level 0 just past this block's span. */
+			id = (id + 1) << BP_SPAN_SHIFT(lvl, wshift);
+			lvl = 0;
+		}
+
+		/* Past the last block: climb until in range or out of levels. */
+		for (;;) {
+			if ((id << BP_SPAN_SHIFT(lvl, wshift)) <=
+			    dnp->dn_maxblkid)
+				break;
+			if (lvl == toplevel)
+				return (ERANGE);
+			id >>= wshift;
+			lvl++;
+		}
+	}
+	SET_BOOKMARK_LB(start, lvl, id);
+
+	return (compare_bookmark(start, &zseg->seg_end, dnp, advance) > 0 ?
+	    ERANGE : EAGAIN);
+}
+
+/*
+ * Invoke the user callback (th_func) for the block cached in 'bc' and
+ * return its result.
+ *
+ * When th_func reports an error it must use one of:
+ *
+ *	EINTR    - abort the traversal immediately.
+ *	ERESTART - the error has been noted; carry on.
+ *
+ * 'zseg' is only dereferenced when bc->bc_errno is 0, so callers
+ * reporting a read error may pass NULL.
+ */
+static int
+traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
+{
+	int read_ok = (bc->bc_errno == 0);
+
+	/*
+	 * Prune against maxtxg here, at the last possible moment.
+	 *
+	 * Pruning against mintxg happens earlier because it pays off: a
+	 * block born in txg 37 cannot have descendants born later, so whole
+	 * subtrees can be skipped.  Nothing comparable holds for maxtxg --
+	 * blk_birth >= maxtxg says nothing about the children, and with
+	 * copy-on-write the top-level blocks are nearly always new -- so
+	 * the simplest correct place to test it is right before the
+	 * callback.
+	 */
+	if (read_ok && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
+		return (0);
+
+	/*
+	 * Debug check: the order in which blocks are visited must agree
+	 * with compare_bookmark().  Log blocks are exempt; they have no
+	 * defined ordering and are visited (or not) together with the
+	 * objset_phys_t.
+	 */
+	if (read_ok && bc != &th->th_zil_cache) {
+		zbookmark_t *cur = &bc->bc_bookmark;
+		zbookmark_t *last = &th->th_lastcb;
+		dnode_phys_t *dnp = bc->bc_dnode;
+		int adv = th->th_advance;
+
+		ASSERT(compare_bookmark(cur, &zseg->seg_end, dnp, adv) <= 0);
+		ASSERT(compare_bookmark(cur, &zseg->seg_start, dnp, adv) == 0);
+		ASSERT(compare_bookmark(last, cur, dnp, adv) < 0 ||
+		    last->zb_level == ZB_NO_LEVEL);
+		*last = *cur;
+	}
+
+	th->th_callbacks++;
+	return (th->th_func(bc, th->th_spa, th->th_arg));
+}
+
+/*
+ * Make cache slot 'bc' describe block pointer 'bp' (belonging to dnode
+ * 'dnp') and, when the slot has a data buffer, load the block into it.
+ *
+ * Nothing is read when the slot already holds the same block (BP_EQUAL),
+ * when the slot has no buffer, or when 'bp' is a hole.  Otherwise the
+ * block comes from arc_tryread() or, failing that, a synchronous
+ * zio_read(), byte-swapped if the block pointer says so.
+ *
+ * A bookmark equal to th_noread is treated as an EIO without issuing I/O.
+ *
+ * Returns 0 on success.  On a read error, bc_errno is set, the user
+ * callback is invoked, and the callback's result is returned (asserted
+ * to be EAGAIN, EINTR or ERESTART); the cached blk_birth is then set to
+ * -1ULL, presumably so the next BP_EQUAL test does not count as a hit.
+ */
+static int
+traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
+    dnode_phys_t *dnp)
+{
+	zbookmark_t *zb = &bc->bc_bookmark;
+	int error;
+
+	th->th_hits++;
+
+	bc->bc_dnode = dnp;
+	bc->bc_errno = 0;
+
+	/* Same block as last time: the slot contents are still current. */
+	if (BP_EQUAL(&bc->bc_blkptr, bp))
+		return (0);
+
+	bc->bc_blkptr = *bp;
+
+	/* No buffer for this slot: only the block pointer is tracked. */
+	if (bc->bc_data == NULL)
+		return (0);
+
+	if (BP_IS_HOLE(bp)) {
+		ASSERT(th->th_advance & ADVANCE_HOLES);
+		return (0);
+	}
+
+	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
+		error = EIO;
+	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
+		error = 0;
+		th->th_arc_hits++;
+	} else {
+		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
+		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
+
+		/* Indirect blocks are arrays of 64-bit words. */
+		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
+			(zb->zb_level > 0 ? byteswap_uint64_array :
+			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
+			    BP_GET_LSIZE(bp));
+		th->th_reads++;
+	}
+
+	if (error) {
+		bc->bc_errno = error;
+		/* zseg is unused by the callback path when bc_errno != 0. */
+		error = traverse_callback(th, NULL, bc);
+		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
+		bc->bc_blkptr.blk_birth = -1ULL;
+	}
+
+	/*
+	 * Cast every variadic argument to the exact type its conversion
+	 * expects: the slot index is a pointer difference (ptrdiff_t) and
+	 * the bookmark fields are fixed-width typedefs.
+	 */
+	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
+	    (unsigned int)(bc - &th->th_cache[0][0]), error,
+	    (unsigned long long)zb->zb_objset,
+	    (unsigned long long)zb->zb_object,
+	    (int)zb->zb_level,
+	    (unsigned long long)zb->zb_blkid);
+
+	return (error);
+}
+
+/*
+ * Walk from the top-level block pointers of 'dnp' down to the level and
+ * blkid named by zseg->seg_start, loading each indirect block on the way
+ * into th_cache[depth][level].
+ *
+ * Returns:
+ *	0	- the target block (or a hole, if holes are being reported)
+ *		  was reached; its cache slot is filled in.
+ *	ENOTBLK	- leading block pointers were skipped (not newer than
+ *		  seg_mintxg, and not reportable holes); seg_start is moved
+ *		  to the last skipped pointer so the caller can advance.
+ *	ERANGE	- seg_start is outside this dnode's levels/block pointers.
+ *	other	- whatever traverse_read() returned; unless that is EAGAIN,
+ *		  seg_start is set to the block that failed.
+ */
+static int
+find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
+{
+	zbookmark_t *zb = &zseg->seg_start;
+	traverse_blk_cache_t *bc;
+	blkptr_t *bp = dnp->dn_blkptr;
+	int i, first, level;
+	int nbp = dnp->dn_nblkptr;
+	int minlevel = zb->zb_level;
+	int maxlevel = dnp->dn_nlevels - 1;
+	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
+	uint64_t blkid = zb->zb_blkid >> bp_shift;
+	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
+	int rc;
+
+	if (minlevel > maxlevel || blkid >= nbp)
+		return (ERANGE);
+
+	for (level = maxlevel; level >= minlevel; level--) {
+		first = P2PHASE(blkid, 1ULL << wshift);
+
+		/* Find the first pointer at or after 'first' worth visiting. */
+		i = first;
+		while (i < nbp) {
+			if (bp[i].blk_birth > zseg->seg_mintxg)
+				break;
+			if (BP_IS_HOLE(&bp[i]) && do_holes)
+				break;
+			i++;
+		}
+
+		if (i != first) {
+			/* Report the last pointer that was skipped. */
+			i--;
+			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
+			return (ENOTBLK);
+		}
+
+		bc = &th->th_cache[depth][level];
+
+		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
+		    level, blkid);
+
+		rc = traverse_read(th, bc, bp + i, dnp);
+		if (rc != 0) {
+			if (rc != EAGAIN) {
+				SET_BOOKMARK_LB(zb, level, blkid);
+			}
+			return (rc);
+		}
+
+		if (BP_IS_HOLE(&bp[i])) {
+			SET_BOOKMARK_LB(zb, level, blkid);
+			th->th_lastcb.zb_level = ZB_NO_LEVEL;
+			return (0);
+		}
+
+		/* Descend: the block just read is the next pointer array. */
+		nbp = 1 << wshift;
+		bp = bc->bc_data;
+		bp_shift -= wshift;
+		blkid = zb->zb_blkid >> bp_shift;
+	}
+
+	return (0);
+}
+
+/*
+ * Find the first allocated dnode with object number >= *objectp in the
+ * meta-dnode 'mdn' of 'objset', optionally restricted to dn_type 'type'
+ * (-1 accepts any type).  Only dnode blocks born after 'txg' are
+ * examined; blocks are cached at th_cache[depth].
+ *
+ * On success returns 0 and stores the object number in *objectp and a
+ * pointer to the cached dnode_phys_t in *dnpp.  If the search runs off
+ * the end (ERANGE), *objectp is set to ZB_MAXOBJECT.  EAGAIN and EINTR
+ * from find_block() are passed through unchanged.
+ */
+static int
+get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
+    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
+{
+	zseg_t zseg;
+	zbookmark_t *zb = &zseg.seg_start;
+	uint64_t wanted = *objectp;
+	int rc;
+
+	SET_BOOKMARK(zb, objset, 0, 0, wanted / DNODES_PER_BLOCK);
+	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
+
+	zseg.seg_mintxg = txg;
+	zseg.seg_maxtxg = -1ULL;
+
+	do {
+		rc = find_block(th, &zseg, mdn, depth);
+
+		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+			break;
+
+		if (rc == 0 && zb->zb_level == 0) {
+			dnode_phys_t *blk = th->th_cache[depth][0].bc_data;
+			uint64_t base = zb->zb_blkid * DNODES_PER_BLOCK;
+			int slot;
+
+			for (slot = 0; slot < DNODES_PER_BLOCK; slot++) {
+				dnode_phys_t *cand = &blk[slot];
+
+				if (base + slot < wanted)
+					continue;
+				if (cand->dn_type == DMU_OT_NONE)
+					continue;
+				if (type != -1 && cand->dn_type != type)
+					continue;
+
+				*objectp = base + slot;
+				*dnpp = cand;
+				return (0);
+			}
+		}
+
+		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
+	} while (rc != ERANGE);
+
+	if (rc == ERANGE)
+		*objectp = ZB_MAXOBJECT;
+
+	return (rc);
+}
+
+/*
+ * zil_parse() block callback: report one intent-log block to the user
+ * callback through th_zil_cache.
+ *
+ * Blocks not newer than the current segment's mintxg are skipped.  A
+ * block is reported when the log has been claimed (claim_txg != 0) or
+ * when it was born before spa_first_txg().
+ */
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	traverse_handle_t *th = arg;
+	traverse_blk_cache_t *zilbc = &th->th_zil_cache;
+	zbookmark_t *mark = &zilbc->bc_bookmark;
+	zseg_t *seg = list_head(&th->th_seglist);
+
+	if (bp->blk_birth <= seg->seg_mintxg)
+		return;
+
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(th->th_spa))
+		return;
+
+	mark->zb_object = 0;
+	mark->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+	zilbc->bc_blkptr = *bp;
+	(void) traverse_callback(th, seg, zilbc);
+}
+
+/*
+ * zil_parse() record callback: for TX_WRITE records, report the data
+ * block referenced by the record to the user callback through
+ * th_zil_cache.
+ *
+ * Only blocks newer than the current segment's mintxg, in a claimed log
+ * (claim_txg != 0), and born at or after claim_txg are reported.
+ */
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	traverse_handle_t *th = arg;
+	traverse_blk_cache_t *zilbc = &th->th_zil_cache;
+	zbookmark_t *mark = &zilbc->bc_bookmark;
+	zseg_t *seg = list_head(&th->th_seglist);
+	lr_write_t *lr;
+	blkptr_t *bp;
+
+	if (lrc->lrc_txtype != TX_WRITE)
+		return;
+
+	lr = (lr_write_t *)lrc;
+	bp = &lr->lr_blkptr;
+
+	if (bp->blk_birth <= seg->seg_mintxg)
+		return;
+
+	if (claim_txg == 0 || bp->blk_birth < claim_txg)
+		return;
+
+	mark->zb_object = lr->lr_foid;
+	mark->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+	zilbc->bc_blkptr = *bp;
+	(void) traverse_callback(th, seg, zilbc);
+}
+
+/*
+ * Visit the intent log hanging off the objset_phys_t cached in 'bc'
+ * (which must be the ZB_MDN_CACHE osphys slot, bookmark level -1).
+ * Log blocks and TX_WRITE data blocks are reported via
+ * traverse_zil_block() and traverse_zil_record().
+ */
+static void
+traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
+{
+	dsl_pool_t *dp = spa_get_dsl(th->th_spa);
+	zil_header_t *zh = &((objset_phys_t *)bc->bc_data)->os_zil_header;
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zilog_t *zilog;
+
+	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
+	ASSERT(bc->bc_bookmark.zb_level == -1);
+
+	/*
+	 * Only blocks that have been claimed but not yet replayed are of
+	 * interest (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && (spa_mode & FWRITE))
+		return;
+
+	/* Log callbacks inherit the objset/level of the osphys bookmark. */
+	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
+
+	zilog = zil_alloc(dp->dp_meta_objset, zh);
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+	    claim_txg);
+	zil_free(zilog);
+}
+/*
+ * Perform one bounded slice of work on segment 'zseg', resuming from
+ * zseg->seg_start and updating it as blocks are visited.
+ *
+ * The steps are: read the MOS objset_phys_t through 'mosbp'; if the
+ * bookmark names a non-zero objset, locate its dsl_dataset dnode in the
+ * MOS and read that objset's objset_phys_t; handle the osphys bookmark
+ * (level -1) if that is where we are; otherwise locate the bookmarked
+ * object's dnode and issue callbacks for its blocks until the object is
+ * exhausted, spa_sync() wants the traverse lock, or 'worklimit' blocks
+ * have been processed.
+ *
+ * The caller (traverse_more) asserts that the result is one of ERANGE
+ * (segment finished), EAGAIN (call again) or EINTR (callback aborted).
+ * May drop the traverse rwlock and clear th_locked (ADVANCE_NOLOCK).
+ */
+static int
+traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ dnode_phys_t *dn, *dn_tmp;
+ int worklimit = 100; /* max loop iterations before returning EAGAIN */
+ int rc;
+
+ dprintf("<%llu, %llu, %d, %llx>\n",
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; /* slot for the MOS objset_phys_t */
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
+
+ rc = traverse_read(th, bc, mosbp, dn);
+
+ if (rc) /* If we get ERESTART, we've got nowhere left to go */
+ return (rc == ERESTART ? EINTR : rc);
+
+ ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
+
+ if (zb->zb_objset != 0) {
+ uint64_t objset = zb->zb_objset;
+ dsl_dataset_phys_t *dsp;
+
+ rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
+ DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
+
+ if (objset != zb->zb_objset) /* bookmarked objset not found: skip ahead */
+ rc = advance_objset(zseg, objset, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dsp = DN_BONUS(dn_tmp);
+
+ bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; /* slot for this objset's objset_phys_t */
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
+
+ /*
+ * If we're traversing an open snapshot, we know that it
+ * can't be deleted (because it's open) and it can't change
+ * (because it's a snapshot). Therefore, once we've gotten
+ * from the uberblock down to the snapshot's objset_phys_t,
+ * we no longer need to synchronize with spa_sync(); we're
+ * traversing a completely static block tree from here on.
+ */
+ if (th->th_advance & ADVANCE_NOLOCK) {
+ ASSERT(th->th_locked);
+ rw_exit(spa_traverse_rwlock(th->th_spa));
+ th->th_locked = 0;
+ }
+
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+
+ if (rc != 0) {
+ if (rc == ERESTART) /* callback acknowledged the error: try the next objset */
+ rc = advance_objset(zseg, zb->zb_objset + 1,
+ th->th_advance);
+ return (rc);
+ }
+
+ if (th->th_advance & ADVANCE_PRUNE)
+ zseg->seg_mintxg =
+ MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+ }
+
+ if (zb->zb_level == -1) { /* bookmark is at the objset_phys_t itself */
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_blkid == 0);
+ ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
+
+ if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if ((th->th_advance & ADVANCE_ZIL) &&
+ zb->zb_objset != 0)
+ traverse_zil(th, bc);
+ }
+
+ return (advance_from_osphys(zseg, th->th_advance));
+ }
+
+ if (zb->zb_object != 0) {
+ uint64_t object = zb->zb_object;
+
+ rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
+ zseg->seg_mintxg, -1, ZB_MDN_CACHE);
+
+ if (object != zb->zb_object) /* bookmarked object not found: skip ahead */
+ rc = advance_object(zseg, object, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dn = dn_tmp;
+ }
+
+ if (zb->zb_level == ZB_MAXLEVEL) /* placeholder meaning "top level of this object" */
+ zb->zb_level = dn->dn_nlevels - 1;
+
+ for (;;) {
+ rc = find_block(th, zseg, dn, ZB_DN_CACHE);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0) {
+ bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
+ ASSERT(bc->bc_dnode == dn);
+ ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if (BP_IS_HOLE(&bc->bc_blkptr)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ rc = ENOTBLK; /* non-zero rc keeps advance_block from descending */
+ }
+ }
+
+ rc = advance_block(zseg, dn, rc, th->th_advance);
+
+ if (rc == ERANGE)
+ break;
+
+ /*
+ * Give spa_sync() a chance to run.
+ */
+ if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
+ th->th_syncs++;
+ return (EAGAIN);
+ }
+
+ if (--worklimit == 0)
+ return (EAGAIN);
+ }
+
+ if (rc == ERANGE) /* done with this object: move on to the next one */
+ rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
+
+ return (rc);
+}
+
+/*
+ * Traverse every block of dataset 'ds' born after 'txg_start', calling
+ * 'func' with 'arg' for each one.  Reads are issued with
+ * ZIO_FLAG_MUSTSUCCEED.  Returns the final traverse_more() result.
+ *
+ * The caller must guarantee that the dsl_dataset_t stays alive for the
+ * duration of the traversal.
+ */
+int
+traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
+    blkptr_cb_t func, void *arg)
+{
+	traverse_handle_t *th;
+	int err;
+
+	th = traverse_init(ds->ds_dir->dd_pool->dp_spa, func, arg, advance,
+	    ZIO_FLAG_MUSTSUCCEED);
+	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+
+	/* Keep going for as long as the traversal asks to be resumed. */
+	do {
+		err = traverse_more(th);
+	} while (err == EAGAIN);
+
+	traverse_fini(th);
+	return (err);
+}
+
+/*
+ * Traverse all blocks of the ZVOL_OBJ object in objset 'os', calling
+ * 'func' with 'arg' for each one.  Reads are issued with
+ * ZIO_FLAG_CANFAIL.  Returns the final traverse_more() result.
+ */
+int
+traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg)
+{
+	traverse_handle_t *th;
+	int err;
+
+	th = traverse_init(dmu_objset_spa(os), func, arg, advance,
+	    ZIO_FLAG_CANFAIL);
+	traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
+
+	/* Keep going for as long as the traversal asks to be resumed. */
+	do {
+		err = traverse_more(th);
+	} while (err == EAGAIN);
+
+	traverse_fini(th);
+	return (err);
+}
+
+/*
+ * Run one slice of traversal work on the first segment of th's list.
+ *
+ * Returns 0 when no segments remain, EAGAIN when there is more work to
+ * do (including when the current segment has just been completed and
+ * removed), or EINTR when the callback aborted the traversal.
+ *
+ * The traverse rwlock is held as reader while the segment is processed,
+ * unless traverse_segment() drops it itself.
+ */
+int
+traverse_more(traverse_handle_t *th)
+{
+	zseg_t *seg = list_head(&th->th_seglist);
+	krwlock_t *lock = spa_traverse_rwlock(th->th_spa);
+	blkptr_t *rootbp = spa_get_rootblkptr(th->th_spa);
+	uint64_t orig_mintxg;	/* XXX won't be necessary with real itinerary */
+	int rc;
+
+	if (seg == NULL)
+		return (0);
+
+	th->th_restarts++;
+
+	/* traverse_segment() may raise seg_mintxg; undo that afterwards. */
+	orig_mintxg = seg->seg_mintxg;
+
+	rw_enter(lock, RW_READER);
+	th->th_locked = 1;
+
+	rc = traverse_segment(th, seg, rootbp);
+	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+
+	if (th->th_locked)
+		rw_exit(lock);
+	th->th_locked = 0;
+
+	seg->seg_mintxg = orig_mintxg;
+
+	if (rc != ERANGE)
+		return (rc);
+
+	/* Segment exhausted: discard it and let the caller try the next. */
+	list_remove(&th->th_seglist, seg);
+	kmem_free(seg, sizeof (*seg));
+	return (EAGAIN);
+}
+
+/*
+ * Allocate a segment covering bookmarks <sobjset, sobject, slevel, sblkid>
+ * through <eobjset, eobject, elevel, eblkid> and append it to th's list.
+ *
+ * Note: (mintxg, maxtxg) is an open interval, so neither bound is itself
+ * included; every block covered by the segment satisfies
+ * mintxg < birth < maxtxg.
+ */
+static void
+traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
+    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+{
+	zseg_t *seg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+	zbookmark_t *from = &seg->seg_start;
+	zbookmark_t *to = &seg->seg_end;
+
+	seg->seg_mintxg = mintxg;
+	seg->seg_maxtxg = maxtxg;
+
+	from->zb_objset = sobjset;
+	from->zb_object = sobject;
+	from->zb_level = slevel;
+	from->zb_blkid = sblkid;
+
+	to->zb_objset = eobjset;
+	to->zb_object = eobject;
+	to->zb_level = elevel;
+	to->zb_blkid = eblkid;
+
+	list_insert_tail(&th->th_seglist, seg);
+}
+
+/*
+ * Queue a segment covering every block of one object.  The segment ends
+ * at <objset, object, 0, ZB_MAXBLKID>; it starts at blkid 0 of the top
+ * level (ZB_MAXLEVEL) for pre-order traversal, or of level 0 otherwise.
+ */
+void
+traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t objset, uint64_t object)
+{
+	int start_level = (th->th_advance & ADVANCE_PRE) ? ZB_MAXLEVEL : 0;
+
+	traverse_add_segment(th, mintxg, maxtxg,
+	    objset, object, start_level, 0,
+	    objset, object, 0, ZB_MAXBLKID);
+}
+
+/*
+ * Queue a segment covering one whole objset.
+ *
+ * Pre-order: from the osphys bookmark <objset, 0, -1, 0> through
+ * <objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID>.
+ * Post-order: from <objset, 1, 0, 0> through the osphys bookmark.
+ */
+void
+traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t objset)
+{
+	if (th->th_advance & ADVANCE_PRE) {
+		traverse_add_segment(th, mintxg, maxtxg,
+		    objset, 0, -1, 0,
+		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+		return;
+	}
+
+	traverse_add_segment(th, mintxg, maxtxg,
+	    objset, 1, 0, 0,
+	    objset, 0, -1, 0);
+}
+
+/*
+ * Queue a segment covering the entire pool.
+ *
+ * Pre-order: from <0, 0, -1, 0> through
+ * <ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID>.
+ * Post-order: from <1, 1, 0, 0> through <0, 0, -1, 0>.
+ */
+void
+traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
+{
+	if (th->th_advance & ADVANCE_PRE) {
+		traverse_add_segment(th, mintxg, maxtxg,
+		    0, 0, -1, 0,
+		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+		return;
+	}
+
+	traverse_add_segment(th, mintxg, maxtxg,
+	    1, 1, 0, 0,
+	    0, 0, -1, 0);
+}
+
+/*
+ * Allocate and initialize a traversal handle.
+ *
+ * Every cache slot gets a SPA_MAXBLOCKSIZE buffer, except the level-0
+ * slot of ZB_DN_CACHE, which is only given one when ADVANCE_DATA is set
+ * in 'advance'.  Release the handle with traverse_fini().
+ */
+traverse_handle_t *
+traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
+    int zio_flags)
+{
+	traverse_handle_t *th = kmem_zalloc(sizeof (*th), KM_SLEEP);
+	int d, l;
+
+	th->th_spa = spa;
+	th->th_func = func;
+	th->th_arg = arg;
+	th->th_advance = advance;
+	th->th_zio_flags = zio_flags;
+	th->th_lastcb.zb_level = ZB_NO_LEVEL;
+	th->th_noread.zb_level = ZB_NO_LEVEL;
+
+	list_create(&th->th_seglist, sizeof (zseg_t),
+	    offsetof(zseg_t, seg_node));
+
+	for (d = 0; d < ZB_DEPTH; d++) {
+		for (l = 0; l < ZB_MAXLEVEL; l++) {
+			int data_slot = (d == ZB_DN_CACHE && l == 0);
+
+			/* No buffer for data blocks unless asked to read them. */
+			if (data_slot && !(advance & ADVANCE_DATA))
+				continue;
+
+			th->th_cache[d][l].bc_data =
+			    zio_buf_alloc(SPA_MAXBLOCKSIZE);
+		}
+	}
+
+	return (th);
+}
+
+/*
+ * Tear down a traversal handle: free every allocated cache buffer, free
+ * any segments still queued, log the handle's counters, and free the
+ * handle itself.
+ */
+void
+traverse_fini(traverse_handle_t *th)
+{
+	zseg_t *seg;
+	int d, l;
+
+	for (d = 0; d < ZB_DEPTH; d++) {
+		for (l = 0; l < ZB_MAXLEVEL; l++) {
+			traverse_blk_cache_t *bc = &th->th_cache[d][l];
+
+			if (bc->bc_data == NULL)
+				continue;
+			zio_buf_free(bc->bc_data, SPA_MAXBLOCKSIZE);
+		}
+	}
+
+	for (seg = list_head(&th->th_seglist); seg != NULL;
+	    seg = list_head(&th->th_seglist)) {
+		list_remove(&th->th_seglist, seg);
+		kmem_free(seg, sizeof (*seg));
+	}
+
+	list_destroy(&th->th_seglist);
+
+	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
+	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
+	    th->th_syncs, th->th_restarts);
+
+	kmem_free(th, sizeof (*th));
+}
diff --git a/zfs/lib/libzpool/dmu_tx.c b/zfs/lib/libzpool/dmu_tx.c
new file mode 100644
index 000000000..8c40c2680
--- /dev/null
+++ b/zfs/lib/libzpool/dmu_tx.c
@@ -0,0 +1,1034 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dmu_tx.c 1.19 08/03/20 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h> /* for fzap_default_block_shift */
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+/*
+ * Function-pointer type taking a transaction, a dnode and two 64-bit
+ * arguments and returning nothing.
+ * NOTE(review): not referenced in the part of this file visible here;
+ * confirm its users before changing the signature.
+ */
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+
+/*
+ * Allocate a zeroed transaction bound to DSL directory 'dd' (which may
+ * be NULL).  When 'dd' is given, the transaction's pool is taken from
+ * it.  The hold list is initialized empty.
+ */
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+	dmu_tx_t *tx;
+
+	tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+
+	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+	    offsetof(dmu_tx_hold_t, txh_node));
+
+	tx->tx_dir = dd;
+	if (dd != NULL)
+		tx->tx_pool = dd->dd_pool;
+
+#ifdef ZFS_DEBUG
+	refcount_create(&tx->tx_space_written);
+	refcount_create(&tx->tx_space_freed);
+#endif
+	return (tx);
+}
+
+/*
+ * Create an unassigned transaction for objset 'os', bound to the DSL
+ * directory of its dataset and recording the dataset's previous-snapshot
+ * txg.
+ */
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	dmu_tx_t *tx = dmu_tx_create_dd(ds->ds_dir);
+
+	tx->tx_objset = os;
+	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(ds);
+	return (tx);
+}
+
+/*
+ * Create a transaction that is already assigned to 'txg' in pool 'dp'.
+ * 'txg' must not be later than the pool's currently open txg.  The
+ * transaction has no DSL directory and is marked tx_anyobj.
+ */
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+	dmu_tx_t *tx;
+
+	tx = dmu_tx_create_dd(NULL);
+	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+
+	tx->tx_anyobj = TRUE;
+	tx->tx_txg = txg;
+	tx->tx_pool = dp;
+
+	return (tx);
+}
+/*
+ * Return the transaction's tx_anyobj flag.  In the code visible here
+ * that flag is set (to TRUE) only by dmu_tx_create_assigned().
+ */
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+/*
+ * Return the transaction's tx_anyobj flag; the body is identical to
+ * dmu_tx_is_syncing().
+ */
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+/*
+ * Record a hold on 'object' of objset 'os' in transaction 'tx'.
+ *
+ * Unless 'object' is DMU_NEW_OBJECT, its dnode is held; if that fails the
+ * error is stored in tx->tx_err and NULL is returned.  When the
+ * transaction already has a txg, the dnode is additionally marked as
+ * assigned to that txg.
+ *
+ * 'type', 'arg1' and 'arg2' are recorded in the hold only under
+ * ZFS_DEBUG.  Returns the new hold, appended to tx->tx_holds.
+ */
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+	dmu_tx_hold_t *hold;
+	dnode_t *dn = NULL;
+
+	if (object != DMU_NEW_OBJECT) {
+		int err = dnode_hold(os->os, object, tx, &dn);
+
+		if (err != 0) {
+			tx->tx_err = err;
+			return (NULL);
+		}
+
+		if (tx->tx_txg != 0) {
+			mutex_enter(&dn->dn_mtx);
+			/*
+			 * dn->dn_assigned_txg == tx->tx_txg would be
+			 * harmless, but nothing can make it happen (for
+			 * now, at least).
+			 */
+			ASSERT(dn->dn_assigned_txg == 0);
+			dn->dn_assigned_txg = tx->tx_txg;
+			(void) refcount_add(&dn->dn_tx_holds, tx);
+			mutex_exit(&dn->dn_mtx);
+		}
+	}
+
+	hold = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+	hold->txh_tx = tx;
+	hold->txh_dnode = dn;
+#ifdef ZFS_DEBUG
+	hold->txh_type = type;
+	hold->txh_arg1 = arg1;
+	hold->txh_arg2 = arg2;
+#endif
+	list_insert_tail(&tx->tx_holds, hold);
+
+	return (hold);
+}
+
+/*
+ * Add a THT_NEWOBJECT hold on 'object' to 'tx', unless the transaction
+ * is a syncing one.
+ */
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+	/*
+	 * A syncing transaction may manipulate any object anyhow, and
+	 * holding the dnode_t there can cause problems.
+	 */
+	if (dmu_tx_is_syncing(tx))
+		return;
+
+	(void) dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT, 0, 0);
+}
+
+/*
+ * Probe block <level, blkid> of 'dn' for I/O errors by holding its dbuf
+ * and reading it (DB_RF_CANFAIL | DB_RF_NOPREFETCH), optionally as a
+ * child of 'zio'.
+ *
+ * Returns EIO if the dbuf cannot be held, otherwise the result of
+ * dbuf_read().
+ */
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+	dmu_buf_impl_t *db;
+	int err;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold_level(dn, level, blkid, FTAG);
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (db != NULL) {
+		err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+		dbuf_rele(db, FTAG);
+		return (err);
+	}
+
+	return (EIO);
+}
+
+/*
+ * Account for a write of 'len' bytes at offset 'off' to the object held
+ * by 'txh'.
+ *
+ * When the hold has a dnode, the blocks the write would have to read
+ * first (unaligned first/last level-0 blocks and the interior level-1
+ * blocks) are probed for I/O errors; any error is stored in the
+ * transaction's tx_err and no space is charged.  Otherwise a worst-case
+ * estimate of the data and indirect blocks that may be written is added
+ * to txh_space_towrite.
+ *
+ * A zero 'len' is a no-op.
+ */
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+	dnode_t *dn = txh->txh_dnode;
+	uint64_t start, end, i;
+	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+	int err = 0;
+
+	if (len == 0)
+		return;
+
+	min_bs = SPA_MINBLOCKSHIFT;
+	max_bs = SPA_MAXBLOCKSHIFT;
+	min_ibs = DN_MIN_INDBLKSHIFT;
+	max_ibs = DN_MAX_INDBLKSHIFT;
+
+	/*
+	 * For i/o error checking, read the first and last level-0
+	 * blocks (if they are not aligned), and all the level-1 blocks.
+	 */
+	if (dn) {
+		if (dn->dn_maxblkid == 0) {
+			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+			if (err)
+				goto out;
+		} else {
+			zio_t *zio = zio_root(dn->dn_objset->os_spa,
+			    NULL, NULL, ZIO_FLAG_CANFAIL);
+			int zerr;
+
+			/* first level-0 block */
+			start = off >> dn->dn_datablkshift;
+			if (P2PHASE(off, dn->dn_datablksz) ||
+			    len < dn->dn_datablksz)
+				err = dmu_tx_check_ioerr(zio, dn, 0, start);
+
+			/* last level-0 block */
+			end = (off+len-1) >> dn->dn_datablkshift;
+			if (err == 0 && end != start &&
+			    P2PHASE(off+len, dn->dn_datablksz))
+				err = dmu_tx_check_ioerr(zio, dn, 0, end);
+
+			/* level-1 blocks */
+			if (err == 0 && dn->dn_nlevels > 1) {
+				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+				for (i = start+1; i < end && err == 0; i++)
+					err = dmu_tx_check_ioerr(zio, dn, 1, i);
+			}
+
+			/*
+			 * Always wait on the root zio, even after a failed
+			 * probe: bailing out without zio_wait() would leave
+			 * the root zio (and any reads already issued under
+			 * it) unreaped.  The first error seen wins.
+			 */
+			zerr = zio_wait(zio);
+			if (err == 0)
+				err = zerr;
+			if (err)
+				goto out;
+		}
+	}
+
+	/*
+	 * If there's more than one block, the blocksize can't change,
+	 * so we can make a more precise estimate.  Alternatively,
+	 * if the dnode's ibs is larger than max_ibs, always use that.
+	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+	 * the code will still work correctly on existing pools.
+	 */
+	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
+		min_ibs = max_ibs = dn->dn_indblkshift;
+		if (dn->dn_datablkshift != 0)
+			min_bs = max_bs = dn->dn_datablkshift;
+	}
+
+	/*
+	 * 'end' is the last thing we will access, not one past.
+	 * This way we won't overflow when accessing the last byte.
+	 */
+	start = P2ALIGN(off, 1ULL << max_bs);
+	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+	txh->txh_space_towrite += end - start + 1;
+
+	start >>= min_bs;
+	end >>= min_bs;
+
+	epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+	/*
+	 * The object contains at most 2^(64 - min_bs) blocks,
+	 * and each indirect level maps 2^epbs.
+	 */
+	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+		start >>= epbs;
+		end >>= epbs;
+		/*
+		 * If we increase the number of levels of indirection,
+		 * we'll need new blkid=0 indirect blocks.  If start == 0,
+		 * we're already accounting for those blocks; and if
+		 * end == 0, we can't increase the number of levels beyond
+		 * that.
+		 */
+		if (start != 0 && end != 0)
+			txh->txh_space_towrite += 1ULL << max_ibs;
+		txh->txh_space_towrite += (end - start + 1) << max_ibs;
+	}
+
+	ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
+
+out:
+	if (err)
+		txh->txh_tx->tx_err = err;
+}
+
+/*
+ * Charge the hold for rewriting the dnode itself: one meta-dnode data
+ * block plus one indirect block for each level above the first.
+ *
+ * If the dnode's dbuf already has a block pointer that the dataset could
+ * free, the space counts as an overwrite; otherwise it counts as a new
+ * write, and additionally as "to unref" when a block pointer exists.
+ */
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+	dnode_t *dn = txh->txh_dnode;
+	dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+	uint64_t space = mdn->dn_datablksz +
+	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
+	int has_bp = (dn != NULL && dn->dn_dbuf->db_blkptr != NULL);
+
+	if (has_bp &&
+	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+	    dn->dn_dbuf->db_blkptr->blk_birth)) {
+		txh->txh_space_tooverwrite += space;
+		return;
+	}
+
+	txh->txh_space_towrite += space;
+	if (has_bp)
+		txh->txh_space_tounref += space;
+}
+
+/*
+ * Declare that unassigned transaction 'tx' will write 'len' bytes at
+ * offset 'off' of 'object', and account for the space that may need.
+ * If the hold cannot be created, nothing is charged here
+ * (dmu_tx_hold_object_impl() records the error in the transaction).
+ */
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+	dmu_tx_hold_t *hold;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(len < DMU_MAX_ACCESS);
+	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_WRITE, off, len);
+
+	if (hold != NULL) {
+		dmu_tx_count_write(hold, off, len);
+		dmu_tx_count_dnode(hold);
+	}
+}
+
+/*
+ * Estimate the effect of freeing the byte range [off, off+len) of the held
+ * dnode and add the result to the hold:
+ *
+ *   txh_space_tofree  += bp_get_dasize() of every examined block pointer
+ *                        that dsl_dataset_block_freeable() accepts;
+ *   txh_space_tounref += BP_GET_ASIZE() of every examined block pointer.
+ *
+ * The count is skipped entirely when the dnode looks dirty, is assigned to
+ * a txg, or has no levels on disk.  On a read error of an indirect block
+ * the error is stored in txh->txh_tx->tx_err and counting stops; whatever
+ * was accumulated up to that point is still added to the hold.
+ */
+static void
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+	uint64_t blkid, nblks;
+	uint64_t space = 0, unref = 0;
+	dnode_t *dn = txh->txh_dnode;
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+	int dirty;
+
+	/*
+	 * We don't need to use any locking to check for dirtyness
+	 * because it's OK if we get stale data -- the dnode may become
+	 * dirty immediately after our check anyway.  This is just a
+	 * means to avoid the expensive count when we aren't sure we
+	 * need it.  We need to be able to deal with a dirty dnode.
+	 */
+	dirty = list_link_active(&dn->dn_dirty_link[0]) |
+	    list_link_active(&dn->dn_dirty_link[1]) |
+	    list_link_active(&dn->dn_dirty_link[2]) |
+	    list_link_active(&dn->dn_dirty_link[3]);
+	if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+		return;
+
+	/*
+	 * the struct_rwlock protects us against dn_phys->dn_nlevels
+	 * changing, in case (against all odds) we manage to dirty &
+	 * sync out the changes after we check for being dirty.
+	 * also, dbuf_hold_impl() wants us to have the struct_rwlock.
+	 *
+	 * It's fine to use dn_datablkshift rather than the dn_phys
+	 * equivalent because if it is changing, maxblkid==0 and we will
+	 * bail.
+	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_phys->dn_maxblkid == 0) {
+		/* Single-block object: only counts if the whole block goes. */
+		if (off == 0 && len >= dn->dn_datablksz) {
+			blkid = 0;
+			nblks = 1;
+		} else {
+			rw_exit(&dn->dn_struct_rwlock);
+			return;
+		}
+	} else {
+		blkid = off >> dn->dn_datablkshift;
+		/*
+		 * NOTE(review): this is the block id of the end of the
+		 * range, not a block count, so it over-estimates whenever
+		 * off != 0.  It is clamped to dn_maxblkid below; confirm
+		 * whether a tighter count is wanted before changing it.
+		 */
+		nblks = (off + len) >> dn->dn_datablkshift;
+
+		if (blkid >= dn->dn_phys->dn_maxblkid) {
+			rw_exit(&dn->dn_struct_rwlock);
+			return;
+		}
+		if (blkid + nblks > dn->dn_phys->dn_maxblkid)
+			nblks = dn->dn_phys->dn_maxblkid - blkid;
+
+		/* don't bother after 128,000 blocks */
+		nblks = MIN(nblks, 128*1024);
+	}
+
+	/* One level: the block pointers live directly in the dnode. */
+	if (dn->dn_phys->dn_nlevels == 1) {
+		int i;
+		for (i = 0; i < nblks; i++) {
+			blkptr_t *bp = dn->dn_phys->dn_blkptr;
+			ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+			bp += blkid + i;
+			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+				dprintf_bp(bp, "can free old%s", "");
+				space += bp_get_dasize(spa, bp);
+			}
+			unref += BP_GET_ASIZE(bp);
+		}
+		nblks = 0;
+	}
+
+	/* Otherwise walk the level-1 indirect blocks covering the range. */
+	while (nblks) {
+		dmu_buf_impl_t *dbuf;
+		int err, epbs, blkoff, tochk;
+
+		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+		blkoff = P2PHASE(blkid, 1<<epbs);
+		tochk = MIN((1<<epbs) - blkoff, nblks);
+
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
+		if (err == 0) {
+			int i;
+			blkptr_t *bp;
+
+			err = dbuf_read(dbuf, NULL,
+			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+			if (err != 0) {
+				txh->txh_tx->tx_err = err;
+				dbuf_rele(dbuf, FTAG);
+				break;
+			}
+
+			bp = dbuf->db.db_data;
+			bp += blkoff;
+
+			for (i = 0; i < tochk; i++) {
+				if (dsl_dataset_block_freeable(ds,
+				    bp[i].blk_birth)) {
+					dprintf_bp(&bp[i],
+					    "can free old%s", "");
+					space += bp_get_dasize(spa, &bp[i]);
+				}
+				/*
+				 * Account for the pointer being examined;
+				 * bp itself is never advanced in this loop,
+				 * so plain "bp" would count the first entry
+				 * tochk times.
+				 */
+				unref += BP_GET_ASIZE(&bp[i]);
+			}
+			dbuf_rele(dbuf, FTAG);
+		}
+		/* ENOENT from the hold is tolerated; move to the next chunk. */
+		if (err && err != ENOENT) {
+			txh->txh_tx->tx_err = err;
+			break;
+		}
+
+		blkid += tochk;
+		nblks -= tochk;
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	txh->txh_space_tofree += space;
+	txh->txh_space_tounref += unref;
+}
+
+/*
+ * Declare that this (still unassigned) tx will free the byte range
+ * [off, off+len) of `object'; len may be DMU_OBJECT_END to mean "to the
+ * end of the object".
+ *
+ * The partial first and last blocks are accounted as writes.  When the
+ * dnode has indirect blocks, the level-1 blocks covering the range are
+ * read under a root zio so that I/O errors are detected now; any such
+ * error is recorded in tx->tx_err and the function returns without doing
+ * the dnode/free accounting.
+ */
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+	dmu_tx_hold_t *txh;
+	dnode_t *dn;
+	uint64_t start, end, i;
+	int err, shift;
+	zio_t *zio;
+
+	ASSERT(tx->tx_txg == 0);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_FREE, off, len);
+	if (txh == NULL)
+		return;
+	dn = txh->txh_dnode;
+
+	/* first block */
+	if (off != 0)
+		dmu_tx_count_write(txh, off, 1);
+	/* last block */
+	if (len != DMU_OBJECT_END)
+		dmu_tx_count_write(txh, off+len, 1);
+
+	/* Nothing to free if the range starts past the last block. */
+	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+		return;
+	if (len == DMU_OBJECT_END)
+		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+	/*
+	 * For i/o error checking, read the first and last level-0
+	 * blocks, and all the level-1 blocks.  The above count_write's
+	 * will take care of the level-0 blocks.
+	 */
+	if (dn->dn_nlevels > 1) {
+		shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		start = off >> shift;
+		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+
+		zio = zio_root(tx->tx_pool->dp_spa,
+		    NULL, NULL, ZIO_FLAG_CANFAIL);
+		for (i = start; i <= end; i++) {
+			uint64_t ibyte = i << shift;
+			err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (err == ESRCH)
+				break;
+			if (err) {
+				tx->tx_err = err;
+				/*
+				 * Reap the root zio (and any children
+				 * already issued) before bailing out;
+				 * otherwise it is leaked.
+				 */
+				(void) zio_wait(zio);
+				return;
+			}
+
+			err = dmu_tx_check_ioerr(zio, dn, 1, i);
+			if (err) {
+				tx->tx_err = err;
+				/* Same as above: do not leak the root zio. */
+				(void) zio_wait(zio);
+				return;
+			}
+		}
+		err = zio_wait(zio);
+		if (err) {
+			tx->tx_err = err;
+			return;
+		}
+	}
+
+	dmu_tx_count_dnode(txh);
+	dmu_tx_count_free(txh, off, len);
+}
+
+/*
+ * Declare that this (still unassigned) tx will modify the ZAP object
+ * `object'.  `add' is non-zero when an entry may be added; `name', if
+ * non-NULL, is the entry that will be touched and is looked up here so
+ * that I/O errors on the relevant blocks surface before assignment.
+ *
+ * Errors are reported through tx->tx_err.
+ */
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
+{
+	dmu_tx_hold_t *txh;
+	dnode_t *dn;
+	uint64_t nblocks;
+	int epbs, err;
+
+	ASSERT(tx->tx_txg == 0);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_ZAP, add, (uintptr_t)name);
+	if (txh == NULL)
+		return;
+	dn = txh->txh_dnode;
+
+	dmu_tx_count_dnode(txh);
+
+	if (dn == NULL) {
+		/*
+		 * We will be able to fit a new object's entries into one leaf
+		 * block.  So there will be at most 2 blocks total,
+		 * including the header block.
+		 */
+		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
+		return;
+	}
+
+	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+
+	if (dn->dn_maxblkid == 0 && !add) {
+		/*
+		 * If there is only one block  (i.e. this is a micro-zap)
+		 * and we are not adding anything, the accounting is simple.
+		 */
+		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+		if (err) {
+			tx->tx_err = err;
+			return;
+		}
+
+		/*
+		 * Use max block size here, since we don't know how much
+		 * the size will change between now and the dbuf dirty call.
+		 */
+		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
+			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+		} else {
+			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tounref +=
+			    BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
+		}
+		return;
+	}
+
+	if (dn->dn_maxblkid > 0 && name) {
+		/*
+		 * access the name in this fat-zap so that we'll check
+		 * for i/o errors to the leaf blocks, etc.
+		 */
+		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+		    8, 0, NULL);
+		if (err == EIO) {
+			tx->tx_err = err;
+			return;
+		}
+	}
+
+	/*
+	 * 3 blocks overwritten: target leaf, ptrtbl block, header block
+	 * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
+	 *
+	 * The conditional must be parenthesized: "3 + add ? 3 : 0" parses
+	 * as "(3 + add) ? 3 : 0", which never counts the blocks for an add.
+	 */
+	dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
+	    (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
+
+	/*
+	 * If the modified blocks are scattered to the four winds,
+	 * we'll have to modify an indirect twig for each.
+	 */
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+		txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+}
+
+/*
+ * Declare that this (still unassigned) tx will modify the bonus buffer of
+ * `object'.  Only the dnode itself is charged.
+ */
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *hold;
+
+	ASSERT(tx->tx_txg == 0);
+
+	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_BONUS, 0, 0);
+	if (hold == NULL)
+		return;
+
+	dmu_tx_count_dnode(hold);
+}
+
+/*
+ * Reserve `space' bytes of write space in this (still unassigned) tx
+ * without tying the reservation to an existing object: the hold is
+ * created against DMU_NEW_OBJECT.
+ */
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+	dmu_tx_hold_t *hold;
+
+	ASSERT(tx->tx_txg == 0);
+
+	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
+
+	/* NOTE(review): unlike the sibling hold routines, no NULL check. */
+	hold->txh_space_towrite += space;
+}
+
+/*
+ * Return how many holds in this assigned tx refer to `object'.  Holds
+ * that have no dnode attached are not counted.
+ */
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *cur;
+	int count = 0;
+
+	/*
+	 * By asserting that the tx is assigned, we're counting the
+	 * number of dn_tx_holds, which is the same as the number of
+	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
+	 * dn_tx_holds could be 0.
+	 */
+	ASSERT(tx->tx_txg != 0);
+
+	/* if (tx->tx_anyobj == TRUE) */
+		/* return (0); */
+
+	cur = list_head(&tx->tx_holds);
+	while (cur != NULL) {
+		dnode_t *dn = cur->txh_dnode;
+
+		if (dn != NULL && dn->dn_object == object)
+			count++;
+		cur = list_next(&tx->tx_holds, cur);
+	}
+
+	return (count);
+}
+
+#ifdef ZFS_DEBUG
+/*
+ * Debug-only sanity check: verify that the dbuf `db' being dirtied is
+ * covered by one of the holds registered on the assigned tx `tx'.
+ *
+ * A hold covers the dbuf when both the object matches (match_object) and
+ * the block id falls in the range implied by the hold type and its
+ * txh_arg1/txh_arg2 (match_offset).  Nothing is checked for tx_anyobj
+ * transactions or for the meta dnode.  If no hold covers the dbuf, the
+ * function panics.
+ */
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ dmu_tx_hold_t *txh;
+ int match_object = FALSE, match_offset = FALSE;
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj)
+ return;
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ return;
+
+ /*
+ * match_object and match_offset are sticky across iterations: they
+ * may be satisfied by two different holds.
+ */
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ /*
+ * Convert the hold's byte range (arg1 = offset,
+ * arg2 = length) into block ids at this dbuf's level.
+ */
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ /*
+ * NOTE(review): THT_SPACE has no case here, so a hold
+ * of that type that reaches this switch trips the
+ * "bad txh_type" assertion -- confirm that is intended.
+ */
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * buffer so that we don't need to hold it
+ * when creating a new object.
+ */
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ /* Only the partial first and last blocks get dirtied. */
+ if (blkid == beginblk &&
+ (txh->txh_arg1 != 0 ||
+ dn->dn_maxblkid == 0))
+ match_offset = TRUE;
+ if (blkid == endblk &&
+ txh->txh_arg2 != DMU_OBJECT_END)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ /* A ZAP hold covers any block of the object. */
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ ASSERT(!"bad txh_type");
+ }
+ }
+ if (match_object && match_offset)
+ return;
+ }
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+}
+#endif
+
+/*
+ * Make one attempt to assign `tx' to the currently open txg.
+ *
+ * Returns 0 on success.  Otherwise returns:
+ *   - tx->tx_err, if an error was recorded while the holds were set up;
+ *   - EIO or ERESTART when the pool is in POOL_STATE_IO_FAILURE;
+ *   - ERESTART when a held dnode is still assigned to the previous txg
+ *     (tx_needassign_txh then identifies the hold) or when a specific
+ *     txg was requested and it is not the one obtained;
+ *   - whatever dsl_dir_tempreserve_space() returns on failure.
+ *
+ * On failure after txg_hold_open(), tx->tx_txg stays non-zero; the caller
+ * in this file follows a failed attempt with dmu_tx_unassign().
+ */
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ dmu_tx_hold_t *txh;
+ spa_t *spa = tx->tx_pool->dp_spa;
+ uint64_t lsize, asize, fsize, usize;
+ uint64_t towrite, tofree, tooverwrite, tounref;
+
+ ASSERT3U(tx->tx_txg, ==, 0);
+
+ if (tx->tx_err)
+ return (tx->tx_err);
+
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE) {
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ txg_how != TXG_WAIT)
+ return (EIO);
+
+ return (ERESTART);
+ }
+
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+ tx->tx_needassign_txh = NULL;
+
+ /*
+ * NB: No error returns are allowed after txg_hold_open, but
+ * before processing the dnode holds, due to the
+ * dmu_tx_unassign() logic.
+ */
+
+ /* Bind every held dnode to this txg and total the space estimates. */
+ towrite = tofree = tooverwrite = tounref = 0;
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ /*
+ * Still assigned to the previous txg: remember
+ * which hold blocked us so dmu_tx_unassign()
+ * and dmu_tx_wait() know where to stop/wait.
+ */
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ return (ERESTART);
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += txh->txh_space_towrite;
+ tofree += txh->txh_space_tofree;
+ tooverwrite += txh->txh_space_tooverwrite;
+ tounref += txh->txh_space_tounref;
+ }
+
+ /*
+ * NB: This check must be after we've held the dnodes, so that
+ * the dmu_tx_unassign() logic will work properly
+ */
+ if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+ return (ERESTART);
+
+ /*
+ * If a snapshot has been taken since we made our estimates,
+ * assume that we won't be able to free or overwrite anything.
+ */
+ if (tx->tx_objset &&
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ tx->tx_lastsnap_txg) {
+ towrite += tooverwrite;
+ tooverwrite = tofree = 0;
+ }
+
+ /*
+ * Convert logical size to worst-case allocated size.
+ */
+ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
+ lsize = towrite + tooverwrite;
+ asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
+
+#ifdef ZFS_DEBUG
+ /* Recorded for the debug accounting in dmu_tx_willuse_space(). */
+ tx->tx_space_towrite = asize;
+ tx->tx_space_tofree = tofree;
+ tx->tx_space_tooverwrite = tooverwrite;
+ tx->tx_space_tounref = tounref;
+#endif
+
+ /* Only reserve when there is a dsl_dir and something to write. */
+ if (tx->tx_dir && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir,
+ lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
+ if (err)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Undo a failed dmu_tx_try_assign(): drop the per-dnode tx holds taken so
+ * far (every hold up to, but not including, tx_needassign_txh), release
+ * the txg handle, and mark the tx unassigned again.  The txg that was
+ * tried is remembered in tx_lasttried_txg.  A tx that never got a txg is
+ * left untouched.
+ */
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *cur;
+
+	if (tx->tx_txg == 0)
+		return;
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	cur = list_head(&tx->tx_holds);
+	while (cur != tx->tx_needassign_txh) {
+		dnode_t *dn = cur->txh_dnode;
+
+		if (dn != NULL) {
+			mutex_enter(&dn->dn_mtx);
+			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+			/* Last tx hold gone: free the dnode for other txgs. */
+			if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+				dn->dn_assigned_txg = 0;
+				cv_broadcast(&dn->dn_notxholds);
+			}
+			mutex_exit(&dn->dn_mtx);
+		}
+		cur = list_next(&tx->tx_holds, cur);
+	}
+
+	txg_rele_to_sync(&tx->tx_txgh);
+
+	tx->tx_lasttried_txg = tx->tx_txg;
+	tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group.  txg_how can be one of:
+ *
+ * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
+ *	a new one.  This should be used when you're not holding locks.
+ *	It will only fail if we're truly out of space (or over quota).
+ *
+ * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
+ *	blocking, returns immediately with ERESTART.  This should be used
+ *	whenever you're holding locks.  On an ERESTART error, the caller
+ *	should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3)	A specific txg.  Use this if you need to ensure that multiple
+ *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
+ *	returns ERESTART if it can't assign you into the requested txg.
+ *
+ * Returns 0 on success, otherwise the error from the failed attempt.
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+	int err;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(txg_how != 0);
+	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+	for (;;) {
+		err = dmu_tx_try_assign(tx, txg_how);
+		if (err == 0)
+			break;
+
+		/* Roll back whatever the failed attempt set up. */
+		dmu_tx_unassign(tx);
+
+		/* Only ERESTART under TXG_WAIT is retried. */
+		if (!(err == ERESTART && txg_how == TXG_WAIT))
+			return (err);
+
+		dmu_tx_wait(tx);
+	}
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	return (0);
+}
+
+/*
+ * Block until it is worth retrying dmu_tx_assign() on this unassigned tx.
+ * What is waited for depends on why the last attempt failed:
+ *   - pool in I/O failure state, or no txg was ever tried: wait for the
+ *     next txg to sync;
+ *   - a dnode was still assigned to the previous txg: wait for that
+ *     dnode's tx holds to drain;
+ *   - otherwise: wait for the txg after the last one tried to open.
+ */
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+	spa_t *spa = tx->tx_pool->dp_spa;
+	dnode_t *busy;
+
+	ASSERT(tx->tx_txg == 0);
+
+	/*
+	 * It's possible that the pool has become active after this thread
+	 * has tried to obtain a tx. If that's the case then his
+	 * tx_lasttried_txg would not have been assigned.
+	 */
+	if (spa_state(spa) == POOL_STATE_IO_FAILURE ||
+	    tx->tx_lasttried_txg == 0) {
+		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+		return;
+	}
+
+	if (tx->tx_needassign_txh == NULL) {
+		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
+		return;
+	}
+
+	busy = tx->tx_needassign_txh->txh_dnode;
+
+	mutex_enter(&busy->dn_mtx);
+	while (busy->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+		cv_wait(&busy->dn_notxholds, &busy->dn_mtx);
+	mutex_exit(&busy->dn_mtx);
+	tx->tx_needassign_txh = NULL;
+}
+
+/*
+ * Debug-only bookkeeping of space consumed (delta > 0) or released
+ * (delta < 0) under this tx.  Positive deltas are asserted not to push
+ * the running total past the tx's write estimate.  Compiles to nothing
+ * without ZFS_DEBUG, and does nothing for a tx without a dsl_dir.
+ */
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+#ifdef ZFS_DEBUG
+	if (tx->tx_dir != NULL && delta != 0) {
+		if (delta < 0) {
+			(void) refcount_add_many(&tx->tx_space_freed,
+			    -delta, NULL);
+		} else {
+			ASSERT3U(refcount_count(&tx->tx_space_written) + delta,
+			    <=, tx->tx_space_towrite);
+			(void) refcount_add_many(&tx->tx_space_written,
+			    delta, NULL);
+		}
+	}
+#endif
+}
+
+/*
+ * Finish an assigned tx: release every hold (dropping the dnode's tx hold
+ * and its reference), clear any temporary space reservation, release the
+ * txg handle unless this is a tx_anyobj tx, and free the tx itself.
+ * The tx must not be used after this call.
+ */
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *hold;
+
+	ASSERT(tx->tx_txg != 0);
+
+	while ((hold = list_head(&tx->tx_holds)) != NULL) {
+		/* Grab the dnode before the hold record is freed. */
+		dnode_t *dn = hold->txh_dnode;
+
+		list_remove(&tx->tx_holds, hold);
+		kmem_free(hold, sizeof (dmu_tx_hold_t));
+
+		if (dn != NULL) {
+			mutex_enter(&dn->dn_mtx);
+			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+			if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+				dn->dn_assigned_txg = 0;
+				cv_broadcast(&dn->dn_notxholds);
+			}
+			mutex_exit(&dn->dn_mtx);
+			dnode_rele(dn, tx);
+		}
+	}
+
+	if (tx->tx_tempreserve_cookie)
+		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+	if (!tx->tx_anyobj)
+		txg_rele_to_sync(&tx->tx_txgh);
+	list_destroy(&tx->tx_holds);
+#ifdef ZFS_DEBUG
+	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+/*
+ * Discard a tx that was never assigned to a txg: free every hold, drop
+ * the dnode references the holds carried, and free the tx itself.
+ * The tx must not be used after this call.
+ */
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *hold;
+
+	ASSERT(tx->tx_txg == 0);
+
+	for (hold = list_head(&tx->tx_holds); hold != NULL;
+	    hold = list_head(&tx->tx_holds)) {
+		/* Grab the dnode before the hold record is freed. */
+		dnode_t *dn = hold->txh_dnode;
+
+		list_remove(&tx->tx_holds, hold);
+		kmem_free(hold, sizeof (dmu_tx_hold_t));
+		if (dn != NULL)
+			dnode_rele(dn, tx);
+	}
+	list_destroy(&tx->tx_holds);
+#ifdef ZFS_DEBUG
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+/*
+ * Return the txg this tx is assigned to.  The tx must already be
+ * assigned (tx_txg != 0); this is asserted.
+ */
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
diff --git a/zfs/lib/libzpool/dmu_zfetch.c b/zfs/lib/libzpool/dmu_zfetch.c
new file mode 100644
index 000000000..a47a0bcba
--- /dev/null
+++ b/zfs/lib/libzpool/dmu_zfetch.c
@@ -0,0 +1,651 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dmu_zfetch.c 1.6 06/10/26 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+/* when non-zero, dmu_zfetch() returns immediately without prefetching */
+int zfs_prefetch_disable = 0;
+
+/* max # of streams per zfetch */
+uint32_t zfetch_max_streams = 8;
+/* min time (in seconds) a stream must be idle before it may be reclaimed */
+uint32_t zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+uint32_t zfetch_block_cap = 256;
+/* number of bytes in an array_read at which we stop prefetching (1MB) */
+uint64_t zfetch_array_rd_sz = 1024 * 1024;
+
+/* forward decls for static routines */
+static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
+static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are part of a co-linear pair of existing prefetch
+ * streams. If a set is found, coalesce the streams, removing one, and
+ * configure the prefetch so it looks for a strided access pattern.
+ *
+ * In other words: if we find two sequential access streams that are
+ * the same length and distance N apart, and this read is N from the
+ * last stream, then we are probably in a strided access pattern. So
+ * combine the two sequential streams into a single strided stream.
+ *
+ * Returns 1 if a pair was coalesced and a prefetch was issued on the
+ * surviving stream. Returns 0 if no co-linear streams are found, if zh is
+ * NULL, or if the zfetch write lock could not be taken without blocking.
+ */
+static int
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *z_walk;
+ zstream_t *z_comp;
+
+ /* Best effort only: never block waiting for the writer lock. */
+ if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+ return (0);
+
+ if (zh == NULL) {
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+ }
+
+ /* Examine every unordered pair (z_walk, z_comp) of streams. */
+ for (z_walk = list_head(&zf->zf_stream); z_walk;
+ z_walk = list_next(&zf->zf_stream, z_walk)) {
+ for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+ z_comp = list_next(&zf->zf_stream, z_comp)) {
+ int64_t diff;
+
+ /* Only streams whose length equals their stride qualify. */
+ if (z_walk->zst_len != z_walk->zst_stride ||
+ z_comp->zst_len != z_comp->zst_stride) {
+ continue;
+ }
+
+ /* Pattern: z_walk, z_comp, then this read, equally spaced. */
+ diff = z_comp->zst_offset - z_walk->zst_offset;
+ if (z_comp->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ /* Pattern: z_comp, z_walk, then this read, equally spaced. */
+ diff = z_walk->zst_offset - z_comp->zst_offset;
+ if (z_walk->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+ }
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+}
+
+/*
+ * Given a zstream_t, determine the bounds of the prefetch. Then call the
+ * routine that actually prefetches the individual blocks.
+ *
+ * Side effects on the stream: zst_stride is raised to at least zst_len,
+ * zst_cap is doubled (bounded by zfetch_block_cap), zst_ph_offset is
+ * advanced to where prefetching stopped, and zst_last is stamped with the
+ * current lbolt.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+ uint64_t prefetch_tail;
+ uint64_t prefetch_limit;
+ uint64_t prefetch_ofst;
+ uint64_t prefetch_len;
+ uint64_t blocks_fetched;
+
+ zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+ zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+
+ /* Resume from the further of the recorded position and one stride on. */
+ prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+ (int64_t)(zs->zst_offset + zs->zst_stride));
+ /*
+ * XXX: use a faster division method?
+ * NOTE(review): divides by zst_len; callers visible here skip streams
+ * with zst_len == 0 in dmu_zfetch_find() -- confirm for other paths.
+ */
+ prefetch_limit = zs->zst_offset + zs->zst_len +
+ (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+ while (prefetch_tail < prefetch_limit) {
+ /* zst_direction mirrors the tail around zst_offset when backward. */
+ prefetch_ofst = zs->zst_offset + zs->zst_direction *
+ (prefetch_tail - zs->zst_offset);
+
+ prefetch_len = zs->zst_len;
+
+ /*
+ * Don't prefetch beyond the end of the file, if working
+ * backwards.
+ */
+ if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+ (prefetch_ofst > prefetch_tail)) {
+ prefetch_len += prefetch_ofst;
+ prefetch_ofst = 0;
+ }
+
+ /* don't prefetch more than we're supposed to */
+ if (prefetch_len > zs->zst_len)
+ break;
+
+ blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+ prefetch_ofst, zs->zst_len);
+
+ prefetch_tail += zs->zst_stride;
+ /* stop if we've run out of stuff to prefetch */
+ if (blocks_fetched < zs->zst_len)
+ break;
+ }
+ zs->zst_ph_offset = prefetch_tail;
+ zs->zst_last = lbolt;
+}
+
+/*
+ * Prepare a zfetch structure for use with the given dnode: record the
+ * dnode, zero the counters, and create the (empty) stream list and the
+ * rwlock that protects it.  A NULL zfetch is silently ignored.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+	if (zf != NULL) {
+		zf->zf_dnode = dno;
+		zf->zf_stream_cnt = 0;
+		zf->zf_alloc_fail = 0;
+
+		list_create(&zf->zf_stream, sizeof (zstream_t),
+		    offsetof(zstream_t, zst_node));
+
+		rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+	}
+}
+
+/*
+ * Prefetch up to nblks blocks of dn starting at blkid.  The request is
+ * first trimmed by dmu_zfetch_fetchsz(); the number of blocks actually
+ * handed to dbuf_prefetch() is returned.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+	uint64_t count = dmu_zfetch_fetchsz(dn, blkid, nblks);
+	uint64_t next = blkid;
+	uint64_t remaining;
+
+	for (remaining = count; remaining != 0; remaining--)
+		dbuf_prefetch(dn, next++);
+
+	return (count);
+}
+
+/*
+ * Return the number of blocks that would be prefetched for the supplied
+ * dnode, block id and block count.  This lets a stream be updated in place
+ * first and the prefetch issued afterwards with the old values, so that
+ * later accesses to the stream do not prefetch the same data again.
+ *
+ * Returns 0 when blkid is past the dnode's last block; returns the
+ * distance to the last block (inclusive) when the request reaches it;
+ * otherwise returns nblks.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+	uint64_t trimmed;
+
+	if (blkid > dn->dn_maxblkid)
+		return (0);
+
+	/* Request stays clear of the last block: grant it as asked. */
+	if (!(blkid + nblks + 1 > dn->dn_maxblkid))
+		return (nblks);
+
+	/* Otherwise fetch from blkid through the last block. */
+	trimmed = (dn->dn_maxblkid - blkid) + 1;
+	ASSERT(blkid + trimmed - 1 <= dn->dn_maxblkid);
+
+	return (trimmed);
+}
+
+/*
+ * Given a zfetch and a zsearch structure, see if there is an associated
+ * zstream for this block read. If so, it starts a prefetch for the stream
+ * it located and returns true, otherwise it returns false.
+ *
+ * zh describes the read (zst_offset / zst_len in blocks). `prefetched'
+ * feeds the `reset' decision: a matching stream is removed instead of
+ * extended when reset ends up true.
+ *
+ * Returns 1 when the read falls inside an existing stream, or when a
+ * stream was matched, updated and prefetched. Returns 0 when zh is NULL,
+ * no stream matched, or the matched stream was reset.
+ */
+static int
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
+{
+ zstream_t *zs;
+ int64_t diff;
+ int reset = !prefetched;
+ int rc = 0;
+
+ if (zh == NULL)
+ return (0);
+
+ /*
+ * XXX: This locking strategy is a bit coarse; however, its impact has
+ * yet to be tested. If this turns out to be an issue, it can be
+ * modified in a number of different ways.
+ */
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+
+ /*
+ * Each matching case below tests its condition without the stream
+ * lock, takes zst_lock, re-tests, and restarts the scan from `top'
+ * if the stream changed in between. A successful match leaves the
+ * loop via `break' with zst_lock still held.
+ */
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ /*
+ * XXX - should this be an assert?
+ */
+ if (zs->zst_len == 0) {
+ /* bogus stream */
+ continue;
+ }
+
+ /*
+ * We hit this case when we are in a strided prefetch stream:
+ * we will read "len" blocks before "striding".
+ */
+ if (zh->zst_offset >= zs->zst_offset &&
+ zh->zst_offset < zs->zst_offset + zs->zst_len) {
+ /* already fetched */
+ rc = 1;
+ goto out;
+ }
+
+ /*
+ * This is the forward sequential read case: we increment
+ * len by one each time we hit here, so we will enter this
+ * case on every read.
+ */
+ if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+
+ reset = !prefetched && zs->zst_len > 1;
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+ zs->zst_len += zh->zst_len;
+ /* Keep the window at most zfetch_block_cap blocks long. */
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_offset += diff;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : 0;
+ }
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ /*
+ * Same as above, but reading backwards through the file.
+ */
+ } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+ /* backwards sequential access */
+
+ reset = !prefetched && zs->zst_len > 1;
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ /* Slide the window down, clamping at block 0. */
+ zs->zst_offset = zs->zst_offset > zh->zst_len ?
+ zs->zst_offset - zh->zst_len : 0;
+ zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+ zs->zst_ph_offset - zh->zst_len : 0;
+ zs->zst_len += zh->zst_len;
+
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+ zs->zst_ph_offset - diff : 0;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : zs->zst_len;
+ }
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided forward access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset += zs->zst_stride;
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided reverse access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+ zs->zst_offset - zs->zst_stride : 0;
+ zs->zst_ph_offset = (zs->zst_ph_offset >
+ (2 * zs->zst_stride)) ?
+ (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+ }
+ }
+
+ /* zs != NULL here means a stream matched and its zst_lock is held. */
+ if (zs) {
+ if (reset) {
+ zstream_t *remove = zs;
+
+ rc = 0;
+ mutex_exit(&zs->zst_lock);
+ /* Trade the reader lock for the writer lock to remove. */
+ rw_exit(&zf->zf_rwlock);
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ /*
+ * Relocate the stream, in case someone removes
+ * it while we were acquiring the WRITER lock.
+ */
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (zs == remove) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ break;
+ }
+ }
+ } else {
+ rc = 1;
+ dmu_zfetch_dofetch(zf, zs);
+ mutex_exit(&zs->zst_lock);
+ }
+ }
+out:
+ rw_exit(&zf->zf_rwlock);
+ return (rc);
+}
+
+/*
+ * Tear down the state associated with a zfetch structure: every stream on
+ * the list is unlinked and freed, then the list and the rwlock are
+ * destroyed and the dnode back-pointer is cleared.  The zfetch_t itself is
+ * not freed; that is left to the caller.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+	zstream_t *victim;
+
+	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+	/* Keep popping the head until the list is empty. */
+	while ((victim = list_head(&zf->zf_stream)) != NULL) {
+		list_remove(&zf->zf_stream, victim);
+		mutex_destroy(&victim->zst_lock);
+		kmem_free(victim, sizeof (zstream_t));
+	}
+	list_destroy(&zf->zf_stream);
+	rw_destroy(&zf->zf_rwlock);
+
+	zf->zf_dnode = NULL;
+}
+
+/*
+ * Insert the given zstream at the head of the zfetch's stream list and
+ * bump the stream count.  Another thread may already have inserted an
+ * identical stream, so the list is scanned first; if an equal stream is
+ * found nothing is inserted and 0 is returned, leaving the caller to clean
+ * up the duplicate.  Returns 1 when the stream was inserted.
+ *
+ * The caller must hold zf_rwlock as writer.
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+	zstream_t *cur;
+
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+	cur = list_head(&zf->zf_stream);
+	while (cur != NULL) {
+		if (dmu_zfetch_streams_equal(cur, zs))
+			return (0);
+		cur = list_next(&zf->zf_stream, cur);
+	}
+
+	list_insert_head(&zf->zf_stream, zs);
+	zf->zf_stream_cnt++;
+
+	return (1);
+}
+
+
+/*
+ * Walk the list of zstreams in the given zfetch, find the first one that
+ * has been idle for more than zfetch_min_sec_reap seconds, unlink it and
+ * hand it back zeroed for reuse by the caller.
+ *
+ * Returns NULL if the zfetch write lock cannot be taken without blocking,
+ * or if no stream is old enough (in which case zf_alloc_fail is bumped).
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+	zstream_t *victim;
+
+	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+		return (NULL);
+
+	victim = list_head(&zf->zf_stream);
+	while (victim != NULL &&
+	    !(((lbolt - victim->zst_last) / hz) > zfetch_min_sec_reap))
+		victim = list_next(&zf->zf_stream, victim);
+
+	if (victim == NULL) {
+		zf->zf_alloc_fail++;
+	} else {
+		dmu_zfetch_stream_remove(zf, victim);
+		mutex_destroy(&victim->zst_lock);
+		bzero(victim, sizeof (zstream_t));
+	}
+	rw_exit(&zf->zf_rwlock);
+
+	return (victim);
+}
+
+/*
+ * Given a zfetch and zstream structure, remove the zstream structure from its
+ * container in the zfetch structure. Perform the appropriate book-keeping.
+ *
+ * The caller must hold zf_rwlock as writer. The stream is only unlinked
+ * and the stream count decremented; it is not freed here.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ list_remove(&zf->zf_stream, zs);
+ zf->zf_stream_cnt--;
+}
+
+/*
+ * Return 1 if the two streams describe the same prefetch state (offset,
+ * length, stride, prefetch offset, cap and direction all match), else 0.
+ * The lock, list linkage and timestamp are not compared.
+ */
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+	return (zs1->zst_offset == zs2->zst_offset &&
+	    zs1->zst_len == zs2->zst_len &&
+	    zs1->zst_stride == zs2->zst_stride &&
+	    zs1->zst_ph_offset == zs2->zst_ph_offset &&
+	    zs1->zst_cap == zs2->zst_cap &&
+	    zs1->zst_direction == zs2->zst_direction);
+}
+
+/*
+ * This is the prefetch entry point.  It is told about a read of `size'
+ * bytes at `offset' and drives the other dmu_zfetch routines:
+ *
+ *   1. try to match the read against an existing stream;
+ *   2. failing that, try to merge two streams into a strided one;
+ *   3. failing that, start a new stream for this read, reusing an idle
+ *      stream if one can be reclaimed and allocating one otherwise
+ *      (subject to a per-file stream limit).
+ *
+ * Nothing is done when prefetch is disabled or when the dnode has no
+ * power-of-two block shift.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+{
+	zstream_t probe;
+	zstream_t *stream;
+	int inserted;
+	unsigned int blkshft;
+	uint64_t blksz;
+
+	if (zfs_prefetch_disable)
+		return;
+
+	/* files that aren't ln2 blocksz are only one block -- nothing to do */
+	blkshft = zf->zf_dnode->dn_datablkshift;
+	if (blkshft == 0)
+		return;
+
+	/* convert offset and size, into blockid and nblocks */
+	blksz = (1 << blkshft);
+
+	bzero(&probe, sizeof (zstream_t));
+	probe.zst_offset = offset >> blkshft;
+	probe.zst_len = (P2ROUNDUP(offset + size, blksz) -
+	    P2ALIGN(offset, blksz)) >> blkshft;
+
+	if (dmu_zfetch_find(zf, &probe, prefetched))
+		return;
+	if (dmu_zfetch_colinear(zf, &probe))
+		return;
+
+	stream = dmu_zfetch_stream_reclaim(zf);
+
+	/*
+	 * we still couldn't find a stream, drop the lock, and allocate
+	 * one if possible.  Otherwise, give up and go home.
+	 */
+	if (stream == NULL) {
+		uint32_t cur_streams = zf->zf_stream_cnt;
+		uint64_t maxblocks = zf->zf_dnode->dn_maxblkid;
+		uint32_t max_streams;
+
+		/* Always allow at least one stream per file. */
+		max_streams = MIN(zfetch_max_streams,
+		    (maxblocks / zfetch_block_cap));
+		if (max_streams == 0)
+			max_streams = 1;
+
+		if (cur_streams >= max_streams)
+			return;
+
+		stream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+	}
+
+	stream->zst_offset = probe.zst_offset;
+	stream->zst_len = probe.zst_len;
+	stream->zst_stride = probe.zst_len;
+	stream->zst_ph_offset = probe.zst_len + probe.zst_offset;
+	stream->zst_cap = probe.zst_len;
+	stream->zst_direction = ZFETCH_FORWARD;
+	stream->zst_last = lbolt;
+
+	mutex_init(&stream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	rw_enter(&zf->zf_rwlock, RW_WRITER);
+	inserted = dmu_zfetch_stream_insert(zf, stream);
+	rw_exit(&zf->zf_rwlock);
+
+	/* An identical stream already exists: discard ours. */
+	if (!inserted) {
+		mutex_destroy(&stream->zst_lock);
+		kmem_free(stream, sizeof (zstream_t));
+	}
+}
diff --git a/zfs/lib/libzpool/dnode.c b/zfs/lib/libzpool/dnode.c
new file mode 100644
index 000000000..3d40dc243
--- /dev/null
+++ b/zfs/lib/libzpool/dnode.c
@@ -0,0 +1,1387 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dnode.c 1.20 07/08/26 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+/* AVL comparator for the per-txg free-range trees; defined below. */
+static int free_range_compar(const void *node1, const void *node2);
+
+/* kmem cache that dnode_t structures are allocated from (see dnode_init()). */
+static kmem_cache_t *dnode_cache;
+
+/* All-zero dnode_phys_t; dnode_allocate() asserts a free dnode matches it. */
+static dnode_phys_t dnode_phys_zero;
+
+/*
+ * Defaults applied by dnode_allocate() when the caller passes 0:
+ * the data block shift and the indirect block shift, respectively.
+ */
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+/*
+ * kmem cache constructor for dnode_t (registered by dnode_init()).
+ * Zeroes the structure, then sets up its locks, hold counters, dbuf
+ * list, and the per-txg free-range trees and dirty-record lists.
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+	dnode_t *dn = arg;
+	int t;
+
+	bzero(dn, sizeof (*dn));
+
+	/* locks */
+	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	/* hold counters */
+	refcount_create(&dn->dn_holds);
+	refcount_create(&dn->dn_tx_holds);
+
+	/* one free-range tree and one dirty-record list per txg slot */
+	for (t = 0; t < TXG_SIZE; t++) {
+		avl_create(&dn->dn_ranges[t], free_range_compar,
+		    sizeof (free_range_t),
+		    offsetof(struct free_range, fr_node));
+		list_create(&dn->dn_dirty_records[t],
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
+	}
+
+	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for dnode_t: tears down everything that
+ * dnode_cons() set up (locks, hold counters, per-txg trees and lists,
+ * and the dbuf list).
+ */
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+	dnode_t *dn = arg;
+	int t;
+
+	rw_destroy(&dn->dn_struct_rwlock);
+	mutex_destroy(&dn->dn_mtx);
+	mutex_destroy(&dn->dn_dbufs_mtx);
+
+	refcount_destroy(&dn->dn_holds);
+	refcount_destroy(&dn->dn_tx_holds);
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		avl_destroy(&dn->dn_ranges[t]);
+		list_destroy(&dn->dn_dirty_records[t]);
+	}
+
+	list_destroy(&dn->dn_dbufs);
+}
+
+/*
+ * Create the kmem cache that backs every dnode_t allocation, with
+ * dnode_cons()/dnode_dest() as its constructor and destructor.
+ */
+void
+dnode_init(void)
+{
+	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0,
+	    dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Destroy the dnode_t kmem cache created by dnode_init().
+ */
+void
+dnode_fini(void)
+{
+	kmem_cache_destroy(dnode_cache);
+}
+
+
+#ifdef ZFS_DEBUG
+/*
+ * Debug-only consistency check of an in-core dnode.
+ *
+ * The cheap pointer/type assertions always run; the remaining checks
+ * run only when ZFS_DEBUG_DNODE_VERIFY is set in zfs_flags.  If the
+ * caller does not already hold dn_struct_rwlock as writer, it is taken
+ * as reader for the duration of the checks and dropped before return.
+ */
+void
+dnode_verify(dnode_t *dn)
+{
+	int drop_struct_lock = FALSE;
+
+	ASSERT(dn->dn_phys);
+	ASSERT(dn->dn_objset);
+
+	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
+	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+		return;
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+		int i;
+		ASSERT3U(dn->dn_indblkshift, >=, 0);
+		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+		if (dn->dn_datablkshift) {
+			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+		}
+		ASSERT3U(dn->dn_nlevels, <=, 30);
+		/*
+		 * DMU_OT_NUMTYPES is the count of types, not a valid type,
+		 * so the bound is strict (matching the dn_phys check above).
+		 */
+		ASSERT3U(dn->dn_type, <, DMU_OT_NUMTYPES);
+		ASSERT3U(dn->dn_nblkptr, >=, 1);
+		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		ASSERT3U(dn->dn_datablksz, ==,
+		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+		/* block pointers beyond the first share space with the bonus */
+		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+		    dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		for (i = 0; i < TXG_SIZE; i++) {
+			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+		}
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE)
+		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+	/* only the meta dnode may lack a containing dbuf */
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
+	if (dn->dn_dbuf != NULL) {
+		/* dn_phys must point at this object's slot in the dbuf */
+		ASSERT3P(dn->dn_phys, ==,
+		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+	}
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+/*
+ * Byte-swap one on-disk dnode in place.  A dnode of type DMU_OT_NONE is
+ * zeroed instead.  Otherwise the multi-byte header fields, the block
+ * pointer array (treated as 64-bit words) and, when dn_bonuslen is
+ * nonzero, the bonus area (through the bonus type's ot_byteswap
+ * callback) are swapped.
+ */
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+ * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ /* swap every 64-bit word of the dn_nblkptr block pointers */
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ size_t len = DN_MAX_BONUSLEN - off;
+ ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
+ dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+ }
+}
+
+/*
+ * Byte-swap a buffer holding an array of on-disk dnodes.  'size' is the
+ * buffer length in bytes and must be a multiple of
+ * sizeof (dnode_phys_t); each dnode is swapped with dnode_byteswap().
+ */
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+	dnode_phys_t *dnp = vbuf;
+	size_t remaining;
+
+	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+	for (remaining = size >> DNODE_SHIFT; remaining != 0; remaining--) {
+		dnode_byteswap(dnp);
+		dnp++;
+	}
+}
+
+/*
+ * AVL comparator for free_range_t nodes: orders ranges by their
+ * starting block id (fr_blkid).  Returns -1, 0 or 1.
+ */
+static int
+free_range_compar(const void *node1, const void *node2)
+{
+	const free_range_t *lhs = node1;
+	const free_range_t *rhs = node2;
+
+	if (lhs->fr_blkid == rhs->fr_blkid)
+		return (0);
+
+	return (lhs->fr_blkid < rhs->fr_blkid ? -1 : 1);
+}
+
+/*
+ * Change the bonus buffer length of a held dnode in tx.  The dnode is
+ * dirtied, and the new length is recorded both in dn_bonuslen and in
+ * this txg's dn_next_bonuslen slot; a new length of zero is recorded
+ * there as DN_ZERO_BONUSLEN.
+ */
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+	dnode_setdirty(dn, tx);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	/* the bonus area shrinks by one blkptr for each extra block pointer */
+	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+	dn->dn_bonuslen = newsize;
+	if (newsize != 0)
+		dn->dn_next_bonuslen[txgoff] = dn->dn_bonuslen;
+	else
+		dn->dn_next_bonuslen[txgoff] = DN_ZERO_BONUSLEN;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Set the in-core data block size of a dnode.  'size' is in bytes and
+ * must be a multiple of SPA_MINBLOCKSIZE within the SPA block size
+ * limits.  dn_datablkshift is set only for power-of-two sizes and is 0
+ * otherwise.
+ */
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+	int shift = 0;
+
+	ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+	/* the sector count must fit in the on-disk dn_datablkszsec field */
+	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+
+	if (ISP2(size))
+		shift = highbit(size - 1);
+
+	dn->dn_datablksz = size;
+	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+	dn->dn_datablkshift = shift;
+}
+
+/*
+ * Build the in-core dnode_t for an on-disk dnode.  Takes a dnode_t from
+ * dnode_cache, copies the cached fields out of *dnp, initializes the
+ * prefetch state, links the dnode onto os->os_dnodes and accounts for
+ * its size with arc_space_consume().
+ */
+static dnode_t *
+dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+    uint64_t object)
+{
+	dnode_t *dn;
+
+	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	(void) dnode_cons(dn, NULL, 0); /* XXX */
+
+	/* identity and backing storage */
+	dn->dn_objset = os;
+	dn->dn_object = object;
+	dn->dn_dbuf = db;
+	dn->dn_phys = dnp;
+
+	/* in-core copies of the on-disk fields */
+	if (dnp->dn_datablkszsec != 0)
+		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	dn->dn_type = dnp->dn_type;
+	dn->dn_nlevels = dnp->dn_nlevels;
+	dn->dn_indblkshift = dnp->dn_indblkshift;
+	dn->dn_nblkptr = dnp->dn_nblkptr;
+	dn->dn_maxblkid = dnp->dn_maxblkid;
+	dn->dn_bonustype = dnp->dn_bonustype;
+	dn->dn_bonuslen = dnp->dn_bonuslen;
+	dn->dn_checksum = dnp->dn_checksum;
+	dn->dn_compress = dnp->dn_compress;
+
+	dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
+	/* make the new dnode visible on the objset's dnode list */
+	mutex_enter(&os->os_lock);
+	list_insert_head(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	arc_space_consume(sizeof (dnode_t));
+
+	return (dn);
+}
+
+/*
+ * Tear down an in-core dnode: unlink it from its objset's dnode list,
+ * release the dirty-context tag, prefetch state and bonus dbuf, and
+ * return the dnode_t to dnode_cache.  Debug builds first assert that
+ * nothing is still dirty or attached.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+	objset_impl_t *objset = dn->dn_objset;
+
+#ifdef ZFS_DEBUG
+	int t;
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		ASSERT(!list_link_active(&dn->dn_dirty_link[t]));
+		ASSERT(list_head(&dn->dn_dirty_records[t]) == NULL);
+		ASSERT(avl_numnodes(&dn->dn_ranges[t]) == 0);
+	}
+	ASSERT(list_head(&dn->dn_dbufs) == NULL);
+#endif
+
+	mutex_enter(&objset->os_lock);
+	list_remove(&objset->os_dnodes, dn);
+	mutex_exit(&objset->os_lock);
+
+	if (dn->dn_dirtyctx_firstset != NULL) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+
+	dmu_zfetch_rele(&dn->dn_zfetch);
+
+	if (dn->dn_bonus != NULL) {
+		/* db_mtx is taken here and not dropped by this function */
+		mutex_enter(&dn->dn_bonus->db_mtx);
+		dbuf_evict(dn->dn_bonus);
+		dn->dn_bonus = NULL;
+	}
+
+	kmem_cache_free(dnode_cache, dn);
+	arc_space_return(sizeof (dnode_t));
+}
+
+/*
+ * Turn a free dnode into an allocated object of type 'ot' in tx.
+ *
+ * blocksize: data block size in bytes; 0 selects 1 << zfs_default_bs,
+ *	values above SPA_MAXBLOCKSIZE are clamped, and anything else is
+ *	rounded up to a multiple of SPA_MINBLOCKSIZE.
+ * ibs: indirect block shift; 0 selects zfs_default_ibs, and the result
+ *	is clamped to [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].
+ * bonustype/bonuslen: must be both "none"/0 or both set.
+ *
+ * The dnode must be completely clean (asserted below).  On return it is
+ * dirty in tx and its per-txg "next" fields carry the new geometry.
+ */
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int txgoff = tx->tx_txg & TXG_MASK;
+	int t;
+
+	/* normalize the requested data block size */
+	if (blocksize == 0) {
+		blocksize = 1 << zfs_default_bs;
+	} else if (blocksize > SPA_MAXBLOCKSIZE) {
+		blocksize = SPA_MAXBLOCKSIZE;
+	} else {
+		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+	}
+
+	/* normalize the requested indirect block shift */
+	if (ibs == 0)
+		ibs = zfs_default_ibs;
+	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+	    dn->dn_object, tx->tx_txg, blocksize, ibs);
+
+	/* the dnode must currently be free, both in core and on disk */
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+
+	/* argument sanity */
+	ASSERT(ot != DMU_OT_NONE);
+	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+
+	/* no leftover state from a previous life */
+	ASSERT3U(dn->dn_maxblkid, ==, 0);
+	ASSERT3U(dn->dn_allocated_txg, ==, 0);
+	ASSERT3U(dn->dn_assigned_txg, ==, 0);
+	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		ASSERT3U(dn->dn_next_nlevels[t], ==, 0);
+		ASSERT3U(dn->dn_next_indblkshift[t], ==, 0);
+		ASSERT3U(dn->dn_next_bonuslen[t], ==, 0);
+		ASSERT3U(dn->dn_next_blksz[t], ==, 0);
+		ASSERT(!list_link_active(&dn->dn_dirty_link[t]));
+		ASSERT3P(list_head(&dn->dn_dirty_records[t]), ==, NULL);
+		ASSERT3U(avl_numnodes(&dn->dn_ranges[t]), ==, 0);
+	}
+
+	/* fill in the in-core dnode */
+	dn->dn_type = ot;
+	dnode_setdblksz(dn, blocksize);
+	dn->dn_indblkshift = ibs;
+	dn->dn_nlevels = 1;
+	/* space not used by the bonus buffer holds extra block pointers */
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	dn->dn_dirtyctx = 0;
+
+	dn->dn_free_txg = 0;
+	if (dn->dn_dirtyctx_firstset != NULL) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+
+	dn->dn_allocated_txg = tx->tx_txg;
+
+	/*
+	 * Dirty first: dnode_setdirty() asserts that this txg's "next"
+	 * slots are still zero.
+	 */
+	dnode_setdirty(dn, tx);
+	dn->dn_next_indblkshift[txgoff] = ibs;
+	dn->dn_next_bonuslen[txgoff] = dn->dn_bonuslen;
+	dn->dn_next_blksz[txgoff] = dn->dn_datablksz;
+}
+
+/*
+ * Re-purpose an already allocated dnode as an object of type 'ot' with
+ * the given data block size and bonus type/length, in tx.
+ *
+ * blocksize must be a multiple of SPA_MINBLOCKSIZE within the SPA
+ * limits; bonustype/bonuslen must be both "none"/0 or both set.  The
+ * dnode must not be dirty in any txg on entry (asserted).
+ *
+ * If the block size or bonus description changes, all existing data is
+ * freed.  An existing first block is resized to the new block size.
+ * Shrinking dn_nblkptr is not supported (asserted).
+ */
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int i, old_nblkptr;
+	dmu_buf_impl_t *db = NULL;
+
+	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+	ASSERT(tx->tx_txg != 0);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+
+	for (i = 0; i < TXG_SIZE; i++)
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+
+	/* clean up any unreferenced dbufs */
+	dnode_evict_dbufs(dn);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	/*
+	 * XXX I should really have a generation number to tell if we
+	 * need to do this...
+	 */
+	if (blocksize != dn->dn_datablksz ||
+	    dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
+		/* free all old data */
+		dnode_free_range(dn, 0, -1ULL, tx);
+	}
+
+	/* change blocksize */
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	if (blocksize != dn->dn_datablksz &&
+	    (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+	    list_head(&dn->dn_dbufs) != NULL)) {
+		/* block 0 exists (on disk or in core): resize it */
+		db = dbuf_hold(dn, 0, FTAG);
+		dbuf_new_size(db, blocksize, tx);
+	}
+	dnode_setdblksz(dn, blocksize);
+	dnode_setdirty(dn, tx);
+	dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+	rw_exit(&dn->dn_struct_rwlock);
+	if (db)
+		dbuf_rele(db, FTAG);
+
+	/* change type */
+	dn->dn_type = ot;
+
+	/* change bonus size and type */
+	mutex_enter(&dn->dn_mtx);
+	old_nblkptr = dn->dn_nblkptr;
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+	/* XXX - for now, we can't make nblkptr smaller */
+	ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr);
+
+	/*
+	 * Fix up the bonus db_size if dn_nblkptr has changed.  The test
+	 * compares block pointer counts; comparing dn_bonuslen (a byte
+	 * length) against old_nblkptr would mix unrelated quantities.
+	 */
+	if (dn->dn_bonus && dn->dn_nblkptr != old_nblkptr) {
+		dn->dn_bonus->db.db_size =
+		    DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+	}
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Destroy a dnode that was opened with dnode_special_open().
+ */
+void
+dnode_special_close(dnode_t *dn)
+{
+	/*
+	 * Wait for the final references to the dnode to clear.  Holds
+	 * can still exist here only if the arc is asynchronously
+	 * evicting state that holds this dnode while we are trying to
+	 * evict the dnode itself.
+	 */
+	while (refcount_count(&dn->dn_holds) > 0) {
+		delay(1);
+	}
+
+	dnode_destroy(dn);
+}
+
+/*
+ * Create the in-core dnode for an on-disk dnode that has no containing
+ * dbuf (dnode_create() is called with a NULL db).  Release it with
+ * dnode_special_close().
+ */
+dnode_t *
+dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+{
+	dnode_t *special;
+
+	special = dnode_create(os, dnp, NULL, object);
+	DNODE_VERIFY(special);
+
+	return (special);
+}
+
+/*
+ * Eviction callback registered on a dnode-block dbuf through
+ * dmu_buf_set_user() in dnode_hold_impl().  'arg' is the array of
+ * in-core dnodes instantiated from that block, one slot per dnode in
+ * the block.  Every instantiated dnode is destroyed, then the array
+ * itself is freed.
+ */
+static void
+dnode_buf_pageout(dmu_buf_t *db, void *arg)
+{
+	dnode_t **children_dnodes = arg;
+	int i;
+	int epb = db->db_size >> DNODE_SHIFT;
+
+	for (i = 0; i < epb; i++) {
+		dnode_t *dn = children_dnodes[i];
+
+		if (dn == NULL)
+			continue;
+#ifdef ZFS_DEBUG
+		{
+			/* scoped here so non-debug builds have no unused local */
+			int n;
+
+			/*
+			 * If there are holds on this dnode, then there
+			 * should be holds on the dnode's containing dbuf
+			 * as well; thus it wouldn't be eligible for
+			 * eviction and this function would not have been
+			 * called.
+			 */
+			ASSERT(refcount_is_zero(&dn->dn_holds));
+			ASSERT(list_head(&dn->dn_dbufs) == NULL);
+			ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+			for (n = 0; n < TXG_SIZE; n++)
+				ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
+		}
+#endif
+		children_dnodes[i] = NULL;
+		dnode_destroy(dn);
+	}
+	kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+}
+
+/*
+ * Look up object number 'object' in objset 'os', instantiate its
+ * in-core dnode if necessary, and return it held under 'tag' in *dnp.
+ *
+ * flag may contain DNODE_MUST_BE_ALLOCATED and/or DNODE_MUST_BE_FREE.
+ *
+ * errors:
+ * EINVAL - invalid object number.
+ * EIO - i/o error.
+ * ENOENT - the dnode is being freed or fails the flag check, and its
+ *	type is DMU_OT_NONE.
+ * EEXIST - the dnode is being freed or fails the flag check, and its
+ *	type is not DMU_OT_NONE.
+ * Errors from dbuf_read() are passed through unchanged.
+ * With flag == 0 this succeeds even for free dnodes.
+ */
+int
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+    void *tag, dnode_t **dnp)
+{
+	int epb, idx, err;
+	int drop_struct_lock = FALSE;
+	int type;
+	uint64_t blk;
+	dnode_t *mdn, *dn;
+	dmu_buf_impl_t *db;
+	dnode_t **children_dnodes;
+
+	if (object == 0 || object >= DN_MAX_OBJECT)
+		return (EINVAL);
+
+	mdn = os->os_meta_dnode;
+
+	DNODE_VERIFY(mdn);
+
+	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	/* find and read the meta-dnode block that contains this object */
+	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+	db = dbuf_hold(mdn, blk, FTAG);
+	if (drop_struct_lock)
+		rw_exit(&mdn->dn_struct_rwlock);
+	if (db == NULL)
+		return (EIO);
+	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+	if (err) {
+		dbuf_rele(db, FTAG);
+		return (err);
+	}
+
+	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+	epb = db->db.db_size >> DNODE_SHIFT;
+
+	idx = object & (epb-1);
+
+	/*
+	 * The dbuf's user data is the array of in-core dnodes for this
+	 * block.  Create it on first use; if another thread installed
+	 * one first, discard ours and use theirs.
+	 */
+	children_dnodes = dmu_buf_get_user(&db->db);
+	if (children_dnodes == NULL) {
+		dnode_t **winner;
+		children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
+		    KM_SLEEP);
+		winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+		    dnode_buf_pageout);
+		if (winner != NULL) {
+			kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+			children_dnodes = winner;
+		}
+	}
+
+	/* same race for the individual dnode slot, resolved with a CAS */
+	if ((dn = children_dnodes[idx]) == NULL) {
+		dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+		dnode_t *winner;
+
+		dn = dnode_create(os, dnp, db, object);
+		winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+		if (winner != NULL) {
+			dnode_destroy(dn);
+			dn = winner;
+		}
+	}
+
+	mutex_enter(&dn->dn_mtx);
+	type = dn->dn_type;
+	if (dn->dn_free_txg ||
+	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+	    ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
+		mutex_exit(&dn->dn_mtx);
+		dbuf_rele(db, FTAG);
+		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	/* the first hold on the dnode also takes a hold on its dbuf */
+	if (refcount_add(&dn->dn_holds, tag) == 1)
+		dbuf_add_ref(db, dn);
+
+	DNODE_VERIFY(dn);
+	ASSERT3P(dn->dn_dbuf, ==, db);
+	ASSERT3U(dn->dn_object, ==, object);
+	dbuf_rele(db, FTAG);
+
+	*dnp = dn;
+	return (0);
+}
+
+/*
+ * Hold an allocated object.  On success returns 0 and stores the held
+ * dnode in *dnp; otherwise returns the error from dnode_hold_impl()
+ * called with DNODE_MUST_BE_ALLOCATED.
+ */
+int
+dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+	int err;
+
+	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp);
+
+	return (err);
+}
+
+/*
+ * Add a hold under 'tag' to a dnode that is already held.  A dnode
+ * with no existing holds cannot be referenced this way: in that case
+ * nothing is changed and FALSE is returned.  Returns TRUE once the new
+ * hold has been added.
+ */
+boolean_t
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+	boolean_t added = FALSE;
+
+	mutex_enter(&dn->dn_mtx);
+	if (!refcount_is_zero(&dn->dn_holds)) {
+		/* there was at least one hold, so the new count exceeds 1 */
+		VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+		added = TRUE;
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	return (added);
+}
+
+/*
+ * Drop the hold that 'tag' has on the dnode.  When the last hold goes
+ * away, the dnode's hold on its containing dbuf is released as well.
+ */
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+	uint64_t remaining;
+
+	mutex_enter(&dn->dn_mtx);
+	remaining = refcount_remove(&dn->dn_holds, tag);
+	mutex_exit(&dn->dn_mtx);
+
+	if (remaining != 0)
+		return;
+
+	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+	if (dn->dn_dbuf)
+		dbuf_rele(dn->dn_dbuf, dn);
+}
+
+/*
+ * Mark the dnode dirty in tx's txg.  This is a no-op for the meta dnode
+ * and for a dnode already on a dirty/free list for this txg.
+ * Otherwise the dnode is queued on the objset's free list (if it is
+ * being freed in this txg or earlier) or dirty list, takes a "dirty
+ * hold" tagged with the txg, and its containing dbuf and dataset are
+ * dirtied too.
+ */
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = tx->tx_txg;
+	int txgoff = txg & TXG_MASK;
+	list_t *queue;
+
+	if (dn->dn_object == DMU_META_DNODE_OBJECT)
+		return;
+
+	DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+	mutex_enter(&dn->dn_mtx);
+	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+	/* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+	mutex_exit(&dn->dn_mtx);
+#endif
+
+	mutex_enter(&os->os_lock);
+
+	/* Already queued for this txg: nothing more to do. */
+	if (list_link_active(&dn->dn_dirty_link[txgoff])) {
+		mutex_exit(&os->os_lock);
+		return;
+	}
+
+	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(dn->dn_datablksz != 0);
+	ASSERT3U(dn->dn_next_bonuslen[txgoff], ==, 0);
+	ASSERT3U(dn->dn_next_blksz[txgoff], ==, 0);
+
+	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+	    dn->dn_object, txg);
+
+	/* pick the per-txg queue this dnode belongs on */
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg)
+		queue = &os->os_free_dnodes[txgoff];
+	else
+		queue = &os->os_dirty_dnodes[txgoff];
+	list_insert_tail(queue, dn);
+
+	mutex_exit(&os->os_lock);
+
+	/*
+	 * A dnode keeps a hold on its containing dbuf for as long as
+	 * the dnode itself is held, and every instantiated child dbuf
+	 * holds the dnode.  Once the last child lets go, the dnode
+	 * would release its containing dbuf.  The "dirty hold" taken
+	 * here keeps the dnode around after its children have been
+	 * processed.
+	 */
+	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
+
+	(void) dbuf_dirty(dn->dn_dbuf, tx);
+
+	dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+/*
+ * Mark the dnode as being freed in tx's txg.  Nothing happens if the
+ * dnode is unallocated or already being freed.  Otherwise dn_free_txg
+ * is set and the dnode ends up on the objset's free list for this txg.
+ */
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	objset_impl_t *os;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	int already_dirty;
+
+	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+	/* we should be the only holder... hopefully */
+	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+		mutex_exit(&dn->dn_mtx);
+		return;
+	}
+	dn->dn_free_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * A dnode that is already dirty in this txg sits on the dirty
+	 * list and has to be moved to the free list.  Otherwise
+	 * dnode_setdirty() queues it, and with dn_free_txg now set it
+	 * picks the free list.
+	 */
+	os = dn->dn_objset;
+	mutex_enter(&os->os_lock);
+	already_dirty = list_link_active(&dn->dn_dirty_link[txgoff]);
+	if (already_dirty) {
+		list_remove(&os->os_dirty_dnodes[txgoff], dn);
+		list_insert_tail(&os->os_free_dnodes[txgoff], dn);
+	}
+	mutex_exit(&os->os_lock);
+
+	if (!already_dirty)
+		dnode_setdirty(dn, tx);
+}
+
+/*
+ * Try to change the block size for the indicated dnode. This can only
+ * succeed if there are no blocks allocated or dirty beyond first block
+ *
+ * size: requested data block size in bytes; 0 means SPA_MINBLOCKSIZE,
+ * larger values are clamped to SPA_MAXBLOCKSIZE or rounded up to a
+ * multiple of SPA_MINBLOCKSIZE.
+ * ibs: requested indirect block shift; a value equal to the current
+ * shift is treated as "no change" (0).
+ *
+ * Returns 0 on success (including when nothing needs to change) and
+ * ENOTSUP when the dnode has blocks beyond the first, or when ibs
+ * would change on a dnode with more than one level.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ int have_db0 = FALSE;
+
+ if (size == 0)
+ size = SPA_MINBLOCKSIZE;
+ if (size > SPA_MAXBLOCKSIZE)
+ size = SPA_MAXBLOCKSIZE;
+ else
+ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+ if (ibs == dn->dn_indblkshift)
+ ibs = 0;
+
+ /* nothing to change */
+ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ return (0);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* Check for any allocated blocks beyond the first */
+ if (dn->dn_phys->dn_maxblkid != 0)
+ goto fail;
+
+ /* Check for any in-core dbufs other than block 0 and the bonus */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+
+ if (db->db_blkid == 0) {
+ have_db0 = TRUE;
+ } else if (db->db_blkid != DB_BONUS_BLKID) {
+ mutex_exit(&dn->dn_dbufs_mtx);
+ goto fail;
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (ibs && dn->dn_nlevels != 1)
+ goto fail;
+
+ db = NULL;
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
+ /* obtain the old block */
+ db = dbuf_hold(dn, 0, FTAG);
+ dbuf_new_size(db, size, tx);
+ }
+
+ dnode_setdblksz(dn, size);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (ibs) {
+ dn->dn_indblkshift = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+
+fail:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ENOTSUP);
+}
+
+/*
+ * Note that block 'blkid' is now in use by this dnode. If it is beyond
+ * the current dn_maxblkid, raise dn_maxblkid and, when the new maximum
+ * needs more levels of indirection than the dnode has, grow dn_nlevels
+ * and move this txg's dirty records underneath the newly dirtied
+ * top-level indirect block.
+ *
+ * dn_struct_rwlock is taken as writer here unless the caller already
+ * holds it as writer.
+ */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int drop_struct_lock = FALSE;
+ int epbs, new_nlevels;
+ uint64_t sz;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (blkid <= dn->dn_maxblkid)
+ goto out;
+
+ dn->dn_maxblkid = blkid;
+
+ /*
+ * Compute the number of levels necessary to support the new maxblkid.
+ */
+ new_nlevels = 1;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ /* the "sz >= dn->dn_nblkptr" test stops the loop if sz wraps */
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+ new_nlevels++;
+
+ if (new_nlevels > dn->dn_nlevels) {
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ /* fetch the successor first: dr may be unlinked below */
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+out:
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Remove the block range [blkid, blkid + nblks) from the free-range
+ * tree of tx's txg. Existing entries that overlap the range are
+ * deleted, trimmed at either end, or split in two, as needed.
+ * The caller must hold dn_mtx (asserted).
+ */
+void
+dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+ avl_index_t where;
+ free_range_t *rp;
+ free_range_t rp_tofind;
+ uint64_t endblk = blkid + nblks;
+
+ ASSERT(MUTEX_HELD(&dn->dn_mtx));
+ ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
+
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ /*
+ * Start at the entry beginning at blkid or, failing that, at its
+ * nearest neighbour (preferring the one before).
+ */
+ rp_tofind.fr_blkid = blkid;
+ rp = avl_find(tree, &rp_tofind, &where);
+ if (rp == NULL)
+ rp = avl_nearest(tree, where, AVL_BEFORE);
+ if (rp == NULL)
+ rp = avl_nearest(tree, where, AVL_AFTER);
+
+ while (rp && (rp->fr_blkid <= blkid + nblks)) {
+ uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
+ /* fetch the successor first: rp may be freed below */
+ free_range_t *nrp = AVL_NEXT(tree, rp);
+
+ if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
+ /* clear this entire range */
+ avl_remove(tree, rp);
+ kmem_free(rp, sizeof (free_range_t));
+ } else if (blkid <= rp->fr_blkid &&
+ endblk > rp->fr_blkid && endblk < fr_endblk) {
+ /* clear the beginning of this range */
+ rp->fr_blkid = endblk;
+ rp->fr_nblks = fr_endblk - endblk;
+ } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
+ endblk >= fr_endblk) {
+ /* clear the end of this range */
+ rp->fr_nblks = blkid - rp->fr_blkid;
+ } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
+ /* clear a chunk out of this range */
+ free_range_t *new_rp =
+ kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+
+ new_rp->fr_blkid = endblk;
+ new_rp->fr_nblks = fr_endblk - endblk;
+ avl_insert_here(tree, new_rp, rp, AVL_AFTER);
+ rp->fr_nblks = blkid - rp->fr_blkid;
+ }
+ /* there may be no overlap */
+ rp = nrp;
+ }
+}
+
+/*
+ * Free the byte range [off, off + len) of this dnode's data in tx.
+ * A len of -1ULL means "through the end of the object" (truncate).
+ *
+ * Partial blocks at the head and tail of the range are zeroed in
+ * place, but only if the block is already dirty or has a non-hole
+ * block pointer. Whole blocks are recorded in this txg's dn_ranges
+ * tree and passed to dbuf_free_range(); per the comment below, the
+ * free is finished in the syncing phase.
+ *
+ * dn_struct_rwlock is held as writer throughout, except that it is
+ * dropped and retaken around the dbuf_will_dirty() calls for the
+ * partial head and tail blocks.
+ */
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint64_t blkoff, blkid, nblks;
+ int blksz, head;
+ int trunc = FALSE;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ blksz = dn->dn_datablksz;
+
+ /* If the range is past the end of the file, this is a no-op */
+ if (off >= blksz * (dn->dn_maxblkid+1))
+ goto out;
+ if (len == -1ULL) {
+ len = UINT64_MAX - off;
+ trunc = TRUE;
+ }
+
+ /*
+ * First, block align the region to free:
+ */
+ if (ISP2(blksz)) {
+ /* head = bytes from off up to the next block boundary */
+ head = P2NPHASE(off, blksz);
+ blkoff = P2PHASE(off, blksz);
+ } else {
+ ASSERT(dn->dn_maxblkid == 0);
+ if (off == 0 && len >= blksz) {
+ /* Freeing the whole block; don't do any head. */
+ head = 0;
+ } else {
+ /* Freeing part of the block. */
+ head = blksz - off;
+ ASSERT3U(head, >, 0);
+ }
+ blkoff = off;
+ }
+ /* zero out any partial block data at the start of the range */
+ if (head) {
+ ASSERT3U(blkoff + head, ==, blksz);
+ if (len < head)
+ head = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+ FTAG, &db) == 0) {
+ caddr_t data;
+
+ /* don't dirty if it isn't on disk and isn't dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ data = db->db.db_data;
+ bzero(data + blkoff, head);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ off += head;
+ len -= head;
+ }
+
+ /* If the range was less than one block, we're done */
+ if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+ goto out;
+
+ if (!ISP2(blksz)) {
+ /*
+ * They are freeing the whole block of a
+ * non-power-of-two blocksize file. Skip all the messy
+ * math.
+ */
+ ASSERT3U(off, ==, 0);
+ ASSERT3U(len, >=, blksz);
+ blkid = 0;
+ nblks = 1;
+ } else {
+ int tail;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int blkshift = dn->dn_datablkshift;
+
+ /* If the remaining range is past end of file, we're done */
+ if (off > dn->dn_maxblkid << blkshift)
+ goto out;
+
+ /* a range running to UINT64_MAX (truncate) has no tail */
+ if (off + len == UINT64_MAX)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+ TRUE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr &&
+ !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock,
+ RW_WRITER);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ goto out;
+
+ /* dirty the left indirects */
+ if (dn->dn_nlevels > 1 && off != 0) {
+ db = dbuf_hold_level(dn, 1,
+ (off - head) >> (blkshift + epbs), FTAG);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+
+ /* dirty the right indirects */
+ if (dn->dn_nlevels > 1 && !trunc) {
+ db = dbuf_hold_level(dn, 1,
+ (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+
+ /*
+ * Finally, add this range to the dnode range list, we
+ * will finish up this free operation in the syncing phase.
+ */
+ ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
+ ASSERT(off + len == UINT64_MAX ||
+ IS_P2ALIGNED(len, 1<<blkshift));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+
+ /* a truncate lowers the in-core maximum block id */
+ if (trunc)
+ dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ /* drop any overlapping entries so the insert below cannot collide */
+ dnode_clear_range(dn, blkid, nblks, tx);
+ {
+ free_range_t *rp, *found;
+ avl_index_t where;
+ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+
+ /* Add new range to dn_ranges */
+ rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+ rp->fr_blkid = blkid;
+ rp->fr_nblks = nblks;
+ found = avl_find(tree, rp, &where);
+ ASSERT(found == NULL);
+ avl_insert(tree, rp, where);
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ dbuf_free_range(dn, blkid, nblks, tx);
+ dnode_setdirty(dn, tx);
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Report whether block 'blkid' of this dnode was freed in a recent txg.
+ * Returns nonzero if the whole dnode is being freed or if blkid falls
+ * inside a range recorded in any of the per-txg free-range trees, and
+ * zero otherwise (always zero for the bonus block, or while the pool
+ * is still being opened).
+ */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+	free_range_t key;
+	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+	int freed = FALSE;
+	int t;
+
+	if (blkid == DB_BONUS_BLKID)
+		return (FALSE);
+
+	/*
+	 * While the pool is being opened dp is not set yet, but nothing
+	 * should be dirty at that point either.
+	 */
+	if (dp == NULL)
+		return (FALSE);
+
+	if (dn->dn_free_txg)
+		return (TRUE);
+
+	/*
+	 * Without dn_datablkshift the object has a single block, so no
+	 * free range will ever exist and the search below is harmless.
+	 */
+	key.fr_blkid = blkid;
+	mutex_enter(&dn->dn_mtx);
+	for (t = 0; t < TXG_SIZE && !freed; t++) {
+		avl_tree_t *tree = &dn->dn_ranges[t];
+		free_range_t *hit;
+		avl_index_t where;
+
+		/* a range starting exactly at blkid */
+		hit = avl_find(tree, &key, &where);
+		if (hit != NULL) {
+			ASSERT(hit->fr_nblks > 0);
+			freed = TRUE;
+			continue;
+		}
+
+		/* otherwise the preceding range may still cover blkid */
+		hit = avl_nearest(tree, where, AVL_BEFORE);
+		if (hit != NULL && hit->fr_blkid + hit->fr_nblks > blkid)
+			freed = TRUE;
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	return (freed);
+}
+
+/*
+ * Called from syncing context when space is actually written or freed
+ * for this dnode.  Applies 'delta' (in bytes, may be negative) to the
+ * on-disk dn_used accounting.  Pools older than SPA_VERSION_DNODE_BYTES
+ * store dn_used in DEV_BSHIFT-sized units; newer pools store bytes and
+ * set DNODE_FLAG_USED_BYTES.
+ */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+	dnode_phys_t *dnp;
+	uint64_t used;
+
+	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+	    dn, dn->dn_phys,
+	    (u_longlong_t)dn->dn_phys->dn_used,
+	    (longlong_t)delta);
+
+	mutex_enter(&dn->dn_mtx);
+	dnp = dn->dn_phys;
+	used = DN_USED_BYTES(dnp);
+	if (delta <= 0) {
+		ASSERT3U(used, >=, -delta); /* no underflow */
+	} else {
+		ASSERT3U(used + delta, >=, used); /* no overflow */
+	}
+	used += delta;
+
+	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_DNODE_BYTES) {
+		dnp->dn_used = used;
+		dnp->dn_flags |= DNODE_FLAG_USED_BYTES;
+	} else {
+		ASSERT((dnp->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+		ASSERT3U(P2PHASE(used, 1<<DEV_BSHIFT), ==, 0);
+		dnp->dn_used = used >> DEV_BSHIFT;
+	}
+	mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Announce, in open context, how much space we expect to write or free
+ * for this dnode.  The estimate must be conservative: it is fine to end
+ * up writing less or freeing more than stated, but never to write more
+ * or free less.
+ *
+ * Positive amounts are first passed through spa_get_asize().  The
+ * result is charged to the dataset's dsl_dir (when the objset has a
+ * dataset) and then to the transaction.
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+ objset_impl_t *objset = dn->dn_objset;
+ dsl_dataset_t *dataset = objset->os_dsl_dataset;
+ int64_t charge = space;
+
+ if (charge > 0)
+ charge = spa_get_asize(objset->os_spa, charge);
+
+ if (dataset != NULL)
+ dsl_dir_willuse_space(dataset->ds_dir, charge, tx);
+
+ dmu_tx_willuse_space(tx, charge);
+}
+
+/*
+ * Search one level of the dnode's block tree for the next hole or data
+ * at or after *offset.  *offset is advanced past every entry that is
+ * examined and rejected.
+ *
+ * dn - dnode being searched; dbuf_read() is passed DB_RF_HAVESTRUCT, so
+ * the caller is expected to hold dn_struct_rwlock
+ * hole - TRUE to look for a hole, FALSE to look for data
+ * offset - in/out byte offset of the search position
+ * lvl - level to examine; lvl == dn_nlevels means the block pointers
+ * embedded in the dnode itself
+ * blkfill - number of items expected in an L0 block
+ * txg - if nonzero, only entries born after this txg are considered
+ *
+ * Returns 0 when a match was found at this level (also when a hole is
+ * wanted and the block does not exist), ESRCH when this level holds no
+ * match, or the error from dbuf_hold_impl()/dbuf_read().
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+ int lvl, uint64_t blkfill, uint64_t txg)
+{
+ dmu_buf_impl_t *db = NULL;
+ void *data = NULL;
+ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t epb = 1ULL << epbs;
+ uint64_t minfill, maxfill;
+ int i, error, span;
+
+ dprintf("probing object %llu offset %llx level %d of %u\n",
+ dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+ if (lvl == dn->dn_phys->dn_nlevels) {
+ /* top of the tree: scan the blkptrs embedded in the dnode */
+ error = 0;
+ epb = dn->dn_phys->dn_nblkptr;
+ data = dn->dn_phys->dn_blkptr;
+ } else {
+ uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ if (error) {
+ /* a block that does not exist is a hole, never data */
+ if (error == ENOENT)
+ return (hole ? 0 : ESRCH);
+ return (error);
+ }
+ error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+ if (error) {
+ dbuf_rele(db, FTAG);
+ return (error);
+ }
+ data = db->db.db_data;
+ }
+
+ if (db && txg &&
+ (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+ /* the whole block is no newer than txg: nothing to find */
+ error = ESRCH;
+ } else if (lvl == 0) {
+ dnode_phys_t *dnp = data;
+ span = DNODE_SHIFT;
+ ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+ for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+ boolean_t newcontents = B_TRUE;
+ if (txg) {
+ int j;
+ newcontents = B_FALSE;
+ for (j = 0; j < dnp[i].dn_nblkptr; j++) {
+ if (dnp[i].dn_blkptr[j].blk_birth > txg) {
+ /* one newer blkptr is enough */
+ newcontents = B_TRUE;
+ break;
+ }
+ }
+ }
+ /*
+ * "Slot is unused" (dn_type == 0) must equal 'hole':
+ * an unused slot satisfies a hole search, a used slot
+ * satisfies a data search.  The parentheses make the
+ * intended grouping explicit.
+ */
+ if ((!dnp[i].dn_type) == hole && newcontents)
+ break;
+ *offset += 1ULL << span;
+ }
+ if (i == blkfill)
+ error = ESRCH;
+ } else {
+ blkptr_t *bp = data;
+ span = (lvl - 1) * epbs + dn->dn_datablkshift;
+ minfill = 0;
+ maxfill = blkfill << ((lvl - 1) * epbs);
+
+ /*
+ * A hole search accepts anything that is not completely
+ * full; a data search accepts anything that is not empty.
+ */
+ if (hole)
+ maxfill--;
+ else
+ minfill++;
+
+ for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+ i < epb; i++) {
+ if (bp[i].blk_fill >= minfill &&
+ bp[i].blk_fill <= maxfill &&
+ bp[i].blk_birth > txg)
+ break;
+ *offset += 1ULL << span;
+ }
+ if (i >= epb)
+ error = ESRCH;
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, hole, offset, 1, 1, 0);
+ * Finds the next hole/data in a file.
+ * Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * Finds the next free/allocated dnode in an objset's meta-dnode.
+ * Only finds objects that have new contents since txg (ie.
+ * bonus buffer changes and content removal are ignored).
+ * Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ * Used in dmu_object_alloc().
+ *
+ * Parameters:
+ * dn - dnode to search; dn_struct_rwlock is taken as reader here
+ * hole - TRUE to search for a hole, FALSE to search for data
+ * offset - in/out byte offset; updated as the search advances
+ * minlvl - lowest level of the block tree to examine
+ * blkfill - see above
+ * txg - passed through to dnode_next_offset_level()
+ *
+ * The search starts at minlvl and climbs one level at a time while the
+ * current level reports ESRCH, then walks back down to minlvl for as
+ * long as each level succeeds.
+ *
+ * Returns 0 on success.  Returns ESRCH if dn_nlevels is 0, if nothing
+ * was found, or if the resulting *offset ended up below the starting
+ * offset.  An object with dn_datablkshift == 0 is handled without
+ * walking the tree: offsets inside dn_datablksz succeed (a hole search
+ * moves *offset to dn_datablksz), offsets beyond it return ESRCH.  Any
+ * other error comes from dnode_next_offset_level().
+ */
+int
+dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+ int minlvl, uint64_t blkfill, uint64_t txg)
+{
+ int lvl, maxlvl;
+ int error = 0;
+ uint64_t initial_offset = *offset;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ESRCH);
+ }
+
+ if (dn->dn_datablkshift == 0) {
+ if (*offset < dn->dn_datablksz) {
+ if (hole)
+ *offset = dn->dn_datablksz;
+ } else {
+ error = ESRCH;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ return (error);
+ }
+
+ maxlvl = dn->dn_phys->dn_nlevels;
+
+ for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ error = dnode_next_offset_level(dn,
+ hole, offset, lvl, blkfill, txg);
+ if (error != ESRCH)
+ break;
+ }
+
+ while (--lvl >= minlvl && error == 0) {
+ error = dnode_next_offset_level(dn,
+ hole, offset, lvl, blkfill, txg);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (error == 0 && initial_offset > *offset)
+ error = ESRCH;
+
+ return (error);
+}
diff --git a/zfs/lib/libzpool/dnode_sync.c b/zfs/lib/libzpool/dnode_sync.c
new file mode 100644
index 000000000..0fdd27ecf
--- /dev/null
+++ b/zfs/lib/libzpool/dnode_sync.c
@@ -0,0 +1,616 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dnode_sync.c 1.19 07/08/26 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+/*
+ * Raise the dnode's level count to the value recorded in
+ * dn_next_nlevels[] for this txg.  Runs with dn_struct_rwlock held as
+ * writer for its whole duration.
+ *
+ * The indirect dbuf at (level = old dn_nlevels, blkid 0) is held.  If any
+ * block pointer in the dnode is not a hole, the dnode's block pointers
+ * are copied into that dbuf.  Cached dbufs at the old top level whose
+ * parent is NULL or the dnode's own dbuf are re-parented to the new
+ * indirect dbuf, and finally the block pointers in the dnode are zeroed.
+ *
+ * NOTE(review): dbuf_find() apparently returns the child with db_mtx
+ * held, since every path that got a child drops it -- confirm.
+ */
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
+ int i;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+
+ dn->dn_phys->dn_nlevels = new_level;
+ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+ dn->dn_object, dn->dn_phys->dn_nlevels);
+
+ /* check for existing blkptrs in the dnode */
+ for (i = 0; i < nblkptr; i++)
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+ break;
+ if (i != nblkptr) {
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ /*
+ * set dbuf's parent pointers to new indirect buf; children that
+ * already have some other parent are left alone
+ */
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+
+ if (child == NULL)
+ continue;
+ ASSERT3P(child->db_dnode, ==, dn);
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
+ mutex_exit(&child->db_mtx);
+ continue;
+ }
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
+
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
+
+ mutex_exit(&child->db_mtx);
+ }
+
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Free 'num' consecutive block pointers starting at 'bp'.  Holes are
+ * skipped.  Every other block is handed to dsl_dataset_block_kill() and
+ * its block pointer is then zeroed.  The total space released is
+ * subtracted from the dnode's accounting in one dnode_diduse_space()
+ * call at the end.
+ */
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t freed = 0;
+ int idx;
+
+ dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+
+ for (idx = 0; idx < num; idx++) {
+ blkptr_t *cur = &bp[idx];
+
+ if (!BP_IS_HOLE(cur)) {
+ freed += bp_get_dasize(os->os_spa, cur);
+ ASSERT3U(freed, <=, DN_USED_BYTES(dn->dn_phys));
+ dsl_dataset_block_kill(os->os_dsl_dataset, cur,
+ dn->dn_zio, tx);
+ bzero(cur, sizeof (blkptr_t));
+ }
+ }
+ dnode_diduse_space(dn, -freed);
+}
+/*
+ * Debug-only sanity check, compiled only under ZFS_DEBUG and presumably
+ * reached through the FREE_VERIFY() macro.  For the level-0 children
+ * numbered start..end beneath the level-1 indirect dbuf 'db', panic if a
+ * child still holds non-zero data, either in its dirty record for this
+ * txg or in db_data when the child has no dirty records and is not in
+ * DB_FILL state.  Children for which dbuf_hold_impl() returns ENOENT are
+ * skipped.
+ *
+ * NOTE(review): "db->db_blkid * 1<<epbs" parses as
+ * (db->db_blkid * 1) << epbs, which is presumably the intended
+ * db_blkid << epbs.
+ */
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ int off, num;
+ int i, err, epbs;
+ uint64_t txg = tx->tx_txg;
+
+ epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ off = start - (db->db_blkid * 1<<epbs);
+ num = end - start + 1;
+
+ ASSERT3U(off, >=, 0);
+ ASSERT3U(num, >=, 0);
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+ ASSERT(db->db_blkptr != NULL);
+
+ for (i = off; i < off+num; i++) {
+ uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
+ int j;
+
+ ASSERT(db->db_level == 1);
+
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ if (err == ENOENT)
+ continue;
+ ASSERT(err == 0);
+ ASSERT(child->db_level == 0);
+ dr = child->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ ASSERT(dr == NULL || dr->dr_txg == txg);
+
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ child, i, off, num);
+ }
+ }
+ }
+
+ /*
+ * db_data better be zeroed unless it's dirty in a
+ * future txg.
+ */
+ mutex_enter(&child->db_mtx);
+ buf = child->db.db_data;
+ if (buf != NULL && child->db_state != DB_FILL &&
+ child->db_last_dirty == NULL) {
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ child, i, off, num);
+ }
+ }
+ }
+ mutex_exit(&child->db_mtx);
+
+ dbuf_rele(child, FTAG);
+ }
+}
+#endif
+/*
+ * Free the part of the block range [blkid, blkid + nblks) that lies
+ * beneath the indirect dbuf 'db', recursing through lower indirect
+ * levels.
+ *
+ * db - indirect dbuf to process; dbuf_read() and arc_release() are
+ * called on it before its block pointers are touched
+ * blkid - first level-0 block id of the range being freed
+ * nblks - number of level-0 blocks in the range
+ * trunc - nonzero when the caller is truncating, i.e. the range runs
+ * through the object's last block
+ * tx - the transaction, passed on to free_blocks()
+ *
+ * At level 1 the selected block pointers are freed directly.  At higher
+ * levels each non-hole child is held and recursed into; a child's own
+ * block pointer is freed only when the recursive call returns TRUE.
+ *
+ * Returns 'all': it starts TRUE, becomes FALSE when the range begins
+ * past db's first child or when some child could not be freed
+ * completely, and becomes 'trunc' when the range ends before db's last
+ * child.  A TRUE return tells the caller that db's own block pointer
+ * may be freed.
+ */
+static int
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ blkptr_t *bp;
+ dmu_buf_impl_t *subdb;
+ uint64_t start, end, dbstart, dbend, i;
+ int epbs, shift, err;
+ int all = TRUE;
+
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ arc_release(db->db_buf, db);
+ bp = (blkptr_t *)db->db.db_data;
+
+ epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ shift = (db->db_level - 1) * epbs;
+ dbstart = db->db_blkid << epbs;
+ start = blkid >> shift;
+ if (dbstart < start) {
+ bp += start - dbstart;
+ all = FALSE;
+ } else {
+ start = dbstart;
+ }
+ dbend = ((db->db_blkid + 1) << epbs) - 1;
+ end = (blkid + nblks - 1) >> shift;
+ if (dbend <= end)
+ end = dbend;
+ else if (all)
+ all = trunc;
+ ASSERT3U(start, <=, end);
+
+ if (db->db_level == 1) {
+ FREE_VERIFY(db, start, end, tx);
+ free_blocks(dn, bp, end-start+1, tx);
+ arc_buf_freeze(db->db_buf);
+ ASSERT(all || db->db_last_dirty);
+ return (all);
+ }
+
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (free_children(subdb, blkid, nblks, trunc, tx)) {
+ ASSERT3P(subdb->db_blkptr, ==, bp);
+ free_blocks(dn, bp, 1, tx);
+ } else {
+ all = FALSE;
+ }
+ dbuf_rele(subdb, FTAG);
+ }
+ arc_buf_freeze(db->db_buf);
+#ifdef ZFS_DEBUG
+ bp -= (end-start)+1;
+ for (i = start; i <= end; i++, bp++) {
+ if (i == start && blkid != 0)
+ continue;
+ else if (i == end && !trunc)
+ continue;
+ ASSERT3U(bp->blk_birth, ==, 0);
+ }
+#endif
+ ASSERT(all || db->db_last_dirty);
+ return (all);
+}
+
+/*
+ * free_range: Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ *
+ * A range that starts beyond dn_maxblkid is ignored.  A range with
+ * blkid + nblks > dn_maxblkid is treated as a truncation: nblks is
+ * clipped to end at dn_maxblkid and, once the blocks are freed,
+ * dn_maxblkid is lowered to blkid - 1 (or 0 when blkid is 0).
+ *
+ * With a single level the block pointers in the dnode are freed
+ * directly.  Otherwise each affected, non-hole block pointer in the dnode
+ * is held as an indirect dbuf and passed to free_children(); the block
+ * pointer itself is freed when free_children() returns TRUE.
+ *
+ * NOTE(review): the ASSERTs after truncation compare 'off', a byte
+ * offset, against dn_maxblkid, a block id -- confirm that is intended.
+ */
+static void
+dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ dmu_buf_impl_t *db;
+ int trunc, start, end, shift, i, err;
+ int dnlevel = dn->dn_phys->dn_nlevels;
+
+ if (blkid > dn->dn_phys->dn_maxblkid)
+ return;
+
+ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+ trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
+ if (trunc)
+ nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+
+ /* There are no indirect blocks in the object */
+ if (dnlevel == 1) {
+ if (blkid >= dn->dn_phys->dn_nblkptr) {
+ /* this range was never made persistent */
+ return;
+ }
+ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+ free_blocks(dn, bp + blkid, nblks, tx);
+ if (trunc) {
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, FALSE, &off,
+ 1, 1, 0) != 0);
+ }
+ return;
+ }
+
+ shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ start = blkid >> shift;
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ end = (blkid + nblks - 1) >> shift;
+ bp += start;
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (free_children(db, blkid, nblks, trunc, tx)) {
+ ASSERT3P(db->db_blkptr, ==, bp);
+ free_blocks(dn, bp, 1, tx);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ if (trunc) {
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+ }
+}
+
+/*
+ * Try to kick all the dnodes dbufs out of the cache...
+ *
+ * Each pass appends an on-stack marker to the tail of dn_dbufs and then
+ * repeatedly takes the list head, moves it to the tail and examines it,
+ * until the marker itself reaches the head.  A dbuf with no holds is
+ * cleared with dbuf_clear(); a dbuf already in DB_EVICTING state is only
+ * noted.  Either case counts as progress, and passes repeat until one
+ * makes no progress.  When a pass saw an evicting dbuf, delay(1) is
+ * called after dn_dbufs_mtx is dropped.  More than 100 passes trips an
+ * ASSERT.
+ *
+ * Finally, with dn_struct_rwlock held as writer, the bonus dbuf is
+ * evicted if it exists and has no holds.  db_mtx is entered before
+ * dbuf_evict() and not exited here -- presumably dbuf_evict() drops it.
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+ int progress;
+ int pass = 0;
+
+ do {
+ dmu_buf_impl_t *db, marker;
+ int evicting = FALSE;
+
+ progress = FALSE;
+ mutex_enter(&dn->dn_dbufs_mtx);
+ list_insert_tail(&dn->dn_dbufs, &marker);
+ db = list_head(&dn->dn_dbufs);
+ for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
+ list_remove(&dn->dn_dbufs, db);
+ list_insert_tail(&dn->dn_dbufs, db);
+ ASSERT3P(db->db_dnode, ==, dn);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_EVICTING) {
+ progress = TRUE;
+ evicting = TRUE;
+ mutex_exit(&db->db_mtx);
+ } else if (refcount_is_zero(&db->db_holds)) {
+ progress = TRUE;
+ ASSERT(!arc_released(db->db_buf));
+ dbuf_clear(db); /* exits db_mtx for us */
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+
+ }
+ list_remove(&dn->dn_dbufs, &marker);
+ /*
+ * NB: we need to drop dn_dbufs_mtx between passes so
+ * that any DB_EVICTING dbufs can make progress.
+ * Ideally, we would have some cv we could wait on, but
+ * since we don't, just wait a bit to give the other
+ * thread a chance to run.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ if (evicting)
+ delay(1);
+ pass++;
+ ASSERT(pass < 100); /* sanity check */
+ } while (progress);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Throw away every dirty record on 'list' without writing anything.
+ *
+ * Each record is unlinked from the list and detached from its dbuf
+ * (db_last_dirty cleared, db_dirtycnt decremented) with db_mtx held.
+ * A level-0 record has dbuf_unoverride() called on it; an indirect
+ * record has its dr_children list processed recursively.  The record is
+ * then freed and the dbuf hold tagged with the record's txg is released.
+ */
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+ /* explicit NULL test: the assignment in the condition is intentional */
+ while ((dr = list_head(list)) != NULL) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(db->db_last_dirty == dr);
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ mutex_exit(&db->db_mtx);
+ } else {
+ /* drop db_mtx before descending into the children */
+ mutex_exit(&db->db_mtx);
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ }
+}
+/*
+ * Syncing-context teardown of a dnode that is being freed.
+ *
+ * In order: discards the dirty records for this txg, evicts the dnode's
+ * dbufs, clears the pending "next" settings for this txg, frees every
+ * block of the object, zeroes the on-disk dnode (dirtying the containing
+ * dbuf first unless the dnode was allocated and freed in the same txg),
+ * resets the in-core type and txg fields with dn_mtx held, and finally
+ * drops the hold tagged with this txg.  The dnode must not be touched
+ * after that.
+ */
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+ dnode_evict_dbufs(dn);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * zfs_obj_to_path() also depends on this being
+ * commented out.
+ *
+ * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ */
+
+ /* Undirty next bits */
+ dn->dn_next_nlevels[txgoff] = 0;
+ dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
+
+ /* free up all the blocks in the file. */
+ dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
+ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+
+ /* ASSERT(blkptrs are zero); */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ ASSERT(dn->dn_free_txg > 0);
+ if (dn->dn_allocated_txg != dn->dn_free_txg)
+ dbuf_will_dirty(dn->dn_dbuf, tx);
+ bzero(dn->dn_phys, sizeof (dnode_phys_t));
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_type = DMU_OT_NONE;
+ dn->dn_maxblkid = 0;
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ mutex_exit(&dn->dn_mtx);
+
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ /*
+ * Now that we've released our hold, the dnode may
+ * be evicted, so we mustn't access it.
+ */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ *
+ * NOTE: The dnode is kept in memory by being dirty. Once the
+ * dirty bit is cleared, it may be evicted. Beware of this!
+ *
+ * Must be called in syncing context (asserted).  Steps, in order:
+ * 1. With dn_mtx held, bring the on-disk dnode up to date: initialize
+ * it if the dnode was allocated in this txg, apply the block size,
+ * bonus length and indirect block shift pending for this txg, and
+ * copy the current checksum and compress settings.
+ * 2. Free the ranges queued for this txg (skipped when the dnode
+ * itself is being freed by this txg), then discard the range
+ * records with dn_mtx held.
+ * 3. If dn_free_txg says the dnode is freed by this txg, hand off to
+ * dnode_sync_free() and return.
+ * 4. Apply a pending level increase, sync the dirty record list, and,
+ * except for the meta dnode, release the hold tagged with this txg.
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+ free_range_t *rp;
+ dnode_phys_t *dnp = dn->dn_phys;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ list_t *list = &dn->dn_dirty_records[txgoff];
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ DNODE_VERIFY(dn);
+
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_allocated_txg == tx->tx_txg) {
+ /* The dnode is newly allocated or reallocated */
+ if (dnp->dn_type == DMU_OT_NONE) {
+ /* this is a first alloc, not a realloc */
+ /* XXX shouldn't the phys already be zeroed? */
+ bzero(dnp, DNODE_CORE_SIZE);
+ dnp->dn_nlevels = 1;
+ }
+
+ if (dn->dn_nblkptr > dnp->dn_nblkptr) {
+ /* zero the new blkptrs we are gaining */
+ bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ sizeof (blkptr_t) *
+ (dn->dn_nblkptr - dnp->dn_nblkptr));
+ }
+ dnp->dn_type = dn->dn_type;
+ dnp->dn_bonustype = dn->dn_bonustype;
+ dnp->dn_bonuslen = dn->dn_bonuslen;
+ dnp->dn_nblkptr = dn->dn_nblkptr;
+ }
+
+ ASSERT(dnp->dn_nlevels > 1 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+ if (dn->dn_next_blksz[txgoff]) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+ SPA_MINBLOCKSIZE) == 0);
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ list_head(list) != NULL ||
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+ dnp->dn_datablkszsec);
+ dnp->dn_datablkszsec =
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonuslen[txgoff]) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
+ if (dn->dn_next_indblkshift[txgoff]) {
+ ASSERT(dnp->dn_nlevels == 1);
+ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+ dn->dn_next_indblkshift[txgoff] = 0;
+ }
+
+ /*
+ * Just take the live (open-context) values for checksum and compress.
+ * Strictly speaking it's a future leak, but nothing bad happens if we
+ * start using the new checksum or compress algorithm a little early.
+ */
+ dnp->dn_checksum = dn->dn_checksum;
+ dnp->dn_compress = dn->dn_compress;
+
+ mutex_exit(&dn->dn_mtx);
+
+ /* process all the "freed" ranges in the file */
+ if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
+ for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
+ rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
+ dnode_sync_free_range(dn,
+ rp->fr_blkid, rp->fr_nblks, tx);
+ }
+ /* grab the mutex so we don't race with dnode_block_freed() */
+ mutex_enter(&dn->dn_mtx);
+ for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+
+ free_range_t *last = rp;
+ rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
+ avl_remove(&dn->dn_ranges[txgoff], last);
+ kmem_free(last, sizeof (free_range_t));
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+ dnode_sync_free(dn, tx);
+ return;
+ }
+
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
+ dbuf_sync_list(list, tx);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
+
+ /*
+ * Although we have dropped our reference to the dnode, it
+ * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf.
+ */
+}
diff --git a/zfs/lib/libzpool/dsl_dataset.c b/zfs/lib/libzpool/dsl_dataset.c
new file mode 100644
index 000000000..88a280d67
--- /dev/null
+++ b/zfs/lib/libzpool/dsl_dataset.c
@@ -0,0 +1,2798 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dsl_dataset.c 1.42 08/04/28 SMI"
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/sunddi.h>
+
+static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
+static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
+static dsl_checkfunc_t dsl_dataset_rollback_check;
+static dsl_syncfunc_t dsl_dataset_rollback_sync;
+static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+
+#define DS_REF_MAX (1ULL << 62) /* refcnt weight of a DS_MODE_EXCLUSIVE open */
+
+#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+
+/*
+ * We use weighted reference counts to express the various forms of exclusion
+ * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
+ * is DS_REF_MAX, and a PRIMARY open is a little more than half of an
+ * EXCLUSIVE.
+ * This makes the exclusion logic simple: the total refcnt for all opens cannot
+ * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
+ * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
+ * just over half of the refcnt space, so there can't be more than one, but it
+ * can peacefully coexist with any number of STANDARD opens.
+ *
+ * The table is indexed by the open mode level (see DS_MODE_LEVEL()).
+ */
+static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
+ 0, /* DS_MODE_NONE - invalid */
+ 1, /* DS_MODE_STANDARD - unlimited number */
+ (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
+ DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
+};
+
+/*
+ * Work out how much of 'delta' should be propagated to the dsl_dir
+ * layer.  When the dataset has a refreservation, part of that space has
+ * already been accounted for in our ancestors, so only the change in
+ * MAX(unique bytes, reservation) is passed up.  Without a reservation
+ * the delta is passed up unchanged.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+ uint64_t before, after;
+ int64_t change;
+
+ if (ds->ds_reserved != 0) {
+ before = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ after = MAX(ds->ds_phys->ds_unique_bytes + delta,
+ ds->ds_reserved);
+ change = (int64_t)(after - before);
+
+ ASSERT3U(ABS(change), <=, ABS(delta));
+ return (change);
+ }
+
+ return (delta);
+}
+/*
+ * Account for a block that has just been written, in syncing context
+ * (asserted).  Holes are ignored.
+ *
+ * ds - dataset the block belongs to, or NULL for the meta-objset, in
+ * which case the space is charged to the pool's dp_mos_dir
+ * bp - block pointer of the new block
+ * tx - syncing transaction
+ *
+ * For a real dataset the used, compressed, uncompressed and unique byte
+ * counters in ds_phys are increased with ds_lock held, and the owning
+ * dsl_dir is charged with parent_delta() of the space used.
+ */
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+ int64_t delta;
+
+ dprintf_bp(bp, "born, ds=%p\n", ds);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* It could have been compressed away to nothing */
+ if (BP_IS_HOLE(bp))
+ return;
+ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+ ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+ if (ds == NULL) {
+ /*
+ * Account for the meta-objset space in its placeholder
+ * dsl_dir.
+ */
+ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ used, compressed, uncompressed, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+ return;
+ }
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_lock);
+ delta = parent_delta(ds, used);
+ ds->ds_phys->ds_used_bytes += used;
+ ds->ds_phys->ds_compressed_bytes += compressed;
+ ds->ds_phys->ds_uncompressed_bytes += uncompressed;
+ ds->ds_phys->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx);
+}
+/*
+ * Account for a block that is no longer referenced by 'ds', in syncing
+ * context (asserted).  Holes are ignored.
+ *
+ * ds - dataset giving up the block, or NULL for the meta-objset
+ * bp - block pointer being released
+ * pio - optional parent zio; arc_free() is issued with ARC_NOWAIT when
+ * it is set and ARC_WAIT when it is NULL
+ * tx - syncing transaction
+ *
+ * With ds == NULL the block is freed and its space is subtracted from
+ * the pool's dp_mos_dir.  Otherwise, a block born after the dataset's
+ * ds_prev_snap_txg is freed at once and its space comes off the
+ * dataset's unique bytes and its dsl_dir.  An older block is put on the
+ * dataset's deadlist instead; if the previous snapshot's next snapshot
+ * is this dataset and the block was born after that snapshot's own
+ * ds_prev_snap_txg, the space is added to that snapshot's unique bytes.
+ * In both dataset cases the used, compressed and uncompressed counters
+ * of ds are reduced at the end.
+ */
+void
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx)
+{
+ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* No block pointer => nothing to free */
+ if (BP_IS_HOLE(bp))
+ return;
+
+ ASSERT(used > 0);
+ if (ds == NULL) {
+ int err;
+ /*
+ * Account for the meta-objset space in its placeholder
+ * dataset.
+ */
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
+
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ -used, -compressed, -uncompressed, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+ return;
+ }
+ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+ int err;
+ int64_t delta;
+
+ dprintf_bp(bp, "freeing: %s", "");
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
+
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+ !DS_UNIQUE_IS_ACCURATE(ds));
+ delta = parent_delta(ds, -used);
+ ds->ds_phys->ds_unique_bytes -= used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir,
+ delta, -compressed, -uncompressed, tx);
+ } else {
+ dprintf_bp(bp, "putting on dead list: %s", "");
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ ds->ds_prev->ds_phys->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ }
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
+ ds->ds_phys->ds_used_bytes -= used;
+ ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
+ ds->ds_phys->ds_compressed_bytes -= compressed;
+ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
+ ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+ mutex_exit(&ds->ds_lock);
+}
+
+/*
+ * Return the txg of the most recent snapshot of 'ds', taking into
+ * account a snapshot attempt that is still pending (ds_trysnap_txg newer
+ * than the last synced txg).  Returns 0 when ds is NULL.
+ *
+ * The result is only a guess.  A pending snapshot may still fail, in
+ * which case the value is too high and callers merely overestimate the
+ * space an operation will consume, which is OK.  There is also a small
+ * window in which a pending snapshot can be missed, because the sync
+ * task could be set in the quiescing phase.
+ */
+uint64_t
+dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
+{
+ uint64_t pending_txg;
+
+ if (ds == NULL)
+ return (0);
+
+ pending_txg = (ds->ds_trysnap_txg >
+ spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) ?
+ ds->ds_trysnap_txg : 0;
+
+ return (MAX(ds->ds_phys->ds_prev_snap_txg, pending_txg));
+}
+
+/*
+ * Return nonzero if a block born in txg 'blk_birth' is newer than the
+ * value reported by dsl_dataset_prev_snap_txg() for 'ds'.
+ */
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+{
+ uint64_t snap_txg = dsl_dataset_prev_snap_txg(ds);
+
+ return (snap_txg < blk_birth);
+}
+/*
+ * Tear down the in-core dsl_dataset_t passed as 'dsv'.  The 'db'
+ * argument is unused; the signature suggests this is registered as a
+ * dbuf user-eviction callback -- confirm against the caller.
+ *
+ * Removes the fsid guid from the unique table, runs the user eviction
+ * function if a user pointer is set, closes the previous snapshot, the
+ * deadlist and the dsl_dir, destroys the three mutexes and frees the
+ * structure.
+ */
+/* ARGSUSED */
+static void
+dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+{
+ dsl_dataset_t *ds = dsv;
+
+ /* open_refcount == DS_REF_MAX when deleting */
+ ASSERT(ds->ds_open_refcount == 0 ||
+ ds->ds_open_refcount == DS_REF_MAX);
+
+ dprintf_ds(ds, "evicting %s\n", "");
+
+ unique_remove(ds->ds_fsid_guid);
+
+ if (ds->ds_user_ptr != NULL)
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+
+ if (ds->ds_prev) {
+ dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ ds->ds_prev = NULL;
+ }
+
+ bplist_close(&ds->ds_deadlist);
+ dsl_dir_close(ds->ds_dir, ds);
+
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+
+ kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+/*
+ * Fill in ds->ds_snapname if it is still empty.  Nothing is done when a
+ * name is already present or when ds_next_snap_obj is 0.  Otherwise the
+ * head dataset's bonus buffer is held and its snapshot-names ZAP object
+ * is searched for the entry whose value is this dataset's object number.
+ * Returns 0 on success or the error from dmu_bonus_hold() or
+ * zap_value_search().
+ */
+static int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+ dsl_pool_t *pool = ds->ds_dir->dd_pool;
+ objset_t *mos = pool->dp_meta_objset;
+ dsl_dataset_phys_t *head_phys;
+ dmu_buf_t *head_dbuf;
+ int error;
+
+ if (ds->ds_snapname[0] != '\0' || ds->ds_phys->ds_next_snap_obj == 0)
+ return (0);
+
+ error = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &head_dbuf);
+ if (error != 0)
+ return (error);
+
+ head_phys = head_dbuf->db_data;
+ error = zap_value_search(mos, head_phys->ds_snapnames_zapobj,
+ ds->ds_object, 0, ds->ds_snapname);
+ dmu_buf_rele(head_dbuf, FTAG);
+ return (error);
+}
+
+/*
+ * Look up snapshot 'name' in the snapshot-names ZAP object and store the
+ * single 8-byte value found in *value.  Datasets flagged
+ * DS_FLAG_CI_DATASET use MT_FIRST matching, all others MT_EXACT.  If an
+ * MT_FIRST lookup reports ENOTSUP, a plain zap_lookup() is tried
+ * instead.  Returns 0 or the error from the ZAP lookup.
+ */
+static int
+dsl_dataset_snap_lookup(objset_t *os, uint64_t flags,
+ uint64_t snapnames_zapobj, const char *name, uint64_t *value)
+{
+ matchtype_t match = (flags & DS_FLAG_CI_DATASET) ? MT_FIRST : MT_EXACT;
+ int error = zap_lookup_norm(os, snapnames_zapobj, name, 8, 1,
+ value, match, NULL, 0, NULL);
+
+ if (match == MT_FIRST && error == ENOTSUP)
+ error = zap_lookup(os, snapnames_zapobj, name, 8, 1, value);
+ return (error);
+}
+
+/*
+ * Remove snapshot 'name' from the snapshot-names ZAP object.  Datasets
+ * flagged DS_FLAG_CI_DATASET use MT_FIRST matching, all others MT_EXACT.
+ * If an MT_FIRST removal reports ENOTSUP, a plain zap_remove() is tried
+ * instead.  Returns 0 or the error from the ZAP removal.
+ */
+static int
+dsl_dataset_snap_remove(objset_t *os, uint64_t flags,
+ uint64_t snapnames_zapobj, char *name, dmu_tx_t *tx)
+{
+ matchtype_t match = (flags & DS_FLAG_CI_DATASET) ? MT_FIRST : MT_EXACT;
+ int error = zap_remove_norm(os, snapnames_zapobj, name, match, tx);
+
+ if (match == MT_FIRST && error == ENOTSUP)
+ error = zap_remove(os, snapnames_zapobj, name, tx);
+ return (error);
+}
+
+/*
+ * Open the dataset stored in MOS object 'dsobj' and return it in *dsp.
+ *
+ * The caller must hold dp_config_rwlock or be in syncing context.  A
+ * hold tagged 'tag' is taken on the dataset's bonus buffer.  If no
+ * in-core dsl_dataset_t is attached to that buffer yet, one is built
+ * here (deadlist, dsl_dir, previous snapshot, snapshot name,
+ * refreservation/refquota) and published with dmu_buf_set_user_ie();
+ * if another thread published first, ours is discarded and the
+ * winner's is used.
+ *
+ * 'snapname', when non-NULL, is copied into ds_snapname for a dataset
+ * that is not its dsl_dir's head.  'mode' selects the weight added to
+ * ds_open_refcount.
+ *
+ * Returns 0 on success.  Returns EBUSY if the weight would push the
+ * refcount above DS_REF_MAX, or if a DS_MODE_PRIMARY open of a dataset
+ * flagged DS_FLAG_INCONSISTENT does not permit inconsistency.  Any
+ * error met while constructing the dataset is returned as well, and
+ * the buffer hold is released before returning it.
+ */
+int
+dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
+    int mode, void *tag, dsl_dataset_t **dsp)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_t *ds;
+	int err;
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+
+	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+	if (err)
+		return (err);
+	ds = dmu_buf_get_user(dbuf);
+	if (ds == NULL) {
+		dsl_dataset_t *winner = NULL;
+
+		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+		ds->ds_dbuf = dbuf;
+		ds->ds_object = dsobj;
+		ds->ds_phys = dbuf->db_data;
+
+		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
+
+		err = bplist_open(&ds->ds_deadlist,
+		    mos, ds->ds_phys->ds_deadlist_obj);
+		if (err == 0) {
+			err = dsl_dir_open_obj(dp,
+			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+		}
+		if (err) {
+			/*
+			 * we don't really need to close the bplist if we
+			 * just opened it.
+			 */
+			mutex_destroy(&ds->ds_lock);
+			mutex_destroy(&ds->ds_opening_lock);
+			mutex_destroy(&ds->ds_deadlist.bpl_lock);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			dmu_buf_rele(dbuf, tag);
+			return (err);
+		}
+
+		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+			/* Head dataset: no snapshot name, but hold prev. */
+			ds->ds_snapname[0] = '\0';
+			if (ds->ds_phys->ds_prev_snap_obj) {
+				err = dsl_dataset_open_obj(dp,
+				    ds->ds_phys->ds_prev_snap_obj, NULL,
+				    DS_MODE_NONE, ds, &ds->ds_prev);
+			}
+		} else {
+			if (snapname) {
+#ifdef ZFS_DEBUG
+				dsl_dataset_phys_t *headphys;
+				dmu_buf_t *headdbuf;
+				err = dmu_bonus_hold(mos,
+				    ds->ds_dir->dd_phys->dd_head_dataset_obj,
+				    FTAG, &headdbuf);
+				if (err == 0) {
+					uint64_t foundobj;
+
+					headphys = headdbuf->db_data;
+					err = dsl_dataset_snap_lookup(
+					    dp->dp_meta_objset,
+					    headphys->ds_flags,
+					    headphys->ds_snapnames_zapobj,
+					    snapname, &foundobj);
+					/*
+					 * foundobj is only meaningful when
+					 * the lookup succeeded.
+					 */
+					if (err == 0)
+						ASSERT3U(foundobj, ==, dsobj);
+					dmu_buf_rele(headdbuf, FTAG);
+				}
+#endif
+				(void) strcat(ds->ds_snapname, snapname);
+			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+				err = dsl_dataset_get_snapname(ds);
+			}
+		}
+
+		/*
+		 * Only fetch the properties if everything so far worked;
+		 * otherwise the lookup below would overwrite (and thereby
+		 * hide) an earlier failure such as being unable to open
+		 * the previous snapshot.
+		 */
+		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
+			/*
+			 * In sync context, we're called with either no lock
+			 * or with the write lock.  If we're not syncing,
+			 * we're always called with the read lock held.
+			 */
+			boolean_t need_lock =
+			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
+			    dsl_pool_sync_context(dp);
+
+			if (need_lock)
+				rw_enter(&dp->dp_config_rwlock, RW_READER);
+
+			err = dsl_prop_get_ds_locked(ds->ds_dir,
+			    "refreservation", sizeof (uint64_t), 1,
+			    &ds->ds_reserved, NULL);
+			if (err == 0) {
+				err = dsl_prop_get_ds_locked(ds->ds_dir,
+				    "refquota", sizeof (uint64_t), 1,
+				    &ds->ds_quota, NULL);
+			}
+
+			if (need_lock)
+				rw_exit(&dp->dp_config_rwlock);
+		} else {
+			ds->ds_reserved = ds->ds_quota = 0;
+		}
+
+		if (err == 0) {
+			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+			    dsl_dataset_evict);
+		}
+		if (err || winner) {
+			/* Construction failed or we lost the race: undo. */
+			bplist_close(&ds->ds_deadlist);
+			if (ds->ds_prev) {
+				dsl_dataset_close(ds->ds_prev,
+				    DS_MODE_NONE, ds);
+			}
+			dsl_dir_close(ds->ds_dir, ds);
+			mutex_destroy(&ds->ds_lock);
+			mutex_destroy(&ds->ds_opening_lock);
+			mutex_destroy(&ds->ds_deadlist.bpl_lock);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			if (err) {
+				dmu_buf_rele(dbuf, tag);
+				return (err);
+			}
+			ds = winner;
+		} else {
+			ds->ds_fsid_guid =
+			    unique_insert(ds->ds_phys->ds_fsid_guid);
+		}
+	}
+	ASSERT3P(ds->ds_dbuf, ==, dbuf);
+	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+
+	mutex_enter(&ds->ds_lock);
+	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
+	    (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
+	    !DS_MODE_IS_INCONSISTENT(mode)) ||
+	    (ds->ds_open_refcount + weight > DS_REF_MAX)) {
+		mutex_exit(&ds->ds_lock);
+		/* No weight was added yet, so close with DS_MODE_NONE. */
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		return (EBUSY);
+	}
+	ds->ds_open_refcount += weight;
+	mutex_exit(&ds->ds_lock);
+
+	*dsp = ds;
+	return (0);
+}
+
+/*
+ * Open the dataset called 'name' (in 'spa', as resolved by
+ * dsl_dir_open_spa()) with open mode 'mode' and hold tag 'tag'.
+ *
+ * If the name leaves a tail behind its dsl_dir component, that tail
+ * must be "@snapshot": the snapshot is looked up in the head dataset's
+ * snapshot-name ZAP and may only be opened read-only (EROFS otherwise).
+ * A dsl_dir without a head dataset yields ENOENT, as does a tail that
+ * does not start with '@'.
+ *
+ * On return *dsp is the opened dataset on success and NULL on failure.
+ */
+int
+dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+    void *tag, dsl_dataset_t **dsp)
+{
+	dsl_dataset_t *ds = NULL;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp;
+	const char *tail;
+	uint64_t dsobj;
+	int err;
+
+	err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+	if (err != 0)
+		return (err);
+
+	dp = dd->dd_pool;
+	dsobj = dd->dd_phys->dd_head_dataset_obj;
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+
+	/* A dataset with no associated objset */
+	if (dsobj == 0) {
+		err = ENOENT;
+		goto out;
+	}
+
+	if (tail != NULL) {
+		objset_t *mos = dp->dp_meta_objset;
+		uint64_t head_flags;
+		uint64_t snapzap;
+
+		/* Peek at the head dataset for its flags and name ZAP. */
+		err = dsl_dataset_open_obj(dp, dsobj, NULL,
+		    DS_MODE_NONE, tag, &ds);
+		if (err != 0)
+			goto out;
+		head_flags = ds->ds_phys->ds_flags;
+		snapzap = ds->ds_phys->ds_snapnames_zapobj;
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		ds = NULL;
+
+		if (tail[0] != '@') {
+			err = ENOENT;
+			goto out;
+		}
+		tail++;
+
+		/* Look for a snapshot */
+		if (!DS_MODE_IS_READONLY(mode)) {
+			err = EROFS;
+			goto out;
+		}
+		dprintf("looking for snapshot '%s'\n", tail);
+		err = dsl_dataset_snap_lookup(mos, head_flags, snapzap,
+		    tail, &dsobj);
+		if (err != 0)
+			goto out;
+	}
+	err = dsl_dataset_open_obj(dp, dsobj, tail, mode, tag, &ds);
+
+out:
+	rw_exit(&dp->dp_config_rwlock);
+	dsl_dir_close(dd, FTAG);
+
+	ASSERT3U((err == 0), ==, (ds != NULL));
+	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
+
+	*dsp = ds;
+	return (err);
+}
+
+/*
+ * Convenience wrapper: dsl_dataset_open_spa() with a NULL spa.
+ */
+int
+dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+{
+	int error;
+
+	error = dsl_dataset_open_spa(NULL, name, mode, tag, dsp);
+	return (error);
+}
+
+/*
+ * Write the full name of 'ds' into the caller-supplied buffer 'name'.
+ * A NULL dataset is reported as "mos".  Otherwise the name is the
+ * dsl_dir name followed, if the dataset has a snapshot name, by '@'
+ * and that snapshot name.
+ */
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+	boolean_t must_lock;
+
+	if (ds == NULL) {
+		(void) strcpy(name, "mos");
+		return;
+	}
+
+	dsl_dir_name(ds->ds_dir, name);
+	VERIFY(0 == dsl_dataset_get_snapname(ds));
+	if (ds->ds_snapname[0] == '\0')
+		return;
+
+	(void) strcat(name, "@");
+
+	/*
+	 * ds_lock is only taken when this thread does not already hold
+	 * it, so that dprintf_ds() can be used with ds_lock held.
+	 */
+	must_lock = !MUTEX_HELD(&ds->ds_lock);
+	if (must_lock)
+		mutex_enter(&ds->ds_lock);
+	(void) strcat(name, ds->ds_snapname);
+	if (must_lock)
+		mutex_exit(&ds->ds_lock);
+}
+
+/*
+ * Return the length of the name that dsl_dataset_name() would produce
+ * for 'ds': 3 for a NULL dataset ("mos"), otherwise the dsl_dir name
+ * length plus, for a dataset with a snapshot name, one for the '@' and
+ * the length of the snapshot name.
+ */
+static int
+dsl_dataset_namelen(dsl_dataset_t *ds)
+{
+	boolean_t must_lock;
+	int len;
+
+	if (ds == NULL)
+		return (3);	/* "mos" */
+
+	len = dsl_dir_namelen(ds->ds_dir);
+	VERIFY(0 == dsl_dataset_get_snapname(ds));
+	if (ds->ds_snapname[0] == '\0')
+		return (len);
+
+	++len;	/* adding one for the @-sign */
+
+	/* Same locking rule as dsl_dataset_name(). */
+	must_lock = !MUTEX_HELD(&ds->ds_lock);
+	if (must_lock)
+		mutex_enter(&ds->ds_lock);
+	len += strlen(ds->ds_snapname);
+	if (must_lock)
+		mutex_exit(&ds->ds_lock);
+
+	return (len);
+}
+
+/*
+ * Undo an open of 'ds': subtract the refcount weight that corresponds
+ * to 'mode' under ds_lock, then release the bonus-buffer hold that was
+ * taken with 'tag'.
+ */
+void
+dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+{
+	const uint64_t w = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, w);
+	ds->ds_open_refcount -= w;
+	mutex_exit(&ds->ds_lock);
+
+	dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+/*
+ * Exchange the refcount weight of 'oldmode' for the (asserted to be
+ * no larger) weight of 'newmode' on an already-open dataset.
+ */
+void
+dsl_dataset_downgrade(dsl_dataset_t *ds, int oldmode, int newmode)
+{
+	const uint64_t from = ds_refcnt_weight[DS_MODE_LEVEL(oldmode)];
+	const uint64_t to = ds_refcnt_weight[DS_MODE_LEVEL(newmode)];
+
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, from);
+	ASSERT3U(from, >=, to);
+	ds->ds_open_refcount -= from;
+	ds->ds_open_refcount += to;
+	mutex_exit(&ds->ds_lock);
+}
+
+/*
+ * Try to exchange the refcount weight of 'oldmode' for the (asserted
+ * to be no smaller) weight of 'newmode'.  The exchange is refused, and
+ * B_FALSE returned, when it would take ds_open_refcount above
+ * DS_REF_MAX; otherwise the refcount is updated and B_TRUE returned.
+ */
+boolean_t
+dsl_dataset_tryupgrade(dsl_dataset_t *ds, int oldmode, int newmode)
+{
+	const uint64_t from = ds_refcnt_weight[DS_MODE_LEVEL(oldmode)];
+	const uint64_t to = ds_refcnt_weight[DS_MODE_LEVEL(newmode)];
+	boolean_t upgraded = B_FALSE;
+
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, from);
+	ASSERT3U(to, >=, from);
+	if (ds->ds_open_refcount - from + to <= DS_REF_MAX) {
+		ds->ds_open_refcount -= from;
+		ds->ds_open_refcount += to;
+		upgraded = B_TRUE;
+	}
+	mutex_exit(&ds->ds_lock);
+
+	return (upgraded);
+}
+
+/*
+ * Create the pool's root dsl_dir (its object number is returned in
+ * *ddobjp) together with that directory's head dataset, and create a
+ * DMU_OST_ZFS objset in the new dataset.  All allocations are made in
+ * the meta-objset under transaction 'tx'.
+ */
+void
+dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_phys_t *phys;
+	dsl_dataset_t *rootds;
+	dsl_dir_t *rootdd;
+	dmu_buf_t *bonus;
+	uint64_t rootobj;
+
+	dsl_dir_create_root(mos, ddobjp, tx);
+	VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &rootdd));
+
+	/* Allocate the dataset object and fill in its bonus buffer. */
+	rootobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+	VERIFY(0 == dmu_bonus_hold(mos, rootobj, FTAG, &bonus));
+	dmu_buf_will_dirty(bonus, tx);
+
+	phys = bonus->db_data;
+	phys->ds_dir_obj = rootdd->dd_object;
+	phys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&phys->ds_guid,
+	    sizeof (phys->ds_guid));
+	phys->ds_snapnames_zapobj = zap_create_norm(mos,
+	    U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	phys->ds_creation_time = gethrestime_sec();
+	phys->ds_creation_txg = tx->tx_txg;
+	phys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+	dmu_buf_rele(bonus, FTAG);
+
+	/* Point the root directory at its new head dataset. */
+	dmu_buf_will_dirty(rootdd->dd_dbuf, tx);
+	rootdd->dd_phys->dd_head_dataset_obj = rootobj;
+	dsl_dir_close(rootdd, FTAG);
+
+	/* Give the dataset an (empty) ZFS objset. */
+	VERIFY(0 == dsl_dataset_open_obj(dp, rootobj, NULL,
+	    DS_MODE_NONE, FTAG, &rootds));
+	(void) dmu_objset_create_impl(dp->dp_spa, rootds,
+	    &rootds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
+	dsl_dataset_close(rootds, DS_MODE_NONE, FTAG);
+}
+
+/*
+ * Allocate and initialise the head dataset object for dsl_dir 'dd',
+ * which must not have one yet, and return the new object number.
+ * Must be called in syncing context.
+ *
+ * With a non-NULL 'origin' the new dataset starts out referring to the
+ * origin's contents: its block pointer, space accounting and flags are
+ * taken from the origin, the origin's child count is incremented and
+ * dd's dd_origin_obj is set.  'flags' supplies the initial ds_flags.
+ */
+uint64_t
+dsl_dataset_create_sync_impl(dsl_dir_t *dd, dsl_dataset_t *origin,
+    uint64_t flags, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_phys_t *phys;
+	dmu_buf_t *bonus;
+	uint64_t newobj;
+
+	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+	newobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+	VERIFY(0 == dmu_bonus_hold(mos, newobj, FTAG, &bonus));
+	dmu_buf_will_dirty(bonus, tx);
+
+	phys = bonus->db_data;
+	phys->ds_dir_obj = dd->dd_object;
+	phys->ds_flags = flags;
+	phys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&phys->ds_guid,
+	    sizeof (phys->ds_guid));
+	phys->ds_snapnames_zapobj = zap_create_norm(mos,
+	    U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	phys->ds_creation_time = gethrestime_sec();
+	phys->ds_creation_txg = tx->tx_txg;
+	phys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+
+	if (origin != NULL) {
+		dsl_dataset_phys_t *ophys = origin->ds_phys;
+
+		/* Start from the origin's contents and accounting. */
+		phys->ds_prev_snap_obj = origin->ds_object;
+		phys->ds_prev_snap_txg = ophys->ds_creation_txg;
+		phys->ds_used_bytes = ophys->ds_used_bytes;
+		phys->ds_compressed_bytes = ophys->ds_compressed_bytes;
+		phys->ds_uncompressed_bytes = ophys->ds_uncompressed_bytes;
+		phys->ds_bp = ophys->ds_bp;
+		phys->ds_flags |= ophys->ds_flags;
+
+		dmu_buf_will_dirty(origin->ds_dbuf, tx);
+		origin->ds_phys->ds_num_children++;
+
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		dd->dd_phys->dd_origin_obj = origin->ds_object;
+	}
+
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+	dmu_buf_rele(bonus, FTAG);
+
+	/* Record the new dataset as the directory's head. */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = newobj;
+
+	return (newobj);
+}
+
+/*
+ * Create a child dsl_dir called 'lastname' under 'pdd', create its
+ * head dataset (optionally cloned from 'origin', see
+ * dsl_dataset_create_sync_impl()), set the creation permissions for
+ * 'cr', and return the object number of the new dataset.
+ */
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = pdd->dd_pool;
+	dsl_dir_t *newdd;
+	uint64_t newddobj;
+	uint64_t newdsobj;
+
+	ASSERT(lastname[0] != '@');
+
+	newddobj = dsl_dir_create_sync(pdd, lastname, tx);
+	VERIFY(0 == dsl_dir_open_obj(dp, newddobj, lastname, FTAG, &newdd));
+
+	newdsobj = dsl_dataset_create_sync_impl(newdd, origin, flags, tx);
+	dsl_deleg_set_create_perms(newdd, tx, cr);
+
+	dsl_dir_close(newdd, FTAG);
+
+	return (newdsobj);
+}
+
+struct destroyarg { /* argument block for dsl_snapshot_destroy_one() */
+ dsl_sync_task_group_t *dstg; /* group the destroy tasks are added to; also used as the open tag */
+ char *snapname; /* snapshot name appended after "@" to each dataset name */
+ char *failed; /* buffer that receives the name of a dataset whose open failed */
+};
+
+/*
+ * Callback for dmu_objset_find(): queue the destruction of snapshot
+ * "<name>@<da->snapname>".  'name' is extended in place to form the
+ * snapshot name and cut back at the '@' before returning.
+ *
+ * A dataset without such a snapshot (ENOENT) is silently skipped.  Any
+ * other open failure copies the dataset name into da->failed and
+ * returns the error.  On success the snapshot stays open (tagged with
+ * da->dstg) and a destroy task is added to the group.
+ */
+static int
+dsl_snapshot_destroy_one(char *name, void *arg)
+{
+	struct destroyarg *da = arg;
+	dsl_dataset_t *snap;
+	int error;
+
+	(void) strcat(name, "@");
+	(void) strcat(name, da->snapname);
+	error = dsl_dataset_open(name,
+	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+	    da->dstg, &snap);
+
+	/* Restore the caller's dataset name. */
+	*strchr(name, '@') = '\0';
+
+	switch (error) {
+	case 0:
+		break;
+	case ENOENT:
+		return (0);
+	default:
+		(void) strcpy(da->failed, name);
+		return (error);
+	}
+
+	dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
+	    dsl_dataset_destroy_sync, snap, da->dstg, 0);
+	return (0);
+}
+
+/*
+ * Destroy 'snapname' in all descendants of 'fsname'.
+ *
+ * Every matching snapshot is opened and queued by
+ * dsl_snapshot_destroy_one(); the queued tasks are then run as one
+ * sync task group.  When a task reports an error, 'fsname' is
+ * overwritten with the name of the dataset (without "@snapname") it
+ * belonged to.  Returns 0 or the first error met.
+ */
+#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
+int
+dsl_snapshots_destroy(char *fsname, char *snapname)
+{
+	struct destroyarg da;
+	dsl_sync_task_t *task;
+	spa_t *spa;
+	int error;
+
+	error = spa_open(fsname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+	da.snapname = snapname;
+	da.failed = fsname;
+
+	error = dmu_objset_find(fsname,
+	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
+	if (error == 0)
+		error = dsl_sync_task_group_wait(da.dstg);
+
+	task = list_head(&da.dstg->dstg_tasks);
+	while (task != NULL) {
+		dsl_dataset_t *snap = task->dst_arg1;
+
+		if (task->dst_err) {
+			dsl_dataset_name(snap, fsname);
+			*strchr(fsname, '@') = '\0';
+		}
+		/*
+		 * On success destroy_sync has already closed the
+		 * snapshot; on failure that is still our job.
+		 */
+		if (error)
+			dsl_dataset_close(snap, DS_MODE_EXCLUSIVE, da.dstg);
+
+		task = list_next(&da.dstg->dstg_tasks, task);
+	}
+
+	dsl_sync_task_group_destroy(da.dstg);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+/*
+ * Destroy the dataset 'ds'.
+ *
+ * ds must be opened EXCLUSIVE or PRIMARY. on return (whether
+ * successful or not), ds will be closed and caller can no longer
+ * dereference it.
+ *
+ * A snapshot is destroyed with a single sync task.  A head dataset is
+ * first marked inconsistent, then its objects are freed in open
+ * context, and finally the dataset and its dsl_dir are destroyed
+ * together in one sync task group.
+ *
+ * Returns 0 on success, EBUSY if a PRIMARY open cannot be upgraded to
+ * EXCLUSIVE, or the error of whichever step failed.
+ */
+int
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
+{
+ int err;
+ dsl_sync_task_group_t *dstg;
+ objset_t *os;
+ dsl_dir_t *dd;
+ uint64_t obj;
+
+ if (ds->ds_open_refcount != DS_REF_MAX) { /* not yet the sole, exclusive opener */
+ if (dsl_dataset_tryupgrade(ds, DS_MODE_PRIMARY,
+ DS_MODE_EXCLUSIVE) == 0) {
+ dsl_dataset_close(ds, DS_MODE_PRIMARY, tag); /* still PRIMARY: close with that weight */
+ return (EBUSY);
+ }
+ }
+
+ if (dsl_dataset_is_snapshot(ds)) {
+ /* Destroying a snapshot is simpler */
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
+ ds, tag, 0);
+ goto out;
+ }
+
+ dd = ds->ds_dir;
+
+ /*
+ * Check for errors and mark this ds as inconsistent, in
+ * case we crash while freeing the objects.
+ */
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
+ dsl_dataset_destroy_begin_sync, ds, NULL, 0);
+ if (err)
+ goto out;
+
+ err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+ if (err)
+ goto out;
+
+ /*
+ * remove the objects in open context, so that we won't
+ * have too much to do in syncing context.
+ */
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
+ ds->ds_phys->ds_prev_snap_txg)) { /* loop ends when dmu_object_next() returns non-zero */
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
+ dmu_tx_hold_bonus(tx, obj);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ /*
+ * Perhaps there is not enough disk
+ * space. Just deal with it from
+ * dsl_dataset_destroy_sync().
+ */
+ dmu_tx_abort(tx);
+ continue; /* err is replaced by dmu_object_next() in the loop header */
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ dmu_tx_commit(tx);
+ }
+ /* Make sure it's not dirty before we finish destroying it. */
+ txg_wait_synced(dd->dd_pool, 0);
+
+ dmu_objset_close(os);
+ if (err != ESRCH) /* only ESRCH from the loop above lets the destroy proceed */
+ goto out;
+
+ if (ds->ds_user_ptr) { /* evict the attached user object before the final destroy */
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); /* our own FTAG hold on the dsl_dir */
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ if (err)
+ goto out;
+
+ /*
+ * Blow away the dsl_dir + head dataset.
+ */
+ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+ dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, tag, 0);
+ dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+ dsl_dir_destroy_sync, dd, FTAG, 0);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+ /* if it is successful, *destroy_sync will close the ds+dd */
+ if (err)
+ dsl_dir_close(dd, FTAG);
+out:
+ if (err) /* on success dsl_dataset_destroy_sync() already closed ds */
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
+ return (err);
+}
+
+/*
+ * Roll 'ds' back by running dsl_dataset_rollback_check() and
+ * dsl_dataset_rollback_sync() as a sync task; 'ost' is handed to both
+ * by reference.  The dataset's open refcount is asserted to be
+ * DS_REF_MAX.  Returns the result of dsl_sync_task_do().
+ */
+int
+dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
+{
+	dsl_pool_t *pool = ds->ds_dir->dd_pool;
+	int error;
+
+	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+
+	error = dsl_sync_task_do(pool, dsl_dataset_rollback_check,
+	    dsl_dataset_rollback_sync, ds, &ost, 0);
+	return (error);
+}
+
+/*
+ * Attach user object 'p' and its eviction function 'func' to 'ds',
+ * but only if no user object is attached yet.  Returns the previously
+ * attached pointer: NULL means 'p' was installed, non-NULL means the
+ * dataset was left unchanged.
+ */
+void *
+dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+    void *p, dsl_dataset_evict_func_t func)
+{
+	void *existing;
+
+	mutex_enter(&ds->ds_lock);
+	existing = ds->ds_user_ptr;
+	if (existing != NULL) {
+		mutex_exit(&ds->ds_lock);
+		return (existing);
+	}
+	ds->ds_user_ptr = p;
+	ds->ds_user_evict_func = func;
+	mutex_exit(&ds->ds_lock);
+
+	return (NULL);
+}
+
+/*
+ * Return the user object currently attached to 'ds' (NULL if none).
+ * ds_lock is not taken.
+ */
+void *
+dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
+{
+	void *attached = ds->ds_user_ptr;
+
+	return (attached);
+}
+
+
+/*
+ * Return a pointer to the block pointer (ds_bp) stored in the
+ * dataset's ds_phys.
+ */
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *phys = ds->ds_phys;
+
+	return (&phys->ds_bp);
+}
+
+/*
+ * Store '*bp' as the block pointer of 'ds' in syncing context.  A NULL
+ * 'ds' stands for the meta-objset, whose block pointer is kept in the
+ * pool's dp_meta_rootbp; for a real dataset the bonus buffer is
+ * dirtied and ds_bp is overwritten.
+ */
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	if (ds != NULL) {
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_phys->ds_bp = *bp;
+		return;
+	}
+
+	/* If it's the meta-objset, set dp_meta_rootbp */
+	tx->tx_pool->dp_meta_rootbp = *bp;
+}
+
+/*
+ * Return the spa_t of the pool that 'ds' belongs to.
+ */
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+	dsl_pool_t *pool = ds->ds_dir->dd_pool;
+
+	return (pool->dp_spa);
+}
+
+/*
+ * Put 'ds' on the pool's dirty-dataset list for the txg of 'tx'.  A
+ * NULL 'ds' (the meta-objset) is ignored.  Dirtying a dataset that has
+ * a next snapshot panics.  If txg_list_add() returns 0, an extra
+ * reference is taken on the dataset's bonus buffer.
+ */
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *pool;
+
+	/* this is the meta-objset */
+	if (ds == NULL)
+		return;
+
+	ASSERT(ds->ds_user_ptr != NULL);
+
+	if (ds->ds_phys->ds_next_snap_obj != 0)
+		panic("dirtying snapshot!");
+
+	pool = ds->ds_dir->dd_pool;
+	if (txg_list_add(&pool->dp_dirty_datasets, ds, tx->tx_txg) != 0)
+		return;
+
+	/* up the hold count until we can be written out */
+	dmu_buf_add_ref(ds->ds_dbuf, ds);
+}
+
+/*
+ * Recompute ds_unique_bytes for a head dataset.
+ *
+ * The space unique to the head is the space it uses now minus the part
+ * of the most recent snapshot that the head still references.  That
+ * still-referenced part is the snapshot's total used space less what
+ * has been freed since the snapshot was taken, i.e. less the space on
+ * the head's deadlist:
+ *
+ *	unique = used - (snapshot_used - deadlist_used)
+ *
+ * If the pool version supports it, the dataset is also flagged
+ * DS_FLAG_UNIQUE_ACCURATE.
+ */
+static void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+	uint64_t snap_used;
+	uint64_t dead_used, dead_comp, dead_uncomp;
+
+	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+
+	/* Space used by the most recent snapshot, if there is one. */
+	snap_used = (ds->ds_phys->ds_prev_snap_obj != 0) ?
+	    ds->ds_prev->ds_phys->ds_used_bytes : 0;
+
+	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dead_used, &dead_comp,
+	    &dead_uncomp));
+
+	ASSERT3U(dead_used, <=, snap_used);
+	ds->ds_phys->ds_unique_bytes =
+	    ds->ds_phys->ds_used_bytes - (snap_used - dead_used);
+
+	if (DS_UNIQUE_IS_ACCURATE(ds))
+		return;
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+	    SPA_VERSION_UNIQUE_ACCURATE)
+		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
+
+/*
+ * Return ds_unique_bytes for 'ds'.  For a non-snapshot dataset whose
+ * value is not flagged accurate, the value is recomputed first.
+ */
+static uint64_t
+dsl_dataset_unique(dsl_dataset_t *ds)
+{
+	boolean_t stale =
+	    !DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds);
+
+	if (stale)
+		dsl_dataset_recalc_head_uniq(ds);
+
+	return (ds->ds_phys->ds_unique_bytes);
+}
+
+struct killarg { /* argument block for the kill_blkptr() traversal callback */
+ int64_t *usedp; /* running total of bp_get_dasize() over freed blocks */
+ int64_t *compressedp; /* running total of BP_GET_PSIZE() over freed blocks */
+ int64_t *uncompressedp; /* running total of BP_GET_UCSIZE() over freed blocks */
+ zio_t *zio; /* zio passed to arc_free() for each block */
+ dmu_tx_t *tx; /* transaction whose tx_txg is passed to arc_free() */
+};
+
+/*
+ * Traversal callback: account for the block described by 'bc' in the
+ * totals referenced by the struct killarg 'arg', then free it with
+ * arc_free() under the killarg's zio and txg.  Always returns 0.
+ */
+static int
+kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	struct killarg *totals = arg;
+	blkptr_t *blk = &bc->bc_blkptr;
+
+	ASSERT3U(bc->bc_errno, ==, 0);
+
+	/*
+	 * The callback is never invoked concurrently, so the totals
+	 * can be updated without a lock.
+	 */
+	*totals->usedp += bp_get_dasize(spa, blk);
+	*totals->compressedp += BP_GET_PSIZE(blk);
+	*totals->uncompressedp += BP_GET_UCSIZE(blk);
+
+	/* XXX check for EIO? */
+	(void) arc_free(totals->zio, spa, totals->tx->tx_txg, blk,
+	    NULL, NULL, ARC_NOWAIT);
+
+	return (0);
+}
+
+/*
+ * Check half of the rollback sync task.  arg1 is the dataset, arg2
+ * points at the objset type passed to dsl_dataset_rollback().
+ *
+ * Returns EINVAL when the dataset has no previous snapshot txg and the
+ * objset type is not DMU_OST_ZFS, or when the dataset is a snapshot;
+ * EAGAIN when the dataset was modified in this txg; 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	dmu_objset_type_t *ostp = arg2;
+	dsl_dataset_phys_t *phys = ds->ds_phys;
+
+	/* Rolling back to emptiness is only possible for a ZPL objset. */
+	if (*ostp != DMU_OST_ZFS && phys->ds_prev_snap_txg == 0)
+		return (EINVAL);
+
+	/* Snapshots cannot be rolled back. */
+	if (phys->ds_next_snap_obj != 0)
+		return (EINVAL);
+
+	/*
+	 * traverse_dsl_dataset won't find changes made in this txg, so
+	 * ask to be retried.
+	 */
+	if (phys->ds_bp.blk_birth >= tx->tx_txg)
+		return (EAGAIN);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{ /*
+ * Sync half of the rollback sync task.  arg1 is the dataset and arg2
+ * points at the objset type used if the objset has to be recreated.
+ * In order: the ZIL and any attached user object are discarded, the
+ * deadlist is replaced with a new empty one, every block born after
+ * the previous snapshot is freed, and the dataset's contents are
+ * either reset to those of the previous snapshot or, if there is no
+ * previous snapshot, zeroed and a new objset created.
+ */
+ dsl_dataset_t *ds = arg1;
+ dmu_objset_type_t *ost = arg2;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ /*
+ * Before the roll back destroy the zil.
+ */
+ if (ds->ds_user_ptr != NULL) {
+ zil_rollback_destroy(
+ ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
+
+ /*
+ * We need to make sure that the objset_impl_t is reopened after
+ * we do the rollback, otherwise it will have the wrong
+ * objset_phys_t. Normally this would happen when this
+ * DS_MODE_EXCLUSIVE dataset-open is closed, thus causing the
+ * dataset to be immediately evicted. But when doing "zfs recv
+ * -F", we reopen the objset before that, so that there is no
+ * window where the dataset is closed and inconsistent.
+ */
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+
+ /* Zero out the deadlist. */
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
+
+ {
+ /* Free blkptrs that we gave birth to */
+ zio_t *zio;
+ int64_t used = 0, compressed = 0, uncompressed = 0;
+ struct killarg ka;
+ int64_t delta;
+
+ zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
+ ZIO_FLAG_MUSTSUCCEED);
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka); /* kill_blkptr() fills in used/compressed/uncompressed */
+ (void) zio_wait(zio);
+
+ /* only deduct space beyond any refreservation */
+ delta = parent_delta(ds, -used);
+ dsl_dir_diduse_space(ds->ds_dir,
+ delta, -compressed, -uncompressed, tx);
+ }
+
+ if (ds->ds_prev) {
+ /* Change our contents to that of the prev snapshot */
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+ ds->ds_phys->ds_used_bytes =
+ ds->ds_prev->ds_phys->ds_used_bytes;
+ ds->ds_phys->ds_compressed_bytes =
+ ds->ds_prev->ds_phys->ds_compressed_bytes;
+ ds->ds_phys->ds_uncompressed_bytes =
+ ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+ ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
+ ds->ds_phys->ds_unique_bytes = 0;
+
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { /* prev's next snapshot is this dataset */
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ }
+ } else {
+ /* Zero out our contents, recreate objset */
+ bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
+ ds->ds_phys->ds_used_bytes = 0;
+ ds->ds_phys->ds_compressed_bytes = 0;
+ ds->ds_phys->ds_uncompressed_bytes = 0;
+ ds->ds_phys->ds_flags = 0;
+ ds->ds_phys->ds_unique_bytes = 0;
+ (void) dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
+ &ds->ds_phys->ds_bp, *ost, tx);
+ }
+
+ spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
+ tx, cr, "dataset = %llu", ds->ds_object);
+}
+
+/*
+ * Check half of the "destroy begin" sync task; arg1 is the dataset.
+ *
+ * Returns EINVAL if the previous snapshot's next-snapshot link points
+ * back at this dataset (i.e. the head still has snapshots of its own),
+ * EEXIST if the dataset's dsl_dir has child directories, the error
+ * from zap_count() if counting them fails, and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	dsl_dataset_t *prev = ds->ds_prev;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t nchildren;
+	int error;
+
+	/*
+	 * A head dataset with snapshots cannot be deleted, unless the
+	 * only snapshots belong to the branch it was cloned from.
+	 */
+	if (prev != NULL && prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+		return (EINVAL);
+
+	/*
+	 * Strictly a dsl_dir matter, but checking here makes it less
+	 * likely that the dataset is left inconsistent and only half
+	 * destroyed.
+	 */
+	error = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj,
+	    &nchildren);
+	if (error != 0)
+		return (error);
+
+	return (nchildren != 0 ? EEXIST : 0);
+}
+
+/*
+ * Sync half of the "destroy begin" sync task; arg1 is the dataset.
+ * Sets DS_FLAG_INCONSISTENT in the on-disk flags, so that a crash
+ * during the destroy leaves the dataset marked, and logs the event to
+ * the pool history.
+ */
+/* ARGSUSED */
+static void
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+	/* Mark it as inconsistent on-disk, in case we crash */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, spa, tx,
+	    cr, "dataset = %llu", ds->ds_object);
+}
+
+/*
+ * Check half of the destroy sync task; arg1 is the dataset.
+ *
+ * Returns EEXIST if the dataset is a branch point (more than one
+ * child), EINVAL if the previous snapshot's next-snapshot link points
+ * back at this dataset, EAGAIN if the dataset was modified in this
+ * txg, and 0 otherwise.
+ */
+/* ARGSUSED */
+int
+dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	dsl_dataset_t *prev = ds->ds_prev;
+
+	/* Can't delete a branch point. */
+	if (ds->ds_phys->ds_num_children > 1)
+		return (EEXIST);
+
+	/*
+	 * A head dataset with snapshots cannot be deleted, unless the
+	 * only snapshots belong to the branch it was cloned from.
+	 */
+	if (prev != NULL && prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+		return (EINVAL);
+
+	/*
+	 * traverse_dsl_dataset won't find changes made in this txg, so
+	 * ask to be retried.
+	 */
+	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
+		return (EAGAIN);
+
+	/* XXX we should do some i/o error checking... */
+	return (0);
+}
+
+void
+dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+{ /*
+ * Sync half of the destroy sync task.  arg1 is the dataset, which must
+ * be open with refcount DS_REF_MAX; 'tag' is the tag it was opened
+ * with and is used for the final close.
+ *
+ * In order: any refreservation is removed; the dataset is unlinked
+ * from its previous snapshot; then either (snapshot case) its deadlist
+ * is merged with the next snapshot's and blocks no longer referenced
+ * are freed, or (head case) the deadlist is destroyed and every block
+ * born after the previous snapshot is freed; space accounting is
+ * charged back to the dsl_dir; the snapshot-name ZAP and the name
+ * entry are removed; and finally the dataset is closed and its MOS
+ * object freed.
+ */
+ dsl_dataset_t *ds = arg1;
+ int64_t used = 0, compressed = 0, uncompressed = 0;
+ zio_t *zio;
+ int err;
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds_prev = NULL;
+ uint64_t obj;
+
+ ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT(ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+
+ /* Remove our reservation */
+ if (ds->ds_reserved != 0) {
+ uint64_t val = 0;
+ dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+ ASSERT3U(ds->ds_reserved, ==, 0);
+ }
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ obj = ds->ds_object; /* saved: needed after ds is closed at the end */
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (ds->ds_prev) {
+ ds_prev = ds->ds_prev;
+ } else {
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_prev)); /* closed again near the end of this function */
+ }
+ after_branch_point =
+ (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ ds->ds_phys->ds_next_snap_obj == 0) {
+ /* This clone is toast. */
+ ASSERT(ds_prev->ds_phys->ds_num_children > 1);
+ ds_prev->ds_phys->ds_num_children--;
+ } else if (!after_branch_point) {
+ ds_prev->ds_phys->ds_next_snap_obj =
+ ds->ds_phys->ds_next_snap_obj; /* unlink ds from the snapshot chain */
+ }
+ }
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ blkptr_t bp;
+ dsl_dataset_t *ds_next;
+ uint64_t itor = 0;
+ uint64_t old_unique;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_next));
+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+ old_unique = dsl_dataset_unique(ds_next);
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ ds_next->ds_phys->ds_prev_snap_obj =
+ ds->ds_phys->ds_prev_snap_obj;
+ ds_next->ds_phys->ds_prev_snap_txg =
+ ds->ds_phys->ds_prev_snap_txg;
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+ /*
+ * Transfer to our deadlist (which will become next's
+ * new deadlist) any entries from next's current
+ * deadlist which were born before prev, and free the
+ * other entries.
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor,
+ &bp) == 0) {
+ if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
+ &bp, tx));
+ if (ds_prev && !after_branch_point &&
+ bp.blk_birth >
+ ds_prev->ds_phys->ds_prev_snap_txg) {
+ ds_prev->ds_phys->ds_unique_bytes +=
+ bp_get_dasize(dp->dp_spa, &bp);
+ }
+ } else {
+ used += bp_get_dasize(dp->dp_spa, &bp);
+ compressed += BP_GET_PSIZE(&bp);
+ uncompressed += BP_GET_UCSIZE(&bp);
+ /* XXX check return value? */
+ (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ &bp, NULL, NULL, ARC_NOWAIT);
+ }
+ }
+
+ /* free next's deadlist */
+ bplist_close(&ds_next->ds_deadlist);
+ bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+
+ /* set next's deadlist to our deadlist */
+ ds_next->ds_phys->ds_deadlist_obj =
+ ds->ds_phys->ds_deadlist_obj;
+ VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj));
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (ie. be on the snap after next's
+ * deadlist).
+ *
+ * XXX we're doing this long task with the
+ * config lock held
+ */
+ dsl_dataset_t *ds_after_next;
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds_next->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_after_next));
+ itor = 0;
+ while (bplist_iterate(&ds_after_next->ds_deadlist,
+ &itor, &bp) == 0) {
+ if (bp.blk_birth >
+ ds->ds_phys->ds_prev_snap_txg &&
+ bp.blk_birth <=
+ ds->ds_phys->ds_creation_txg) {
+ ds_next->ds_phys->ds_unique_bytes +=
+ bp_get_dasize(dp->dp_spa, &bp);
+ }
+ }
+
+ dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+ } else {
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
+ ds_next); /* drop ds_next's hold on the dataset being destroyed */
+ if (ds_prev) {
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds_next, &ds_next->ds_prev));
+ } else {
+ ds_next->ds_prev = NULL;
+ }
+
+ dsl_dataset_recalc_head_uniq(ds_next);
+
+ /*
+ * Reduce the amount of our unconsumed refreservation
+ * being charged to our parent by the amount of
+ * new unique data we have gained.
+ */
+ if (old_unique < ds_next->ds_reserved) {
+ int64_t mrsdelta;
+ uint64_t new_unique =
+ ds_next->ds_phys->ds_unique_bytes;
+
+ ASSERT(old_unique <= new_unique);
+ mrsdelta = MIN(new_unique - old_unique,
+ ds_next->ds_reserved - old_unique);
+ dsl_dir_diduse_space(ds->ds_dir, -mrsdelta,
+ 0, 0, tx);
+ }
+ }
+ dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
+
+ /*
+ * NB: unique_bytes might not be accurate for the head objset.
+ * Before SPA_VERSION 9, we didn't update its value when we
+ * deleted the most recent snapshot.
+ */
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ } else {
+ /*
+ * There's no next snapshot, so this is a head dataset.
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty. (If it's a clone, it's
+ * safe to ignore the deadlist contents.)
+ */
+ struct killarg ka;
+
+ ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ ASSERT3U(err, ==, 0);
+ ASSERT(spa_version(dp->dp_spa) <
+ SPA_VERSION_UNIQUE_ACCURATE ||
+ used == ds->ds_phys->ds_unique_bytes);
+ }
+
+ err = zio_wait(zio); /* wait for the frees issued under the root zio */
+ ASSERT3U(err, ==, 0);
+
+ dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
+
+ if (ds->ds_phys->ds_snapnames_zapobj) {
+ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+ ASSERT(err == 0);
+ }
+
+ if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
+ /* Erase the link in the dataset */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+ /*
+ * dsl_dir_sync_destroy() called us, they'll destroy
+ * the dataset.
+ */
+ } else {
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_head));
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+
+ err = dsl_dataset_snap_lookup(mos,
+ ds_head->ds_phys->ds_flags,
+ ds_head->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, &val);
+ ASSERT3U(err, ==, 0);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ err = dsl_dataset_snap_remove(mos,
+ ds_head->ds_phys->ds_flags,
+ ds_head->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, tx);
+ ASSERT(err == 0);
+ dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+ }
+
+ if (ds_prev && ds->ds_prev != ds_prev) /* only close the hold opened above with FTAG */
+ dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+ spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
+ cr, "dataset = %llu", ds->ds_object);
+
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
+ VERIFY(0 == dmu_object_free(mos, obj, tx)); /* uses the saved obj, not ds->ds_object */
+
+}
+
+/*
+ * Space check performed on behalf of a snapshot of 'ds'.  Nothing is
+ * checked unless 'tx' is syncing.  Returns ENOSPC when the space that
+ * must be found outside the refreservation is not available in the
+ * containing dsl_dir, 0 otherwise.
+ */
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	uint64_t needed;
+
+	if (dmu_tx_is_syncing(tx)) {
+		/*
+		 * With an fs-only reservation in place, every block that
+		 * may end up owned by the snapshot has to be paid for with
+		 * space outside of that reservation.
+		 */
+		needed = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+		if (needed >
+		    dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+			return (ENOSPC);
+
+		/*
+		 * Propagate the space this snapshot will use so that other
+		 * snapshot checks in the same sync group can see it.
+		 */
+		if (needed > 0)
+			dsl_dir_willuse_space(ds->ds_dir, needed, tx);
+	}
+
+	return (0);
+}
+
+/*
+ * Check phase of the snapshot sync task: arg1 is the dataset to be
+ * snapshotted, arg2 the proposed snapshot name.  On success the
+ * dataset's ds_trysnap_txg is set to the txg of 'tx'.
+ */
+/* ARGSUSED */
+int
+dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	const char *snapname = arg2;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t existing;
+	int error;
+
+	/* Only one snapshot per txg; ask the caller to try again. */
+	if (tx->tx_txg <= ds->ds_phys->ds_prev_snap_txg)
+		return (EAGAIN);
+
+	/* The name must be free: ENOENT is the only acceptable answer. */
+	error = dsl_dataset_snap_lookup(mos, ds->ds_phys->ds_flags,
+	    ds->ds_phys->ds_snapnames_zapobj, snapname, &existing);
+	switch (error) {
+	case ENOENT:
+		break;
+	case 0:
+		return (EEXIST);
+	default:
+		return (error);
+	}
+
+	/* "<dataset name>@<snapshot name>" has to fit in MAXNAMELEN. */
+	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
+		return (ENAMETOOLONG);
+
+	error = dsl_dataset_snapshot_reserve_space(ds, tx);
+	if (error != 0)
+		return (error);
+
+	ds->ds_trysnap_txg = tx->tx_txg;
+	return (0);
+}
+
+/*
+ * Sync phase of the snapshot sync task: snapshot 'ds' (arg1) under the
+ * name 'snapname' (arg2).  A new dataset object is allocated in the MOS
+ * and its phys is filled in from the head's current state; the new object
+ * then becomes ds's previous snapshot, ds's unique bytes are reset to
+ * zero, ds is given a freshly created deadlist (the old deadlist object
+ * now belongs to the snapshot) and the name is added to ds's snapnames
+ * zapobj.  The config rwlock must be held as writer.
+ */
+void
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ const char *snapname = arg2;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ /*
+ * Allocate the snapshot's dataset object and fill in its phys: it
+ * sits between the head's old previous snapshot and the head, and
+ * takes over the head's deadlist, space totals, flags and root bp.
+ */
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+ dsphys->ds_next_snap_obj = ds->ds_object;
+ dsphys->ds_num_children = 1;
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
+ dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_flags = ds->ds_phys->ds_flags;
+ dsphys->ds_bp = ds->ds_phys->ds_bp;
+ dmu_buf_rele(dbuf, FTAG);
+
+ /*
+ * If the old previous snapshot names ds as its next snapshot, point
+ * it at the new snapshot instead.  Otherwise it must have more than
+ * one child and its next pointer is left alone.
+ */
+ ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+ if (ds->ds_prev) {
+ ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object ||
+ ds->ds_prev->ds_phys->ds_num_children > 1);
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds->ds_prev->ds_phys->ds_creation_txg);
+ ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+ }
+ }
+
+ /*
+ * If we have a reference-reservation on this dataset, we will
+ * need to increase the amount of refreservation being charged
+ * since our unique space is going to zero.
+ */
+ if (ds->ds_reserved) {
+ int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ dsl_dir_diduse_space(ds->ds_dir, add, 0, 0, tx);
+ }
+
+ /*
+ * Update the head: the new snapshot becomes its previous snapshot,
+ * its unique bytes start over at zero, and it gets a new, empty
+ * deadlist object in place of the one handed to the snapshot.
+ */
+ bplist_close(&ds->ds_deadlist);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
+ ds->ds_phys->ds_prev_snap_obj = dsobj;
+ ds->ds_phys->ds_prev_snap_txg = tx->tx_txg;
+ ds->ds_phys->ds_unique_bytes = 0;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+ ds->ds_phys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
+
+ /* record the snapshot name -> object number mapping */
+ dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
+ err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &dsobj, tx);
+ ASSERT(err == 0);
+
+ /* drop the old ds_prev and re-open it on the new snapshot */
+ if (ds->ds_prev)
+ dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, snapname,
+ DS_MODE_NONE, ds, &ds->ds_prev));
+
+ spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+ "dataset = %llu", dsobj);
+}
+
+/*
+ * Sync out a dataset in syncing context.  The dataset must have no next
+ * snapshot and must have a non-NULL ds_user_ptr.  The in-core
+ * ds_fsid_guid is written back to the phys, the containing dsl_dir is
+ * dirtied, and ds_user_ptr is handed to dmu_objset_sync() along with the
+ * given zio.
+ */
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+ /*
+ * in case we had to change ds_fsid_guid when we opened it,
+ * sync it out now.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+
+ dsl_dir_dirty(ds->ds_dir, tx);
+ dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+}
+
+/*
+ * Add this dataset's statistics to 'nv': first whatever dsl_dir_stats()
+ * reports for the containing dir, then the dataset-level properties.
+ * For a snapshot, "used" and "compressratio" are added again with
+ * values taken from the snapshot itself.
+ */
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	uint64_t refd, avail, uobjs, aobjs;
+	uint64_t ratio;
+
+	dsl_dir_stats(ds->ds_dir, nv);
+
+	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+	    ds->ds_phys->ds_creation_time);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+	    ds->ds_phys->ds_creation_txg);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+	    ds->ds_reserved);
+
+	/* Nothing more to report unless this is a snapshot. */
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return;
+
+	/*
+	 * Snapshot: override the dd's space used with our unique space,
+	 * and the compression ratio with our own (as a percentage; 100
+	 * when nothing is stored compressed).
+	 */
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+	    ds->ds_phys->ds_unique_bytes);
+	if (ds->ds_phys->ds_compressed_bytes == 0) {
+		ratio = 100;
+	} else {
+		ratio = ds->ds_phys->ds_uncompressed_bytes * 100 /
+		    ds->ds_phys->ds_compressed_bytes;
+	}
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
+}
+
+/*
+ * Fill in the cheap-to-compute fields of 'stat' for 'ds'.  The snapshot
+ * fields are only written when 'ds' is a snapshot, and dds_origin only
+ * when the containing dir has an origin.
+ */
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_pool_t *dp = dd->dd_pool;
+
+	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
+	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
+	stat->dds_guid = ds->ds_phys->ds_guid;
+	if (ds->ds_phys->ds_next_snap_obj != 0) {
+		stat->dds_is_snapshot = B_TRUE;
+		/* num_children counts the next snapshot plus any clones */
+		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+	}
+
+	/* clone origin is really a dsl_dir thing... */
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	if (dd->dd_phys->dd_origin_obj != 0) {
+		dsl_dataset_t *origin;
+
+		VERIFY(0 == dsl_dataset_open_obj(dp,
+		    dd->dd_phys->dd_origin_obj,
+		    NULL, DS_MODE_NONE, FTAG, &origin));
+		dsl_dataset_name(origin, stat->dds_origin);
+		dsl_dataset_close(origin, DS_MODE_NONE, FTAG);
+	}
+	rw_exit(&dp->dp_config_rwlock);
+}
+
+/*
+ * Return the dataset's in-core fsid guid (ds_fsid_guid).
+ */
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+ return (ds->ds_fsid_guid);
+}
+
+/*
+ * Report space and object usage for 'ds':
+ *	*refdbytesp	bytes referenced (ds_used_bytes)
+ *	*availbytesp	bytes available, including any unconsumed
+ *			refreservation and capped by the refquota
+ *	*usedobjsp	objects in use (fill count of the root bp)
+ *	*availobjsp	objects still available (DN_MAX_OBJECT - used)
+ */
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+    uint64_t *refdbytesp, uint64_t *availbytesp,
+    uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+	*refdbytesp = ds->ds_phys->ds_used_bytes;
+	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+
+	/* The unconsumed part of the refreservation is ours to use. */
+	if (ds->ds_phys->ds_unique_bytes < ds->ds_reserved)
+		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+
+	/* Adjust available bytes according to refquota, if one is set. */
+	if (ds->ds_quota != 0) {
+		if (*refdbytesp >= ds->ds_quota) {
+			*availbytesp = 0;
+		} else {
+			uint64_t room = ds->ds_quota - *refdbytesp;
+
+			if (room < *availbytesp)
+				*availbytesp = room;
+		}
+	}
+
+	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
+
+/*
+ * Return B_TRUE if 'ds' has a previous snapshot and its root block
+ * pointer was born after that snapshot was created; B_FALSE otherwise
+ * (including when there is no previous snapshot).  The caller must hold
+ * the config rwlock or be in pool sync context.
+ */
+boolean_t
+dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+
+	if (ds->ds_prev != NULL &&
+	    ds->ds_phys->ds_bp.blk_birth >
+	    ds->ds_prev->ds_phys->ds_creation_txg) {
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Check phase of a snapshot rename: arg1 is the snapshot, arg2 the new
+ * snapshot name (without the "<fs>@" prefix).  Fails with EEXIST if the
+ * new name is already present in the head dataset's snapshot namespace
+ * and with ENAMETOOLONG if the resulting full name would not fit.
+ */
+/* ARGSUSED */
+static int
+dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	char *newsnapname = arg2;
+	dsl_dir_t *dd = ds->ds_dir;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_dataset_t *head;
+	uint64_t ignored;
+	int error;
+
+	error = dsl_dataset_open_obj(dd->dd_pool,
+	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &head);
+	if (error != 0)
+		return (error);
+
+	/* new name better not be in use */
+	error = dsl_dataset_snap_lookup(mos, head->ds_phys->ds_flags,
+	    head->ds_phys->ds_snapnames_zapobj, newsnapname, &ignored);
+	dsl_dataset_close(head, DS_MODE_NONE, FTAG);
+
+	switch (error) {
+	case 0:
+		error = EEXIST;
+		break;
+	case ENOENT:
+		error = 0;
+		break;
+	default:
+		break;
+	}
+
+	/* dataset name + 1 for the "@" + the new snapshot name must fit */
+	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >=
+	    MAXNAMELEN) {
+		error = ENAMETOOLONG;
+	}
+
+	return (error);
+}
+
+/*
+ * Sync phase of a snapshot rename: arg1 is the snapshot, arg2 the new
+ * snapshot name.  The old name is removed from the head dataset's
+ * snapnames zapobj, ds_snapname is overwritten with the new name under
+ * ds_lock, and the new name is added back pointing at the same object.
+ */
+static void
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
+ cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ const char *newsnapname = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dataset_t *hds;
+ int err;
+
+ /* only snapshots can be renamed this way */
+ ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
+
+ /* make sure ds_snapname holds the current name before removing it */
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ err = dsl_dataset_snap_remove(mos, hds->ds_phys->ds_flags,
+ hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, tx);
+ ASSERT3U(err, ==, 0);
+ mutex_enter(&ds->ds_lock);
+ (void) strcpy(ds->ds_snapname, newsnapname);
+ mutex_exit(&ds->ds_lock);
+ err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx);
+ ASSERT3U(err, ==, 0);
+
+ spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+ cr, "dataset = %llu", ds->ds_object);
+ dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+}
+
+/*
+ * State shared between dsl_recursive_rename() and its per-filesystem
+ * callback dsl_snapshot_rename_one().
+ */
+struct renamesnaparg {
+ dsl_sync_task_group_t *dstg; /* task group; also used as the open tag */
+ char failed[MAXPATHLEN]; /* name of the dataset that failed, if any */
+ char *oldsnap; /* old snapshot name (text after the '@') */
+ char *newsnap; /* new snapshot name (text after the '@') */
+};
+
+/*
+ * dmu_objset_find() callback used by dsl_recursive_rename().  'name' is
+ * a filesystem name and 'arg' a struct renamesnaparg.  If the snapshot
+ * "<name>@<oldsnap>" exists, it is opened (tagged with the task group)
+ * and a rename sync task for it is added to the group; the dataset is
+ * closed later by dsl_recursive_rename().  A filesystem without that
+ * snapshot is silently skipped.
+ *
+ * On failure the offending name is copied into ra->failed and the error
+ * is returned.
+ */
+static int
+dsl_snapshot_rename_one(char *name, void *arg)
+{
+	struct renamesnaparg *ra = arg;
+	dsl_dataset_t *ds = NULL;
+	char *cp;
+	int err;
+
+	/* temporarily turn "fs" into "fs@oldsnap" in the caller's buffer */
+	cp = name + strlen(name);
+	*cp = '@';
+	(void) strcpy(cp + 1, ra->oldsnap);
+
+	/*
+	 * For recursive snapshot renames the parent won't be changing
+	 * so we just pass name for both the to/from argument.
+	 */
+	err = zfs_secpolicy_rename_perms(name, name, CRED());
+	if (err) {
+		(void) strcpy(ra->failed, name);
+		return (err);
+	}
+
+	err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
+	    ra->dstg, &ds);
+	if (err == ENOENT) {
+		/* this filesystem has no such snapshot; nothing to rename */
+		*cp = '\0';
+		return (0);
+	}
+	if (err) {
+		/*
+		 * The open failed, so there is no dataset to close: 'ds'
+		 * is still NULL and must not be passed to
+		 * dsl_dataset_close().
+		 */
+		(void) strcpy(ra->failed, name);
+		*cp = '\0';
+		return (err);
+	}
+
+#ifdef _KERNEL
+	/* for all filesystems undergoing rename, we'll need to unmount it */
+	(void) zfs_unmount_snap(name, NULL);
+#endif
+
+	/* restore the caller's buffer to the plain filesystem name */
+	*cp = '\0';
+
+	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
+	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
+
+	return (0);
+}
+
+/*
+ * Rename the snapshot named by the "@..." part of 'oldname' to the
+ * "@..." part of 'newname' in the filesystem named by 'oldname' and in
+ * every filesystem below it.  One rename sync task per snapshot found is
+ * collected into a task group by dsl_snapshot_rename_one() and the group
+ * is then waited on.  Both names must contain an '@'.
+ *
+ * On error, the name recorded in ra->failed is copied back into
+ * 'oldname', so the caller's buffer must be large enough to hold it.
+ */
+static int
+dsl_recursive_rename(char *oldname, const char *newname)
+{
+ int err;
+ struct renamesnaparg *ra;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ char *cp, *fsname = spa_strdup(oldname);
+ int len = strlen(oldname);
+
+ /* truncate the snapshot name to get the fsname */
+ cp = strchr(fsname, '@');
+ *cp = '\0';
+
+ err = spa_open(fsname, &spa, FTAG);
+ if (err) {
+ /* fsname was duplicated from oldname, so free len + 1 bytes */
+ kmem_free(fsname, len + 1);
+ return (err);
+ }
+ ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
+ ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+
+ ra->oldsnap = strchr(oldname, '@') + 1;
+ ra->newsnap = strchr(newname, '@') + 1;
+ *ra->failed = '\0';
+
+ err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
+ DS_FIND_CHILDREN);
+ kmem_free(fsname, len + 1);
+
+ if (err == 0) {
+ err = dsl_sync_task_group_wait(ra->dstg);
+ }
+
+ /*
+ * Close every dataset that dsl_snapshot_rename_one() opened, and
+ * for any task that reported an error record "<dir>@<newsnap>" as
+ * the failed name.
+ */
+ for (dst = list_head(&ra->dstg->dstg_tasks); dst;
+ dst = list_next(&ra->dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+ if (dst->dst_err) {
+ dsl_dir_name(ds->ds_dir, ra->failed);
+ (void) strcat(ra->failed, "@");
+ (void) strcat(ra->failed, ra->newsnap);
+ }
+ dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
+ }
+
+ /* report the failed name back through the caller's buffer */
+ if (err)
+ (void) strcpy(oldname, ra->failed);
+
+ dsl_sync_task_group_destroy(ra->dstg);
+ kmem_free(ra, sizeof (struct renamesnaparg));
+ spa_close(spa, FTAG);
+ return (err);
+}
+
+/*
+ * dmu_objset_find() callback: 'arg' points at the number of characters
+ * by which a rename will lengthen names.  Returns ENAMETOOLONG if
+ * 'oldname' would no longer fit in MAXNAMELEN after growing by that
+ * much, 0 otherwise.
+ */
+static int
+dsl_valid_rename(char *oldname, void *arg)
+{
+	int *deltap = arg;
+
+	return (strlen(oldname) + *deltap >= MAXNAMELEN ? ENAMETOOLONG : 0);
+}
+
+/*
+ * Rename a dataset (also exported under the weak alias
+ * dmu_objset_rename).
+ *
+ * If 'oldname' names a dsl_dir, the rename is handed to dsl_dir_rename()
+ * after checking, when the name is growing, that no child or snapshot
+ * name would become too long.  If 'oldname' names a snapshot, 'newname'
+ * must be a snapshot of the same filesystem (EINVAL if it has no '@',
+ * EXDEV if the part up to and including the '@' differs); the rename is
+ * then done either recursively or for the single snapshot.  ENOENT is
+ * returned if the name ends in a component that does not exist.
+ */
+#pragma weak dmu_objset_rename = dsl_dataset_rename
+int
+dsl_dataset_rename(char *oldname, const char *newname,
+ boolean_t recursive)
+{
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ const char *tail;
+ int err;
+
+ err = dsl_dir_open(oldname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ /* oldname resolved entirely to a dsl_dir: rename the dir */
+ int delta = strlen(newname) - strlen(oldname);
+
+ /* if we're growing, validate child size lengths */
+ if (delta > 0)
+ err = dmu_objset_find(oldname, dsl_valid_rename,
+ &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+ if (!err)
+ err = dsl_dir_rename(dd, newname);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ if (tail[0] != '@') {
+ /* the name ended in a nonexistent component */
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ /* new name must be snapshot in same filesystem */
+ tail = strchr(newname, '@');
+ if (tail == NULL)
+ return (EINVAL);
+ tail++;
+ /* compare everything up to and including the '@' */
+ if (strncmp(oldname, newname, tail - newname) != 0)
+ return (EXDEV);
+
+ if (recursive) {
+ err = dsl_recursive_rename(oldname, newname);
+ } else {
+ err = dsl_dataset_open(oldname,
+ DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
+ if (err)
+ return (err);
+
+ /* tail now points at the new snapshot name after the '@' */
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_snapshot_rename_check,
+ dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+
+ dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+ }
+
+ return (err);
+}
+
+/*
+ * Results computed by dsl_dataset_promote_check() and consumed by
+ * dsl_dataset_promote_sync().
+ */
+struct promotearg {
+ /* space moving to the promoted dir; the origin's new unique bytes */
+ uint64_t used, comp, uncomp, unique;
+ /*
+ * ds_flags and snapnames_obj are taken from the head dataset of the
+ * origin's dir; newnext_obj is the origin's new next snapshot.
+ */
+ uint64_t ds_flags, newnext_obj, snapnames_obj;
+};
+
+/*
+ * Check phase of the promote sync task.  arg1 is the head dataset of the
+ * clone being promoted; arg2 is a struct promotearg which is zeroed here
+ * and filled in for the sync phase.
+ *
+ * Returns EINVAL if the dataset's dir has no origin, and 0 right away
+ * when not in syncing context (the full check is only done while
+ * syncing).  Otherwise returns EXDEV if DS_FLAG_NOPROMOTE is set, EEXIST
+ * if a snapshot to be moved has a name already present in the clone's
+ * snapshot namespace, or whatever error is returned by the dataset
+ * opens, the bplist calls or dsl_dir_transfer_possible().
+ */
+/* ARGSUSED */
+static int
+dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *hds = arg1;
+ struct promotearg *pa = arg2;
+ dsl_dir_t *dd = hds->ds_dir;
+ dsl_pool_t *dp = hds->ds_dir->dd_pool;
+ dsl_dir_t *odd = NULL;
+ dsl_dataset_t *ds = NULL;
+ dsl_dataset_t *origin_ds = NULL;
+ dsl_dataset_t *newnext_ds = NULL;
+ int err;
+ char *name = NULL;
+ uint64_t itor = 0;
+ blkptr_t bp;
+
+ bzero(pa, sizeof (*pa));
+
+ /* Check that it is a clone */
+ if (dd->dd_phys->dd_origin_obj == 0)
+ return (EINVAL);
+
+ /* Since this is so expensive, don't do the preliminary check */
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ if (err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_origin_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &origin_ds))
+ goto out;
+ odd = origin_ds->ds_dir;
+
+ /*
+ * Remember the flags and snapnames zapobj of the head dataset of
+ * the origin's dir; the sync phase uses them to remove the moved
+ * snapshot names.
+ */
+ {
+ dsl_dataset_t *phds;
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ odd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_NONE, FTAG, &phds))
+ goto out;
+ pa->ds_flags = phds->ds_phys->ds_flags;
+ pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
+ }
+
+ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+ err = EXDEV;
+ goto out;
+ }
+
+ /*
+ * find origin's new next ds: walk back from the clone's head until
+ * we reach the dataset whose previous snapshot is the origin
+ */
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
+ NULL, DS_MODE_NONE, FTAG, &newnext_ds));
+ while (newnext_ds->ds_phys->ds_prev_snap_obj != origin_ds->ds_object) {
+ dsl_dataset_t *prev;
+
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ newnext_ds->ds_phys->ds_prev_snap_obj,
+ NULL, DS_MODE_NONE, FTAG, &prev))
+ goto out;
+ dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
+ newnext_ds = prev;
+ }
+ pa->newnext_obj = newnext_ds->ds_object;
+
+ /*
+ * compute origin's new unique space: blocks on the new next ds's
+ * deadlist that were born after the origin's previous snapshot
+ */
+ while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
+ &itor, &bp)) == 0) {
+ if (bp.blk_birth > origin_ds->ds_phys->ds_prev_snap_txg)
+ pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
+ }
+ if (err != ENOENT)
+ goto out;
+
+ /* Walk the snapshots that we are moving */
+ name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ ds = origin_ds;
+ /* CONSTCOND */
+ while (TRUE) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *prev;
+
+ /* Check that the snapshot name does not conflict */
+ /*
+ * NOTE(review): 'name' is never read afterwards; presumably
+ * dsl_dataset_name() is called to make sure ds->ds_snapname
+ * is filled in -- confirm.
+ */
+ dsl_dataset_name(ds, name);
+ err = dsl_dataset_snap_lookup(dd->dd_pool->dp_meta_objset,
+ hds->ds_phys->ds_flags, hds->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, &val);
+ if (err != ENOENT) {
+ if (err == 0)
+ err = EEXIST;
+ goto out;
+ }
+
+ /*
+ * compute space to transfer. Each snapshot gave birth to:
+ * (my used) - (prev's used) + (deadlist's used)
+ */
+ pa->used += ds->ds_phys->ds_used_bytes;
+ pa->comp += ds->ds_phys->ds_compressed_bytes;
+ pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
+
+ /* If we reach the first snapshot, we're done. */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ break;
+
+ if (err = bplist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp))
+ goto out;
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &prev))
+ goto out;
+ pa->used += dlused - prev->ds_phys->ds_used_bytes;
+ pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
+ pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
+
+ /*
+ * We could be a clone of a clone. If we reach our
+ * parent's branch point, we're done.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
+ break;
+ }
+ /* step back one snapshot; origin_ds itself is closed at 'out' */
+ if (ds != origin_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ ds = prev;
+ }
+
+ /* Check that there is enough space here */
+ err = dsl_dir_transfer_possible(odd, dd, pa->used);
+
+out:
+ /* common cleanup: release whatever is still open or allocated */
+ if (ds && ds != origin_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (origin_ds)
+ dsl_dataset_close(origin_ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (newnext_ds)
+ dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
+ if (name)
+ kmem_free(name, MAXPATHLEN);
+ return (err);
+}
+
+/*
+ * Sync phase of the promote sync task.  arg1 is the head dataset of the
+ * clone being promoted and arg2 the struct promotearg filled in by
+ * dsl_dataset_promote_check().
+ *
+ * Starting at the clone's origin and walking back through its previous
+ * snapshots, each snapshot's name entry is moved from the origin dir's
+ * snapshot namespace to the clone's, and the snapshot is re-parented to
+ * the clone's dsl_dir.  The walk stops at the first snapshot or at the
+ * branch point of the origin's own parent.  Afterwards the origin's
+ * next-snapshot pointer, the origin links of both dirs and the space
+ * accounting are updated.
+ */
+static void
+dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dataset_t *hds = arg1;
+	struct promotearg *pa = arg2;
+	dsl_dir_t *dd = hds->ds_dir;
+	dsl_pool_t *dp = hds->ds_dir->dd_pool;
+	dsl_dir_t *odd = NULL;
+	dsl_dataset_t *ds, *origin_ds;
+	char *name;
+
+	ASSERT(dd->dd_phys->dd_origin_obj != 0);
+	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
+
+	VERIFY(0 == dsl_dataset_open_obj(dp, dd->dd_phys->dd_origin_obj,
+	    NULL, DS_MODE_EXCLUSIVE, FTAG, &origin_ds));
+	/*
+	 * We need to explicitly open odd, since origin_ds's dd will be
+	 * changing.
+	 */
+	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
+	    NULL, FTAG, &odd));
+
+	/* move snapshots to this dir */
+	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	ds = origin_ds;
+	/* CONSTCOND */
+	while (TRUE) {
+		dsl_dataset_t *prev;
+
+		/* move snap name entry */
+		dsl_dataset_name(ds, name);
+		VERIFY(0 == dsl_dataset_snap_remove(dp->dp_meta_objset,
+		    pa->ds_flags, pa->snapnames_obj, ds->ds_snapname, tx));
+		VERIFY(0 == zap_add(dp->dp_meta_objset,
+		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+		    8, 1, &ds->ds_object, tx));
+
+		/* change containing dsl_dir */
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
+		ds->ds_phys->ds_dir_obj = dd->dd_object;
+		ASSERT3P(ds->ds_dir, ==, odd);
+		dsl_dir_close(ds->ds_dir, ds);
+		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
+		    NULL, ds, &ds->ds_dir));
+
+		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
+
+		/* the first snapshot has nothing before it */
+		if (ds->ds_phys->ds_prev_snap_obj == 0)
+			break;
+
+		VERIFY(0 == dsl_dataset_open_obj(dp,
+		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
+		    FTAG, &prev));
+
+		/* stop at the branch point of the origin's own parent */
+		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
+			break;
+		}
+		if (ds != origin_ds)
+			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+		ds = prev;
+	}
+	if (ds != origin_ds)
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+
+	/* change origin's next snap */
+	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+	origin_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
+
+	/* change origin */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
+	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+	dmu_buf_will_dirty(odd->dd_dbuf, tx);
+	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
+
+	/* change space accounting */
+	dsl_dir_diduse_space(odd, -pa->used, -pa->comp, -pa->uncomp, tx);
+	dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
+	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
+
+	/*
+	 * log history record.  Log the promoted dataset itself: 'ds' may
+	 * already have been closed above (whenever the walk moved past
+	 * origin_ds), so it must not be dereferenced here.
+	 */
+	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+	    cr, "dataset = %llu", hds->ds_object);
+
+	dsl_dir_close(odd, FTAG);
+	dsl_dataset_close(origin_ds, DS_MODE_EXCLUSIVE, FTAG);
+	kmem_free(name, MAXPATHLEN);
+}
+
+/*
+ * Promote the clone dataset called 'name' by running the promote
+ * check/sync pair as a sync task.  Returns 0 on success or the error
+ * from opening the dataset, from looking up its snapnames zapobj, or
+ * from the sync task.
+ */
+int
+dsl_dataset_promote(const char *name)
+{
+ dsl_dataset_t *ds;
+ int err;
+ dmu_object_info_t doi;
+ struct promotearg pa;
+
+ err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, &doi);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (err);
+ }
+
+ /*
+ * Scale the last argument of the sync task with the size of the
+ * snapnames zapobj (2 + 2 * its physical block count), since we
+ * will be moving a bunch of snapnames to the promoted ds, and
+ * dirtying their bonus buffers.
+ * NOTE(review): this comment used to say "128x the snapnames zapobj
+ * size", which does not match the expression below; the argument is
+ * presumably a blocks-modified estimate -- confirm against
+ * dsl_sync_task_do().
+ */
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (err);
+}
+
+/*
+ * Arguments for the clone swap sync task.  cds, ohds and force are set
+ * by dsl_dataset_clone_swap(); unused_refres_delta is computed by the
+ * check function and applied by the sync function.
+ */
+struct cloneswaparg {
+ dsl_dataset_t *cds; /* clone dataset */
+ dsl_dataset_t *ohds; /* origin's head dataset */
+ boolean_t force;
+ int64_t unused_refres_delta; /* change in unconsumed refreservation */
+};
+
+/*
+ * Check phase of the clone swap sync task; arg1 is a struct
+ * cloneswaparg.  Returns EINVAL if the two datasets are not a
+ * clone/origin-head pair in the expected relationship, ETXTBSY if the
+ * origin head has been modified since its last snapshot and 'force' is
+ * not set, and ENOSPC if the swap would increase the unconsumed
+ * refreservation by more than the space available.  As a side effect,
+ * csa->unused_refres_delta is computed for the sync phase.
+ */
+/* ARGSUSED */
+static int
+dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ struct cloneswaparg *csa = arg1;
+
+ /* they should both be heads */
+ if (dsl_dataset_is_snapshot(csa->cds) ||
+ dsl_dataset_is_snapshot(csa->ohds))
+ return (EINVAL);
+
+ /* the branch point should be just before them */
+ if (csa->cds->ds_prev != csa->ohds->ds_prev)
+ return (EINVAL);
+
+ /*
+ * NOTE(review): if both ds_prev pointers are NULL the comparison
+ * above passes and the next check dereferences NULL; presumably
+ * callers guarantee that cds is a clone with an origin -- confirm.
+ */
+ /* cds should be the clone */
+ if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
+ csa->ohds->ds_object)
+ return (EINVAL);
+
+ /* the clone should be a child of the origin */
+ if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
+ return (EINVAL);
+
+ /* ohds shouldn't be modified unless 'force' */
+ if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
+ return (ETXTBSY);
+
+ /* adjust amount of any unconsumed refreservation */
+ csa->unused_refres_delta =
+ (int64_t)MIN(csa->ohds->ds_reserved,
+ csa->ohds->ds_phys->ds_unique_bytes) -
+ (int64_t)MIN(csa->ohds->ds_reserved,
+ csa->cds->ds_phys->ds_unique_bytes);
+
+ if (csa->unused_refres_delta > 0 &&
+ csa->unused_refres_delta >
+ dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
+ return (ENOSPC);
+
+ return (0);
+}
+
+/*
+ * Sync phase of the clone swap sync task; arg1 is a struct cloneswaparg.
+ * Exchanges the contents of the clone (cds) and the origin's head (ohds):
+ * any attached user data is evicted from both, the origin snapshot's
+ * unique bytes are recomputed from the clone's deadlist, and then the
+ * root block pointers, the space totals and the deadlists of the two
+ * datasets are swapped, with the dsl_dir space accounting adjusted to
+ * match.
+ */
+/* ARGSUSED */
+static void
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ struct cloneswaparg *csa = arg1;
+ dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
+ uint64_t itor = 0;
+ blkptr_t bp;
+ uint64_t unique = 0;
+ int err;
+
+ ASSERT(csa->cds->ds_reserved == 0);
+ ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);
+
+ dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
+ dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
+ dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
+
+ /* evict whatever user data is attached to either dataset */
+ if (csa->cds->ds_user_ptr != NULL) {
+ csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
+ csa->cds->ds_user_ptr = NULL;
+ }
+
+ if (csa->ohds->ds_user_ptr != NULL) {
+ csa->ohds->ds_user_evict_func(csa->ohds,
+ csa->ohds->ds_user_ptr);
+ csa->ohds->ds_user_ptr = NULL;
+ }
+
+ /*
+ * compute unique space: blocks on the clone's deadlist that were
+ * born after the origin snapshot's previous snapshot
+ */
+ while ((err = bplist_iterate(&csa->cds->ds_deadlist,
+ &itor, &bp)) == 0) {
+ if (bp.blk_birth > csa->cds->ds_prev->ds_phys->ds_prev_snap_txg)
+ unique += bp_get_dasize(dp->dp_spa, &bp);
+ }
+ VERIFY(err == ENOENT);
+
+ /* reset origin's unique bytes */
+ csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique;
+
+ /* swap blkptrs */
+ {
+ blkptr_t tmp;
+ tmp = csa->ohds->ds_phys->ds_bp;
+ csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
+ csa->cds->ds_phys->ds_bp = tmp;
+ }
+
+ /*
+ * set dd_*_bytes: the delta is the clone's (used + deadlist) minus
+ * the origin head's, computed before the ds_*_bytes are swapped
+ */
+ {
+ int64_t dused, dcomp, duncomp;
+ uint64_t cdl_used, cdl_comp, cdl_uncomp;
+ uint64_t odl_used, odl_comp, odl_uncomp;
+
+ VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
+ &cdl_comp, &cdl_uncomp));
+ VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
+ &odl_comp, &odl_uncomp));
+ dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
+ (csa->ohds->ds_phys->ds_used_bytes + odl_used);
+ dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
+ (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
+ duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
+ cdl_uncomp -
+ (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+
+ dsl_dir_diduse_space(csa->ohds->ds_dir,
+ dused, dcomp, duncomp, tx);
+ dsl_dir_diduse_space(csa->cds->ds_dir,
+ -dused, -dcomp, -duncomp, tx);
+ }
+
+/* exchange two uint64_t lvalues; each argument is evaluated twice */
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
+ /* swap ds_*_bytes */
+ SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
+ csa->cds->ds_phys->ds_used_bytes);
+ SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
+ csa->cds->ds_phys->ds_compressed_bytes);
+ SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
+ csa->cds->ds_phys->ds_uncompressed_bytes);
+ SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
+ csa->cds->ds_phys->ds_unique_bytes);
+
+ /* apply any parent delta for change in unconsumed refreservation */
+ dsl_dir_diduse_space(csa->ohds->ds_dir, csa->unused_refres_delta,
+ 0, 0, tx);
+
+ /*
+ * swap deadlists: close both, exchange the object numbers, then
+ * re-open each dataset's deadlist on its new object
+ */
+ bplist_close(&csa->cds->ds_deadlist);
+ bplist_close(&csa->ohds->ds_deadlist);
+ SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
+ csa->cds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+ csa->cds->ds_phys->ds_deadlist_obj));
+ VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+ csa->ohds->ds_phys->ds_deadlist_obj));
+}
+
+/*
+ * Swap 'clone' with its origin head file system.
+ *
+ * Both datasets must be held with an open refcount of DS_REF_MAX
+ * (asserted below).  If 'force' is not set, the swap fails when the
+ * origin head has been modified since its last snapshot.  Returns the
+ * result of the clone swap sync task.
+ */
+int
+dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
+ boolean_t force)
+{
+ struct cloneswaparg csa;
+
+ ASSERT(clone->ds_open_refcount == DS_REF_MAX);
+ ASSERT(origin_head->ds_open_refcount == DS_REF_MAX);
+
+ /* unused_refres_delta is filled in by the check function */
+ csa.cds = clone;
+ csa.ohds = origin_head;
+ csa.force = force;
+ return (dsl_sync_task_do(clone->ds_dir->dd_pool,
+ dsl_dataset_clone_swap_check,
+ dsl_dataset_clone_swap_sync, &csa, NULL, 9));
+}
+
+/*
+ * Translate a dataset object number into a dataset name.
+ *
+ * 'pname' is the pool to look in, 'obj' the dataset object number, and
+ * 'buf' receives the name.  Returns 0 on success, or the error from
+ * opening the pool or the dataset, in which case 'buf' is not written.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+	spa_t *spa;
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds = NULL;
+	int error;
+
+	error = spa_open(pname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	dp = spa_get_dsl(spa);
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	error = dsl_dataset_open_obj(dp, obj,
+	    NULL, DS_MODE_NONE, FTAG, &ds);
+	if (error == 0) {
+		dsl_dataset_name(ds, buf);
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+	}
+	rw_exit(&dp->dp_config_rwlock);
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
+/*
+ * Account for the dataset's refreservation and check its refquota for a
+ * pending allocation of 'asize' bytes, with 'inflight' bytes already
+ * pending.
+ *
+ * If the refreservation exceeds the dataset's unique bytes, the
+ * unconsumed part is subtracted from *used and *ref_rsrv is set to the
+ * portion of 'asize' that will come out of it; otherwise *ref_rsrv is 0.
+ *
+ * Returns 0 if 'check_quota' is false, no refquota is set, or the
+ * estimate is under quota; ERESTART if the estimate is over quota but
+ * the caller may retry; EDQUOT if the on-disk usage is over quota with
+ * nothing in flight.
+ */
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used,
+ uint64_t *ref_rsrv)
+{
+ int error = 0;
+
+ ASSERT3S(asize, >, 0);
+
+ /*
+ * *ref_rsrv is the portion of asize that will come from any
+ * unconsumed refreservation space.
+ */
+ *ref_rsrv = 0;
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Make a space adjustment for reserved bytes.
+ */
+ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+ ASSERT3U(*used, >=,
+ ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+ *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+ *ref_rsrv =
+ asize - MIN(asize, parent_delta(ds, asize + inflight));
+ }
+
+ /* nothing more to do unless a refquota is being enforced */
+ if (!check_quota || ds->ds_quota == 0) {
+ mutex_exit(&ds->ds_lock);
+ return (0);
+ }
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
+ if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
+ if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
+ error = ERESTART;
+ else
+ error = EDQUOT;
+ }
+ mutex_exit(&ds->ds_lock);
+
+ return (error);
+}
+
+/*
+ * Check phase for setting a refquota: arg1 is the dataset, arg2 points
+ * at the new quota.  Returns ENOTSUP on pools older than
+ * SPA_VERSION_REFQUOTA, and ENOSPC if a non-zero quota is smaller than
+ * either the bytes currently used or the refreservation.  A quota of 0
+ * is always accepted.
+ */
+/* ARGSUSED */
+static int
+dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	uint64_t new_quota = *(uint64_t *)arg2;
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
+		return (ENOTSUP);
+
+	if (new_quota != 0 &&
+	    (new_quota < ds->ds_phys->ds_used_bytes ||
+	    new_quota < ds->ds_reserved)) {
+		return (ENOSPC);
+	}
+
+	return (0);
+}
+
+/*
+ * Sync phase for setting a refquota: arg1 is the dataset, arg2 points
+ * at the new quota.  The in-core ds_quota is updated under ds_lock, the
+ * "refquota" property is set on the containing dsl_dir, and a history
+ * record is logged.
+ */
+/* ARGSUSED */
+void
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_quota = new_quota;
+ mutex_exit(&ds->ds_lock);
+
+ dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+
+ /* the record names the head dataset of the dir, not ds->ds_object */
+ spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
+ tx, cr, "%lld dataset = %llu ",
+ (longlong_t)new_quota, ds->ds_dir->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Set the refquota of the dataset called 'dsname' to 'quota'.  Nothing
+ * is done if the dataset already has that quota.  Returns 0 on success
+ * or the error from opening the dataset or from the sync task.
+ */
+int
+dsl_dataset_set_quota(const char *dsname, uint64_t quota)
+{
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (ds->ds_quota != quota) {
+		dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+		/*
+		 * If someone removes a file, then tries to set the quota, we
+		 * want to make sure the file freeing takes effect.
+		 */
+		txg_wait_open(dp, 0);
+
+		error = dsl_sync_task_do(dp,
+		    dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
+		    ds, &quota, 0);
+	}
+
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	return (error);
+}
+
+/*
+ * Check phase for setting a refreservation: arg1 is the dataset, arg2
+ * points at the new reservation.  Returns EOVERFLOW if the value does
+ * not fit in an int64_t, ENOTSUP on pools older than
+ * SPA_VERSION_REFRESERVATION, EINVAL for snapshots, and, in syncing
+ * context only, ENOSPC if the increase cannot be satisfied from the
+ * available space or the new reservation exceeds the refquota.
+ */
+static int
+dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	uint64_t new_reservation = *(uint64_t *)arg2;
+	uint64_t unique;
+	int64_t delta;
+
+	if (new_reservation > INT64_MAX)
+		return (EOVERFLOW);
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+	    SPA_VERSION_REFRESERVATION)
+		return (ENOTSUP);
+
+	if (dsl_dataset_is_snapshot(ds))
+		return (EINVAL);
+
+	/*
+	 * If we are doing the preliminary check in open context, the
+	 * space estimates may be inaccurate.
+	 */
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	mutex_enter(&ds->ds_lock);
+	unique = dsl_dataset_unique(ds);
+	delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
+	mutex_exit(&ds->ds_lock);
+
+	/* Only an increase can run out of space or collide with the quota. */
+	if (delta > 0 &&
+	    (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE) ||
+	    (ds->ds_quota > 0 && new_reservation > ds->ds_quota))) {
+		return (ENOSPC);
+	}
+
+	return (0);
+}
+
+/*
+ * Sync phase for setting a refreservation: arg1 is the dataset, arg2
+ * points at the new reservation.  The in-core ds_reserved is updated
+ * under ds_lock, the "refreservation" property is set on the containing
+ * dsl_dir, the dir is charged (or credited) with the change in
+ * unconsumed reservation, and a history record is logged.
+ */
+/* ARGSUSED */
+static void
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
+ dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ uint64_t unique;
+ int64_t delta;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ mutex_enter(&ds->ds_lock);
+ unique = dsl_dataset_unique(ds);
+ /*
+ * delta is the new unconsumed reservation minus the old one; each
+ * is the reservation in excess of the unique bytes, floored at 0
+ */
+ delta = MAX(0, (int64_t)(new_reservation - unique)) -
+ MAX(0, (int64_t)(ds->ds_reserved - unique));
+ ds->ds_reserved = new_reservation;
+ mutex_exit(&ds->ds_lock);
+
+ dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
+ new_reservation, cr, tx);
+
+ dsl_dir_diduse_space(ds->ds_dir, delta, 0, 0, tx);
+
+ /* the record names the head dataset of the dir, not ds->ds_object */
+ spa_history_internal_log(LOG_DS_REFRESERV,
+ ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
+ (longlong_t)new_reservation,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Set the refreservation of the dataset called 'dsname' to
+ * 'reservation'.  Returns 0 on success or the error from opening the
+ * dataset or from the sync task.
+ */
+int
+dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
+{
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds);
+	if (error == 0) {
+		error = dsl_sync_task_do(ds->ds_dir->dd_pool,
+		    dsl_dataset_set_reservation_check,
+		    dsl_dataset_set_reservation_sync, ds, &reservation, 0);
+		dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	}
+
+	return (error);
+}
diff --git a/zfs/lib/libzpool/dsl_deleg.c b/zfs/lib/libzpool/dsl_deleg.c
new file mode 100644
index 000000000..bb386c13a
--- /dev/null
+++ b/zfs/lib/libzpool/dsl_deleg.c
@@ -0,0 +1,744 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * DSL permissions are stored in a two level zap attribute
+ * mechanism. The first level identifies the "class" of
+ * entry. The class is identified by the first 2 letters of
+ * the attribute. The second letter "l" or "d" identifies whether
+ * it is a local or descendent permission. The first letter
+ * identifies the type of entry.
+ *
+ * ul$<id> identifies permissions granted locally for this userid.
+ * ud$<id> identifies permissions granted on descendent datasets for
+ * this userid.
+ * Ul$<id> identifies permission sets granted locally for this userid.
+ * Ud$<id> identifies permission sets granted on descendent datasets for
+ * this userid.
+ * gl$<id> identifies permissions granted locally for this groupid.
+ * gd$<id> identifies permissions granted on descendent datasets for
+ * this groupid.
+ * Gl$<id> identifies permission sets granted locally for this groupid.
+ * Gd$<id> identifies permission sets granted on descendent datasets for
+ * this groupid.
+ * el$ identifies permissions granted locally for everyone.
+ * ed$ identifies permissions granted on descendent datasets
+ * for everyone.
+ * El$ identifies permission sets granted locally for everyone.
+ * Ed$ identifies permission sets granted to descendent datasets for
+ * everyone.
+ * c-$ identifies permission to create at dataset creation time.
+ * C-$ identifies permission sets to grant locally at dataset creation
+ * time.
+ * s-$@<name> permissions defined in specified set @<name>
+ * S-$@<name> Sets defined in named set @<name>
+ *
+ * Each of the above entities points to another zap attribute that contains one
+ * attribute for each allowed permission, such as create, destroy,...
+ * All of the "upper" case class types will specify permission set names
+ * rather than permissions.
+ *
+ * Basically it looks something like this:
+ * ul$12 -> ZAP OBJ -> permissions...
+ *
+ * The ZAP OBJ is referred to as the jump object.
+ */
+
+#pragma ident "@(#)dsl_deleg.c 1.5 07/10/29 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+
+#include "zfs_deleg.h"
+
+/*
+ * Check whether the caller may delegate every permission named in nvp.
+ *
+ * nvp maps each who-key to an nvlist whose pair names are the permissions
+ * being handed out.  The caller needs "allow" on ddname plus each of those
+ * permissions (e.g. to delegate "create" you must hold both "create" and
+ * "allow"), and "allow" itself may never be delegated.
+ *
+ * Returns 0 if permitted, EPERM on an attempt to delegate "allow", or the
+ * error from dsl_deleg_access().
+ */
+int
+dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+	nvpair_t *who;
+	int err;
+
+	err = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr);
+	if (err != 0)
+		return (err);
+
+	for (who = nvlist_next_nvpair(nvp, NULL); who != NULL;
+	    who = nvlist_next_nvpair(nvp, who)) {
+		nvlist_t *permlist;
+		nvpair_t *pp;
+
+		VERIFY(nvpair_value_nvlist(who, &permlist) == 0);
+
+		for (pp = nvlist_next_nvpair(permlist, NULL); pp != NULL;
+		    pp = nvlist_next_nvpair(permlist, pp)) {
+			const char *name = nvpair_name(pp);
+
+			/* "allow" is never itself delegatable */
+			if (strcmp(name, ZFS_DELEG_PERM_ALLOW) == 0)
+				return (EPERM);
+
+			err = dsl_deleg_access(ddname, name, cr);
+			if (err != 0)
+				return (err);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Check whether the caller may revoke the permissions described by nvp.
+ *
+ * The caller must hold "allow" on ddname, and every who-key in nvp must be
+ * a user or user-sets entry whose id (the text starting at offset 3 of the
+ * key) equals the caller's own uid.
+ *
+ * Returns 0 if permitted, EPERM for any other who-key, or the error from
+ * dsl_deleg_access().
+ */
+int
+dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+	char myid[32];
+	nvpair_t *who;
+	int err;
+
+	err = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr);
+	if (err != 0)
+		return (err);
+
+	(void) snprintf(myid, sizeof (myid), "%lld",
+	    (longlong_t)crgetuid(cr));
+
+	for (who = nvlist_next_nvpair(nvp, NULL); who != NULL;
+	    who = nvlist_next_nvpair(nvp, who)) {
+		const char *key = nvpair_name(who);
+		zfs_deleg_who_type_t kind = key[0];
+
+		/* only user ("u") and user-sets ("U") entries qualify */
+		if (kind != ZFS_DELEG_USER && kind != ZFS_DELEG_USER_SETS)
+			return (EPERM);
+
+		/* and only when the id part names the caller */
+		if (strcmp(myid, &key[3]) != 0)
+			return (EPERM);
+	}
+
+	return (0);
+}
+
+/*
+ * Sync-task callback that grants the permissions described by an nvlist.
+ *
+ * arg1 is the dsl_dir_t and arg2 is an nvlist mapping each who-key to an
+ * nvlist whose pair names are permissions. The directory's delegation ZAP
+ * object and each who-key's jump object are created on demand, and every
+ * permission is stored in the jump object as an attribute with value 0.
+ * Each permission written is also recorded in the pool history.
+ */
+static void
+dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ nvlist_t *nvp = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ /* first grant on this dir: create its delegation ZAP object */
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+ /* any lookup failure is treated as "no jump object yet" */
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(zap_update(mos, zapobj,
+ whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ /* the attribute's presence is the grant; its value is 0 */
+ VERIFY(zap_update(mos, jumpobj,
+ perm, 8, 1, &n, tx) == 0);
+ spa_history_internal_log(LOG_DS_PERM_UPDATE,
+ dd->dd_pool->dp_spa, tx, cr,
+ "%s %s dataset = %llu", whokey, perm,
+ dd->dd_phys->dd_head_dataset_obj);
+ }
+ }
+}
+
+/*
+ * Sync-task callback that revokes the permissions described by an nvlist.
+ *
+ * arg1 is the dsl_dir_t and arg2 is an nvlist keyed by who-key. A pair
+ * whose value is not an nvlist removes the who-key and destroys its jump
+ * object outright; otherwise each named permission is removed from the
+ * jump object, and the jump object is destroyed once it is empty.
+ * Nothing is done if the directory has no delegation ZAP object.
+ */
+static void
+dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ nvlist_t *nvp = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ if (zapobj == 0)
+ return;
+
+ while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ /* value is not an nvlist: drop everything for this who-key */
+ if (nvpair_value_nvlist(whopair, &perms) != 0) {
+ if (zap_lookup(mos, zapobj, whokey, 8,
+ 1, &jumpobj) == 0) {
+ (void) zap_remove(mos, zapobj, whokey, tx);
+ VERIFY(0 == zap_destroy(mos, jumpobj, tx));
+ }
+ /* logged even when the who-key was not present */
+ spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
+ dd->dd_pool->dp_spa, tx, cr,
+ "%s dataset = %llu", whokey,
+ dd->dd_phys->dd_head_dataset_obj);
+ continue;
+ }
+
+ /* nothing granted to this who-key here; nothing to revoke */
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
+ continue;
+
+ /*
+ * NOTE(review): once the jump object has been destroyed below,
+ * the loop still calls zap_remove()/zap_count() on it for any
+ * remaining permissions. The results are ignored or checked
+ * for success, so this looks harmless -- confirm.
+ */
+ while (permpair = nvlist_next_nvpair(perms, permpair)) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ (void) zap_remove(mos, jumpobj, perm, tx);
+ /* last permission gone: remove the who-key as well */
+ if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
+ (void) zap_remove(mos, zapobj,
+ whokey, tx);
+ VERIFY(0 == zap_destroy(mos,
+ jumpobj, tx));
+ }
+ spa_history_internal_log(LOG_DS_PERM_REMOVE,
+ dd->dd_pool->dp_spa, tx, cr,
+ "%s %s dataset = %llu", whokey, perm,
+ dd->dd_phys->dd_head_dataset_obj);
+ }
+ }
+}
+
+/*
+ * Grant (unset == B_FALSE) or revoke (unset != B_FALSE) the delegated
+ * permissions described by nvp on the directory named ddname.
+ *
+ * The work is done by dsl_deleg_set_sync() or dsl_deleg_unset_sync() as a
+ * sync task; the number of who-keys in nvp is passed as the task's
+ * blocks-modified estimate.
+ *
+ * Returns 0 on success, ENOTSUP if the pool version predates delegated
+ * permissions, or the error from dsl_dir_open() / dsl_sync_task_do().
+ */
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+	dsl_dir_t *dir;
+	nvpair_t *pair;
+	int nwho = 0;
+	int err;
+
+	err = dsl_dir_open(ddname, FTAG, &dir, NULL);
+	if (err != 0)
+		return (err);
+
+	if (spa_version(dmu_objset_spa(dir->dd_pool->dp_meta_objset)) >=
+	    SPA_VERSION_DELEGATED_PERMS) {
+		/* one modified block is budgeted per who-key */
+		for (pair = nvlist_next_nvpair(nvp, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(nvp, pair))
+			nwho++;
+
+		err = dsl_sync_task_do(dir->dd_pool, NULL,
+		    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+		    dir, nvp, nwho);
+	} else {
+		err = ENOTSUP;
+	}
+
+	dsl_dir_close(dir, FTAG);
+
+	return (err);
+}
+
+/*
+ * Find all 'allow' permissions from a given point and then continue
+ * traversing up to the root.
+ *
+ * This function constructs an nvlist of nvlists: each setpoint (source
+ * fsname) maps to an nvlist keyed by who-key (users/groups/everyone/
+ * create), and each who-key maps to an nvlist of the individual
+ * permissions.
+ *
+ * The nvlist will look like this.
+ *
+ * { source fsname -> { whokeys { permissions,...}, ...}}
+ *
+ * The fsname nvpairs will be arranged in a bottom up order. For example,
+ * if we have the following structure a/b/c then the nvpairs for the fsnames
+ * will be ordered a/b/c, a/b, a.
+ *
+ * Directories with no delegation ZAP object, or an empty one, are skipped.
+ * Returns 0 on success, with the list (allocated here) stored in *nvp, or
+ * the error from dsl_dir_open().
+ */
+int
+dsl_deleg_get(const char *ddname, nvlist_t **nvp)
+{
+ dsl_dir_t *dd, *startdd;
+ dsl_pool_t *dp;
+ int error;
+ objset_t *mos;
+
+ error = dsl_dir_open(ddname, FTAG, &startdd, NULL);
+ if (error)
+ return (error);
+
+ dp = startdd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ /* walk from the named dir up through dd_parent to the root */
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
+ zap_cursor_t basezc;
+ zap_attribute_t baseza;
+ nvlist_t *sp_nvp;
+ uint64_t n;
+ char source[MAXNAMELEN];
+
+ /* only dirs with a non-empty delegation ZAP get a setpoint */
+ if (dd->dd_phys->dd_deleg_zapobj &&
+ (zap_count(mos, dd->dd_phys->dd_deleg_zapobj,
+ &n) == 0) && n) {
+ VERIFY(nvlist_alloc(&sp_nvp,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ } else {
+ continue;
+ }
+
+ /* outer cursor: one entry per who-key, value is the jump object */
+ for (zap_cursor_init(&basezc, mos,
+ dd->dd_phys->dd_deleg_zapobj);
+ zap_cursor_retrieve(&basezc, &baseza) == 0;
+ zap_cursor_advance(&basezc)) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ nvlist_t *perms_nvp;
+
+ ASSERT(baseza.za_integer_length == 8);
+ ASSERT(baseza.za_num_integers == 1);
+
+ VERIFY(nvlist_alloc(&perms_nvp,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ /* inner cursor: one entry per permission in the jump object */
+ for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY(nvlist_add_boolean(perms_nvp,
+ za.za_name) == 0);
+ }
+ zap_cursor_fini(&zc);
+ /* the local list is freed right after being added */
+ VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name,
+ perms_nvp) == 0);
+ nvlist_free(perms_nvp);
+ }
+
+ zap_cursor_fini(&basezc);
+
+ /* key this setpoint by the directory's full name */
+ dsl_dir_name(dd, source);
+ VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0);
+ nvlist_free(sp_nvp);
+ }
+ rw_exit(&dp->dp_config_rwlock);
+
+ dsl_dir_close(startdd, FTAG);
+ return (0);
+}
+
+/*
+ * Routines for dsl_deleg_access() -- access checking.
+ */
+/*
+ * Node in the AVL tree of permission-set names that dsl_deleg_access()
+ * builds while checking access; nodes are ordered by perm_set_compare().
+ */
+typedef struct perm_set {
+ avl_node_t p_node; /* AVL linkage; its offset is given to avl_create() */
+ boolean_t p_matched; /* set was already checked/expanded; skip it */
+ char p_setname[ZFS_MAX_DELEG_NAME]; /* set name, the tree's sort key */
+} perm_set_t;
+
+/*
+ * AVL comparator for perm_set_t nodes: orders them by p_setname using
+ * strcmp() and normalizes the result to -1, 0 or 1.
+ */
+static int
+perm_set_compare(const void *arg1, const void *arg2)
+{
+	const perm_set_t *lhs = arg1;
+	const perm_set_t *rhs = arg2;
+	int cmp = strcmp(lhs->p_setname, rhs->p_setname);
+
+	if (cmp > 0)
+		return (1);
+	if (cmp < 0)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Determine whether a specified permission exists.
+ *
+ * First the base attribute (the who-key, e.g. ul$12) is looked up in
+ * zapobj to find its jump object; the permission itself is then looked up
+ * in that jump object.
+ *
+ * Returns 0 if the permission exists, the error from the first lookup if
+ * the who-key cannot be found (ENOENT when it is absent), EPERM if the
+ * jump object has no such permission, or any other error from the second
+ * lookup.
+ */
+static int
+dsl_check_access(objset_t *mos, uint64_t zapobj,
+    char type, char checkflag, void *valp, const char *perm)
+{
+	char key[ZFS_MAX_DELEG_NAME];
+	uint64_t jump, unused;
+	int err;
+
+	zfs_deleg_whokey(key, type, checkflag, valp);
+
+	err = zap_lookup(mos, zapobj, key, 8, 1, &jump);
+	if (err != 0)
+		return (err);
+
+	/* who-key exists; a missing permission means "not permitted" */
+	err = zap_lookup(mos, jump, perm, 8, 1, &unused);
+	return (err == ENOENT ? EPERM : err);
+}
+
+/*
+ * Check whether the credential cr is granted perm by any direct entry in
+ * zapobj.  Entries are tried in this order: the user id, the primary
+ * group id, the "everyone" entry, then each supplemental group.
+ *
+ * Returns 0 as soon as one entry grants the permission, EPERM otherwise.
+ */
+static int
+dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
+    int checkflag, cred_t *cr)
+{
+	const gid_t *groups;
+	uint64_t who;
+	int ngroups;
+	int g;
+
+	/* the user itself */
+	who = crgetuid(cr);
+	if (dsl_check_access(mos, zapobj,
+	    ZFS_DELEG_USER, checkflag, &who, perm) == 0)
+		return (0);
+
+	/* the user's primary group */
+	who = crgetgid(cr);
+	if (dsl_check_access(mos, zapobj,
+	    ZFS_DELEG_GROUP, checkflag, &who, perm) == 0)
+		return (0);
+
+	/* the everyone entry */
+	who = -1;
+	if (dsl_check_access(mos, zapobj,
+	    ZFS_DELEG_EVERYONE, checkflag, &who, perm) == 0)
+		return (0);
+
+	/* every supplemental group the user belongs to */
+	ngroups = crgetngroups(cr);
+	groups = crgetgroups(cr);
+	g = 0;
+	while (g != ngroups) {
+		who = groups[g];
+		if (dsl_check_access(mos, zapobj,
+		    ZFS_DELEG_GROUP, checkflag, &who, perm) == 0)
+			return (0);
+		g++;
+	}
+
+	return (EPERM);
+}
+
+/*
+ * Iterate over the sets listed under one who-key of the specified zapobj
+ * and load them into the permsets avl tree.
+ *
+ * The who-key is built from type, checkflag and valp; its jump object is
+ * looked up in zapobj, and every attribute name found there is inserted
+ * into avl as an unmatched perm_set_t unless a node with that name already
+ * exists.
+ *
+ * Returns 0 on success or the error from the who-key lookup.
+ */
+static int
+dsl_load_sets(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, avl_tree_t *avl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ perm_set_t *permnode;
+ avl_index_t idx;
+ uint64_t jumpobj;
+ int error;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error != 0)
+ return (error);
+
+ for (zap_cursor_init(&zc, mos, jumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ /* build a candidate node first; it doubles as the search key */
+ permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
+ (void) strlcpy(permnode->p_setname, za.za_name,
+ sizeof (permnode->p_setname));
+ permnode->p_matched = B_FALSE;
+
+ if (avl_find(avl, permnode, &idx) == NULL) {
+ avl_insert(avl, permnode, idx);
+ } else {
+ /* set name already in the tree: discard the duplicate */
+ kmem_free(permnode, sizeof (perm_set_t));
+ }
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+/*
+ * Load every permission set that applies to the credential cr into avl:
+ * the sets granted to its user id, to its primary group, to everyone, and
+ * to each of its supplemental groups.
+ *
+ * Errors from dsl_load_sets() are deliberately ignored.
+ */
+static void
+dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
+ char checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids, i;
+ uint64_t id;
+
+ id = crgetuid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_USER_SETS, checkflag, &id, avl);
+
+ id = crgetgid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+
+ /* the everyone entry is looked up with a NULL id */
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
+
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+ }
+}
+
+/*
+ * Check if user has requested permission.
+ *
+ * Walks from the directory named by ddname up through its parents. The
+ * starting directory is checked with ZFS_DELEG_LOCAL and every ancestor
+ * with ZFS_DELEG_DESCENDENT. At each level the permission sets that apply
+ * to cr are loaded and expanded first, then the direct user/group/everyone
+ * entries are checked.
+ *
+ * Returns 0 if the permission is granted, ENOENT if ddname has a trailing
+ * component that is not a snapshot ('@...'), ECANCELED if delegation is
+ * turned off for the pool, EPERM if the pool version predates delegated
+ * permissions or no grant was found, or the error from dsl_dir_open().
+ */
+int
+dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr)
+{
+ dsl_dir_t *dd, *startdd;
+ dsl_pool_t *dp;
+ void *cookie;
+ int error;
+ char checkflag = ZFS_DELEG_LOCAL;
+ const char *tail;
+ objset_t *mos;
+ avl_tree_t permsets;
+ perm_set_t *setnode;
+
+ /*
+ * Use tail so that zfs_ioctl() code doesn't always have
+ * to figure out the parent name in order to do the
+ * access check, for example when renaming a snapshot.
+ */
+ error = dsl_dir_open(ddname, FTAG, &startdd, &tail);
+ if (error)
+ return (error);
+
+ /* an unresolved trailing component is only OK if it is a snapshot */
+ if (tail && tail[0] != '@') {
+ dsl_dir_close(startdd, FTAG);
+ return (ENOENT);
+ }
+ dp = startdd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ if (dsl_delegation_on(mos) == B_FALSE) {
+ dsl_dir_close(startdd, FTAG);
+ return (ECANCELED);
+ }
+
+ if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS) {
+ dsl_dir_close(startdd, FTAG);
+ return (EPERM);
+ }
+
+ avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
+ offsetof(perm_set_t, p_node));
+
+ /*
+ * permsets accumulates across levels: sets loaded at a child
+ * directory stay in the tree while its ancestors are examined.
+ */
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ for (dd = startdd; dd != NULL; dd = dd->dd_parent,
+ checkflag = ZFS_DELEG_DESCENDENT) {
+ uint64_t zapobj;
+ boolean_t expanded;
+
+ /*
+ * If not in global zone then make sure
+ * the zoned property is set
+ */
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+
+ if (dsl_prop_get_ds_locked(dd,
+ zfs_prop_to_name(ZFS_PROP_ZONED),
+ 8, 1, &zoned, NULL) != 0)
+ break;
+ if (!zoned)
+ break;
+ }
+ zapobj = dd->dd_phys->dd_deleg_zapobj;
+
+ /* no delegations recorded at this level; try the parent */
+ if (zapobj == 0)
+ continue;
+
+ dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
+again:
+ expanded = B_FALSE;
+ for (setnode = avl_first(&permsets); setnode;
+ setnode = AVL_NEXT(&permsets, setnode)) {
+ if (setnode->p_matched == B_TRUE)
+ continue;
+
+ /* See if this set directly grants this permission */
+ error = dsl_check_access(mos, zapobj,
+ ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
+ if (error == 0)
+ goto success;
+ /* set is defined here but lacks perm: don't recheck it */
+ if (error == EPERM)
+ setnode->p_matched = B_TRUE;
+
+ /* See if this set includes other sets */
+ error = dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_NAMED_SET_SETS, 0,
+ setnode->p_setname, &permsets);
+ if (error == 0)
+ setnode->p_matched = expanded = B_TRUE;
+ }
+ /*
+ * If we expanded any sets, that will define more sets,
+ * which we need to check.
+ */
+ if (expanded)
+ goto again;
+
+ error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
+ if (error == 0)
+ goto success;
+ }
+ /* reached by loop exhaustion or break; "goto success" skips this */
+ error = EPERM;
+success:
+ /* common cleanup for both the granted and the denied outcome */
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dir_close(startdd, FTAG);
+
+ cookie = NULL;
+ while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
+ kmem_free(setnode, sizeof (perm_set_t));
+
+ return (error);
+}
+
+/*
+ * Other routines.
+ */
+
+/*
+ * Copy the create-time permissions (dosets == B_FALSE) or create-time
+ * permission sets (dosets != B_FALSE) recorded in the delegation ZAP
+ * pzapobj into dd, as local user (or user-sets) entries for uid.
+ *
+ * Nothing is done if pzapobj has no matching create-time entry. dd's
+ * delegation ZAP object and the user's jump object are created on demand.
+ */
+static void
+copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
+ boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t jumpobj, pjumpobj;
+ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ /* source: the create-time entry in the ancestor's ZAP */
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
+ ZFS_DELEG_LOCAL, NULL);
+ if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
+ return;
+
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ /* destination: the local user entry for uid in dd's own ZAP */
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
+ ZFS_DELEG_LOCAL, &uid);
+ /*
+ * NOTE(review): if this lookup fails with an error other than
+ * ENOENT, jumpobj is left uninitialized and is still used by the
+ * zap_update() calls below -- confirm that cannot happen here.
+ */
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ /* copy each attribute name across; the stored value is always 0 */
+ for (zap_cursor_init(&zc, mos, pjumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zero = 0;
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+
+ VERIFY(zap_update(mos, jumpobj, za.za_name,
+ 8, 1, &zero, tx) == 0);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Apply all create-time permissions to the newly created directory sdd.
+ *
+ * Every ancestor of sdd that has a delegation ZAP object is visited, and
+ * its create-time permissions and permission sets are copied onto sdd for
+ * the uid of cr.  Nothing is done on pools whose version predates
+ * delegated permissions.
+ */
+void
+dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
+{
+	uint64_t creator = crgetuid(cr);
+	dsl_dir_t *ancestor;
+
+	if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
+	    SPA_VERSION_DELEGATED_PERMS)
+		return;
+
+	ancestor = sdd->dd_parent;
+	while (ancestor != NULL) {
+		uint64_t azap = ancestor->dd_phys->dd_deleg_zapobj;
+
+		if (azap != 0) {
+			/* plain permissions first, then permission sets */
+			copy_create_perms(sdd, azap, B_FALSE, creator, tx);
+			copy_create_perms(sdd, azap, B_TRUE, creator, tx);
+		}
+		ancestor = ancestor->dd_parent;
+	}
+}
+
+/*
+ * Destroy the delegation ZAP object zapobj together with every jump
+ * object it refers to.  A zapobj of 0 means there is nothing to destroy.
+ *
+ * Always returns 0; a failure to destroy any object trips VERIFY.
+ */
+int
+dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
+{
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+
+	if (zapobj != 0) {
+		zap_cursor_init(&cursor, mos, zapobj);
+		while (zap_cursor_retrieve(&cursor, &attr) == 0) {
+			ASSERT(attr.za_integer_length == 8 &&
+			    attr.za_num_integers == 1);
+			/* each entry's value is the jump object's number */
+			VERIFY(0 == zap_destroy(mos,
+			    attr.za_first_integer, tx));
+			zap_cursor_advance(&cursor);
+		}
+		zap_cursor_fini(&cursor);
+
+		VERIFY(0 == zap_destroy(mos, zapobj, tx));
+	}
+
+	return (0);
+}
+
+/*
+ * Report the spa_delegation setting of the pool that objset os belongs to.
+ */
+boolean_t
+dsl_delegation_on(objset_t *os)
+{
+	spa_t *spa = os->os->os_spa;
+
+	return (spa->spa_delegation);
+}
diff --git a/zfs/lib/libzpool/dsl_dir.c b/zfs/lib/libzpool/dsl_dir.c
new file mode 100644
index 000000000..fac1664c3
--- /dev/null
+++ b/zfs/lib/libzpool/dsl_dir.c
@@ -0,0 +1,1269 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dsl_dir.c 1.25 08/03/25 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/sunddi.h>
+#include "zfs_namecheck.h"
+
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
+ cred_t *cr, dmu_tx_t *tx);
+
+
+/*
+ * Eviction callback for the in-core dsl_dir_t (arg); it is registered with
+ * dmu_buf_set_user_ie() in dsl_dir_open_obj().
+ *
+ * Drops the hold on the parent directory and the spa reference that were
+ * taken with the dsl_dir_t itself as tag, then frees the structure.
+ */
+/* ARGSUSED */
+static void
+dsl_dir_evict(dmu_buf_t *db, void *arg)
+{
+ dsl_dir_t *dd = arg;
+ dsl_pool_t *dp = dd->dd_pool;
+ int t;
+
+ /* the dir must be clean in every txg slot before it goes away */
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
+
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+
+ spa_close(dd->dd_pool->dp_spa, dd);
+
+ /*
+ * The props callback list should be empty since they hold the
+ * dir open.
+ */
+ list_destroy(&dd->dd_prop_cbs);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
+/*
+ * Open the dsl_dir_t for object ddobj in pool dp, holding it with tag.
+ *
+ * The dsl_dir_t is kept as the user data of the object's bonus buffer. If
+ * none is attached yet, one is built here: the parent is opened
+ * recursively (with the new dsl_dir_t as tag) and dd_myname is filled in
+ * from tail, from a value search of the parent's child ZAP when tail is
+ * NULL, or from the pool name for a dir with no parent. If
+ * dmu_buf_set_user_ie() returns an already attached dsl_dir_t (the
+ * "winner"), the one built here is discarded and the winner is used.
+ *
+ * The caller must hold dp_config_rwlock or be in sync context (asserted).
+ * Returns 0 with the dir stored in *ddp, or the error from
+ * dmu_bonus_hold(), from opening the parent, or from resolving the name.
+ */
+int
+dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **ddp)
+{
+ dmu_buf_t *dbuf;
+ dsl_dir_t *dd;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err)
+ return (err);
+ dd = dmu_buf_get_user(dbuf);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+ }
+#endif
+ /* XXX assert bonus buffer size is correct */
+ if (dd == NULL) {
+ dsl_dir_t *winner;
+ /* NOTE(review): this err shadows the function-level err above */
+ int err;
+
+ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+ dd->dd_object = ddobj;
+ dd->dd_dbuf = dbuf;
+ dd->dd_pool = dp;
+ dd->dd_phys = dbuf->db_data;
+ dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
+ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_node));
+
+ if (dd->dd_phys->dd_parent_obj) {
+ /* the parent is held with the new dsl_dir_t as its tag */
+ err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+ NULL, dd, &dd->dd_parent);
+ if (err) {
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ if (tail) {
+#ifdef ZFS_DEBUG
+ uint64_t foundobj;
+
+ /* debug only: cross-check tail against the parent's ZAP */
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ tail, sizeof (foundobj), 1, &foundobj);
+ ASSERT(err || foundobj == ddobj);
+#endif
+ (void) strcpy(dd->dd_myname, tail);
+ } else {
+ /* no name supplied: search the parent's ZAP by value */
+ err = zap_value_search(dp->dp_meta_objset,
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ ddobj, 0, dd->dd_myname);
+ }
+ if (err) {
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ } else {
+ /* no parent object: this dir is named after the pool */
+ (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
+ }
+
+ winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
+ dsl_dir_evict);
+ if (winner) {
+ /* a dsl_dir_t was already attached: discard ours */
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dd = winner;
+ } else {
+ /* instantiate-to-evict hold; dropped in dsl_dir_evict() */
+ spa_open_ref(dp->dp_spa, dd);
+ }
+ }
+
+ /*
+ * The dsl_dir_t has both open-to-close and instantiate-to-evict
+ * holds on the spa. We need the open-to-close holds because
+ * otherwise the spa_refcnt wouldn't change when we open a
+ * dir which the spa also has open, so we could incorrectly
+ * think it was OK to unload/export/destroy the pool. We need
+ * the instantiate-to-evict hold because the dsl_dir_t has a
+ * pointer to the dd_pool, which has a pointer to the spa_t.
+ */
+ spa_open_ref(dp->dp_spa, tag);
+ ASSERT3P(dd->dd_pool, ==, dp);
+ ASSERT3U(dd->dd_object, ==, ddobj);
+ ASSERT3P(dd->dd_dbuf, ==, dbuf);
+ *ddp = dd;
+ return (0);
+}
+
+/*
+ * Release a hold taken by dsl_dir_open_obj(): drop the spa reference and
+ * then the bonus-buffer hold, both identified by tag.
+ */
+void
+dsl_dir_close(dsl_dir_t *dd, void *tag)
+{
+	spa_t *spa;
+
+	dprintf_dd(dd, "%s\n", "");
+
+	spa = dd->dd_pool->dp_spa;
+	spa_close(spa, tag);
+	dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/*
+ * Write the full name of dd into buf: the names of all of its ancestors
+ * and its own dd_myname, joined by "/".
+ *
+ * buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do)
+ */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+	int need_lock;
+
+	if (dd->dd_parent == NULL) {
+		buf[0] = '\0';
+	} else {
+		dsl_dir_name(dd->dd_parent, buf);
+		(void) strcat(buf, "/");
+	}
+
+	/*
+	 * Behave like a recursive mutex: take dd_lock only if the caller
+	 * does not already hold it, so that dprintf_dd() can be used with
+	 * dd_lock held.
+	 */
+	need_lock = !MUTEX_HELD(&dd->dd_lock);
+	if (need_lock)
+		mutex_enter(&dd->dd_lock);
+	(void) strcat(buf, dd->dd_myname);
+	if (need_lock)
+		mutex_exit(&dd->dd_lock);
+}
+
+/*
+ * Calculate the length of the name dsl_dir_name() would produce for dd,
+ * avoiding all of its strcat calls.
+ */
+int
+dsl_dir_namelen(dsl_dir_t *dd)
+{
+	int len = 0;
+	int need_lock;
+
+	/* the parent's name, plus 1 for the "/" that follows it */
+	if (dd->dd_parent != NULL)
+		len = dsl_dir_namelen(dd->dd_parent) + 1;
+
+	/* take dd_lock only if not already held; see dsl_dir_name */
+	need_lock = !MUTEX_HELD(&dd->dd_lock);
+	if (need_lock)
+		mutex_enter(&dd->dd_lock);
+	len += strlen(dd->dd_myname);
+	if (need_lock)
+		mutex_exit(&dd->dd_lock);
+
+	return (len);
+}
+
+/*
+ * Return TRUE if dd or any of its ancestors has a name for which
+ * dataset_name_hidden() is true, FALSE otherwise.
+ */
+int
+dsl_dir_is_private(dsl_dir_t *dd)
+{
+	int via_parent;
+	int via_name;
+
+	via_parent = (dd->dd_parent != NULL &&
+	    dsl_dir_is_private(dd->dd_parent));
+	via_name = dataset_name_hidden(dd->dd_myname);
+
+	return ((via_parent || via_name) ? TRUE : FALSE);
+}
+
+
+/*
+ * Split the leading component off a dataset path.
+ *
+ * The text up to the first '/' or '@' is copied into component and *nextp
+ * is set to what follows: the character after a '/', or the '@' itself.
+ * If path has no separator, or starts with '@' (a snapshot name), the
+ * whole of path is copied and *nextp is set to NULL.
+ *
+ * Returns 0 on success, ENOENT if path is NULL, EINVAL for a malformed
+ * path (two separators in a row, a leading '/', a lone or repeated '@',
+ * or a '/' where an '@' separator was found first), and ENAMETOOLONG if
+ * the component does not fit in MAXNAMELEN.  *nextp is left untouched on
+ * error.
+ */
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+	char *sep;
+	size_t len;
+
+	if (path == NULL)
+		return (ENOENT);
+
+	/* This would be a good place to reserve some namespace... */
+	sep = strpbrk(path, "/@");
+
+	/* two separators in a row */
+	if (sep != NULL && (sep[1] == '/' || sep[1] == '@'))
+		return (EINVAL);
+
+	if (sep == NULL || sep == path) {
+		/*
+		 * The whole string is the component.  If it starts with a
+		 * separator, that had better be an @ with something after
+		 * it and no further ats or slashes.
+		 */
+		if (sep != NULL) {
+			if (sep[0] != '@' || sep[1] == '\0' ||
+			    strpbrk(path + 1, "/@") != NULL)
+				return (EINVAL);
+		}
+		if (strlen(path) >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strcpy(component, path);
+		*nextp = NULL;
+		return (0);
+	}
+
+	/*
+	 * sep points at a '/' or '@' strictly inside path.  If it is an @,
+	 * there had better not be any slashes at all.
+	 */
+	if (sep[0] == '@' && strchr(path, '/') != NULL)
+		return (EINVAL);
+
+	len = sep - path;
+	if (len >= MAXNAMELEN)
+		return (ENAMETOOLONG);
+	(void) strncpy(component, path, len);
+	component[len] = '\0';
+
+	/* step over a '/', but leave an '@' for the caller to see */
+	*nextp = (sep[0] == '/') ? sep + 1 : sep;
+	return (0);
+}
+
+/*
+ * Same as dsl_dir_open(), except that when spa is non-NULL the first
+ * component of name is ignored and the given spa is used instead of
+ * opening the pool by name.
+ *
+ * The path is walked one component at a time from the pool's root
+ * directory.  The walk stops at a snapshot component ('@...') or at the
+ * first component with no entry in the current child-dir ZAP.  What is
+ * left over is reported through *tailp (NULL if nothing is left over).
+ *
+ * Returns 0 with the directory, held with tag, stored in *ddp.  On error
+ * no hold is left behind and *ddp is not modified: ENOENT if more than one
+ * component is left over, or any is left over while tailp is NULL;
+ * otherwise the error from parsing the name, opening the pool, or a
+ * lookup.
+ */
+int
+dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+    dsl_dir_t **ddp, const char **tailp)
+{
+	char buf[MAXNAMELEN];
+	const char *next, *nextnext = NULL;
+	int err;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp;
+	uint64_t ddobj;
+	int openedspa = FALSE;
+
+	dprintf("%s\n", name);
+
+	/* the first component is the pool name */
+	err = getcomponent(name, buf, &next);
+	if (err)
+		return (err);
+	if (spa == NULL) {
+		err = spa_open(buf, &spa, FTAG);
+		if (err) {
+			dprintf("spa_open(%s) failed\n", buf);
+			return (err);
+		}
+		openedspa = TRUE;
+
+		/* XXX this assertion belongs in spa_open */
+		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
+	}
+
+	dp = spa_get_dsl(spa);
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+	if (err) {
+		rw_exit(&dp->dp_config_rwlock);
+		if (openedspa)
+			spa_close(spa, FTAG);
+		return (err);
+	}
+
+	/* descend one component at a time, holding only the current dir */
+	while (next != NULL) {
+		dsl_dir_t *child_ds;
+		err = getcomponent(next, buf, &nextnext);
+		if (err)
+			break;
+		ASSERT(next[0] != '\0');
+		/* a snapshot name ends the directory walk */
+		if (next[0] == '@')
+			break;
+		dprintf("looking up %s in obj%lld\n",
+		    buf, dd->dd_phys->dd_child_dir_zapobj);
+
+		err = zap_lookup(dp->dp_meta_objset,
+		    dd->dd_phys->dd_child_dir_zapobj,
+		    buf, sizeof (ddobj), 1, &ddobj);
+		if (err) {
+			/* a missing child is not an error; it becomes the tail */
+			if (err == ENOENT)
+				err = 0;
+			break;
+		}
+
+		err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
+		if (err)
+			break;
+		dsl_dir_close(dd, tag);
+		dd = child_ds;
+		next = nextnext;
+	}
+	rw_exit(&dp->dp_config_rwlock);
+
+	if (err) {
+		dsl_dir_close(dd, tag);
+		if (openedspa)
+			spa_close(spa, FTAG);
+		return (err);
+	}
+
+	/*
+	 * It's an error if there's more than one component left, or
+	 * tailp==NULL and there's any component left.
+	 */
+	if (next != NULL &&
+	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+		/* bad path name */
+		dsl_dir_close(dd, tag);
+		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+		err = ENOENT;
+	}
+	if (tailp)
+		*tailp = next;
+	if (openedspa)
+		spa_close(spa, FTAG);
+	/*
+	 * In the bad-path case dd has already been closed above, so only
+	 * hand it out on success; never give the caller a released dir.
+	 */
+	if (err == 0)
+		*ddp = dd;
+	return (err);
+}
+
+/*
+ * Open the dsl_dir_t named by name, opening the pool by its name (the
+ * first component).
+ *
+ * On success 0 is returned and the directory, held with tag, is stored in
+ * *ddp.  If tailp is non-NULL, *tailp receives the last component which
+ * couldn't be found (NULL if the whole name was resolved); (*tailp)[0] ==
+ * '@' means that the last component is a snapshot.  An error is returned
+ * if the path is bogus, or if tailp == NULL and the whole name couldn't be
+ * resolved.
+ */
+int
+dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
+{
+	spa_t *nospa = NULL;
+
+	return (dsl_dir_open_spa(nospa, name, tag, ddp, tailp));
+}
+
+/*
+ * Create a new dsl_dir object named name under the parent directory pds,
+ * as part of transaction tx, and return its object number.
+ *
+ * The new object is entered in the parent's child-dir ZAP, and its bonus
+ * buffer (a dsl_dir_phys_t) is given a creation time, the parent's object
+ * number, and freshly created props and child-dir ZAP objects.
+ */
+uint64_t
+dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+{
+ objset_t *mos = pds->dd_pool->dp_meta_objset;
+ uint64_t ddobj;
+ dsl_dir_phys_t *dsphys;
+ dmu_buf_t *dbuf;
+
+ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+ /* zap_add() fails, and VERIFY trips, if the name already exists */
+ VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+
+ dsphys->dd_creation_time = gethrestime_sec();
+ dsphys->dd_parent_obj = pds->dd_object;
+ dsphys->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ dsphys->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ dmu_buf_rele(dbuf, FTAG);
+
+ return (ddobj);
+}
+
+/*
+ * Sync-task check callback: decide whether the directory arg1 may be
+ * destroyed.
+ *
+ * Returns 0 if it may, EBUSY if it is held by anyone other than the
+ * destroy operation itself, EEXIST if it still has child directories, or
+ * the error from zap_count().
+ */
+/* ARGSUSED */
+int
+dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	uint64_t nchildren;
+	int error;
+
+	/*
+	 * There should be exactly two holds, both from
+	 * dsl_dataset_destroy: one on the dd directory, and one on its
+	 * head ds.  Otherwise, someone is trying to lookup something
+	 * inside this dir while we want to destroy it.  The
+	 * config_rwlock ensures that nobody else opens it after we
+	 * check.
+	 */
+	if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+		return (EBUSY);
+
+	error = zap_count(dd->dd_pool->dp_meta_objset,
+	    dd->dd_phys->dd_child_dir_zapobj, &nchildren);
+	if (error != 0)
+		return (error);
+
+	return (nchildren != 0 ? EEXIST : 0);
+}
+
+/*
+ * Sync-task callback that destroys the directory arg1.
+ *
+ * The reservation is set to 0, the directory's child-dir, props and
+ * delegation ZAP objects are destroyed, its entry is removed from the
+ * parent's child-dir ZAP, the caller's hold (tag) is released, and the
+ * object itself is freed.
+ *
+ * NOTE(review): dd->dd_parent is dereferenced unconditionally, so this
+ * presumably is never called on a pool's root directory -- confirm.
+ */
+void
+dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t val, obj;
+
+ ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
+ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+ /* Remove our reservation. */
+ val = 0;
+ dsl_dir_set_reservation_sync(dd, &val, cr, tx);
+ ASSERT3U(dd->dd_used_bytes, ==, 0);
+ ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+
+ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
+ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
+ VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+ VERIFY(0 == zap_remove(mos,
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+
+ /* save the object number; dd is not used again after the close */
+ obj = dd->dd_object;
+ dsl_dir_close(dd, tag);
+ VERIFY(0 == dmu_object_free(mos, obj, tx));
+}
+
+/*
+ * Allocate the root dsl_dir object in the MOS, register it in the pool
+ * directory under DMU_POOL_ROOT_DATASET, and initialize its creation
+ * time, property ZAP and child-directory ZAP.  The new object number is
+ * returned through ddobjp.
+ */
+void
+dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	dmu_buf_t *bonus;
+	dsl_dir_phys_t *ddphys;
+	int error;
+
+	*ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+
+	/* Make the new directory findable from the pool directory. */
+	error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+	    sizeof (uint64_t), 1, ddobjp, tx);
+	ASSERT3U(error, ==, 0);
+
+	VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &bonus));
+	dmu_buf_will_dirty(bonus, tx);
+	ddphys = bonus->db_data;
+
+	ddphys->dd_creation_time = gethrestime_sec();
+	ddphys->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	ddphys->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+
+	dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Add this directory's space statistics to nv: used, quota,
+ * reservation and compression ratio (reported as 100 when no
+ * compressed bytes are recorded).  If the directory has an origin
+ * object, the origin dataset's name is added as well.
+ */
+void
+dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
+{
+	uint64_t compressed, ratio;
+
+	/* The space counters are sampled together under dd_lock. */
+	mutex_enter(&dd->dd_lock);
+	compressed = dd->dd_phys->dd_compressed_bytes;
+	if (compressed == 0)
+		ratio = 100;
+	else
+		ratio = dd->dd_phys->dd_uncompressed_bytes * 100 / compressed;
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
+	    dd->dd_phys->dd_reserved);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
+	mutex_exit(&dd->dd_lock);
+
+	/* Resolving the origin's name is done under the config lock. */
+	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+	if (dd->dd_phys->dd_origin_obj != 0) {
+		char origin_name[MAXNAMELEN];
+		dsl_dataset_t *origin;
+
+		VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+		    dd->dd_phys->dd_origin_obj,
+		    NULL, DS_MODE_NONE, FTAG, &origin));
+		dsl_dataset_name(origin, origin_name);
+		dsl_dataset_close(origin, DS_MODE_NONE, FTAG);
+		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, origin_name);
+	}
+	rw_exit(&dd->dd_pool->dp_config_rwlock);
+}
+
+/*
+ * Put dd on the pool's dirty-directory list for tx's txg.  When
+ * txg_list_add() returns 0, an extra hold (tagged with dd itself) is
+ * taken on the bonus buffer; dsl_dir_sync() releases it.
+ */
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	ASSERT(dd->dd_phys);
+
+	if (txg_list_add(&dd->dd_pool->dp_dirty_dirs, dd, tx->tx_txg) != 0)
+		return;
+
+	/* keep the buffer held until this directory has been written out */
+	dmu_buf_add_ref(dd->dd_dbuf, dd);
+}
+
+/*
+ * Compute how much the parent's accounting changes when dd's usage goes
+ * from "used" to "used + delta".  A directory is charged to its parent
+ * as MAX(usage, dd_reserved), so the result is the difference between
+ * that quantity after and before the change.
+ */
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+	uint64_t reserved = dd->dd_phys->dd_reserved;
+	uint64_t before = MAX(used, reserved);
+	uint64_t after = MAX(used + delta, reserved);
+
+	return (after - before);
+}
+
+/*
+ * Syncing-context write-out of a dirty dsl_dir: clear this txg's
+ * space_towrite estimate, copy the in-core used byte count into the
+ * on-disk structure, and drop the hold taken by dsl_dir_dirty().
+ */
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	int txgidx = tx->tx_txg & TXG_MASK;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	/* No temporary reservation may be outstanding for this txg. */
+	ASSERT3U(dd->dd_tempreserved[txgidx], ==, 0);
+	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+	    dd->dd_space_towrite[txgidx] / 1024);
+	dd->dd_space_towrite[txgidx] = 0;
+	dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
+	mutex_exit(&dd->dd_lock);
+
+	/* this pairs with the dmu_buf_add_ref() in dsl_dir_dirty() */
+	dmu_buf_rele(dd->dd_dbuf, dd);
+}
+
+/*
+ * Return the sum of dd's per-txg space_towrite estimates.  The caller
+ * must hold dd_lock.
+ */
+static uint64_t
+dsl_dir_space_towrite(dsl_dir_t *dd)
+{
+	uint64_t total = 0;
+	int t;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		total += dd->dd_space_towrite[t & TXG_MASK];
+		ASSERT3U(dd->dd_space_towrite[t & TXG_MASK], >=, 0);
+	}
+
+	return (total);
+}
+
+/*
+ * Report how much space dd would have available if "delta" were applied
+ * to "ancestor".  When ondiskonly is set, only on-disk usage is
+ * considered and pending (space_towrite) estimates are ignored.
+ *
+ * The answer is computed recursively: the parent's availability is
+ * obtained first, then limited by this directory's own quota (and, at
+ * the root, by the adjusted pool size) and increased by any unused part
+ * of this directory's reservation.
+ */
+uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+	/* With no restriction in force, space is treated as unlimited. */
+	uint64_t from_parent = UINT64_MAX;
+	uint64_t limit = UINT64_MAX;
+	uint64_t inuse, avail;
+
+	if (dd->dd_parent != NULL) {
+		from_parent = dsl_dir_space_available(dd->dd_parent,
+		    ancestor, delta, ondiskonly);
+	}
+
+	mutex_enter(&dd->dd_lock);
+
+	if (dd->dd_phys->dd_quota != 0)
+		limit = dd->dd_phys->dd_quota;
+
+	inuse = dd->dd_used_bytes;
+	if (!ondiskonly)
+		inuse += dsl_dir_space_towrite(dd);
+	if (dd == ancestor)
+		inuse += delta;
+
+	/* The root directory is additionally bounded by the pool itself. */
+	if (dd->dd_parent == NULL) {
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
+		limit = MIN(limit, poolsize);
+	}
+
+	/*
+	 * Unused reservation is space we own on top of whatever the
+	 * parent can offer.
+	 */
+	if (from_parent != UINT64_MAX && dd->dd_phys->dd_reserved > inuse)
+		from_parent += dd->dd_phys->dd_reserved - inuse;
+
+	if (inuse <= limit) {
+		/*
+		 * Whichever is smaller: what the parent provides or what
+		 * remains of our quota.
+		 */
+		avail = MIN(from_parent, limit - inuse);
+	} else {
+		/* over quota */
+		avail = 0;
+
+		/*
+		 * Being slightly over quota is tolerated, but believing
+		 * we use more than the whole pool (which is already
+		 * 1.6% more than dsl_pool_adjustedsize()) means
+		 * something is very wrong.
+		 */
+		ASSERT3U(inuse, <=, spa_get_space(dd->dd_pool->dp_spa));
+	}
+
+	mutex_exit(&dd->dd_lock);
+
+	return (avail);
+}
+
+/*
+ * One entry on the list handed back as the cookie from
+ * dsl_dir_tempreserve_space().  dsl_dir_tempreserve_clear() decides
+ * what to undo from which pointer is set: tr_dp (pool reservation),
+ * else tr_ds (dsl_dir reservation), else neither (ARC reservation).
+ */
+struct tempreserve {
+ list_node_t tr_node; /* linkage on the cookie list */
+ dsl_pool_t *tr_dp; /* set for a pool-level reservation */
+ dsl_dir_t *tr_ds; /* set for a dsl_dir-level reservation */
+ uint64_t tr_size; /* number of bytes reserved */
+};
+
+/*
+ * Charge asize bytes of temporary reservation against dd for tx's txg
+ * and, when parent_delta() reports a nonzero change and dd has a parent,
+ * recurse to charge that change against the parent.
+ *
+ * "first" is TRUE only for the directory the reservation was requested
+ * on; in that case, if the tx has an objset, dsl_dataset_check_quota()
+ * is consulted. The quota is not enforced (treated as UINT64_MAX) when
+ * ignorequota or netfree is set or the directory has no quota. At the
+ * root the quota is additionally capped by dsl_pool_adjustedsize().
+ *
+ * Every directory that accepts the reservation appends a tempreserve
+ * record to tr_list. Nothing is undone here on failure;
+ * dsl_dir_tempreserve_space() calls dsl_dir_tempreserve_clear() on the
+ * list when this function returns an error.
+ *
+ * Returns 0 on success, the error from dsl_dataset_check_quota(), or
+ * one of EDQUOT, ENOSPC or ERESTART when the estimated usage exceeds
+ * the quota.
+ */
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
+ boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
+ dmu_tx_t *tx, boolean_t first)
+{
+ uint64_t txg = tx->tx_txg;
+ uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+ struct tempreserve *tr;
+ int enospc = EDQUOT;
+ int txgidx = txg & TXG_MASK;
+ int i;
+ uint64_t ref_rsrv = 0;
+
+ ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >, 0);
+
+ mutex_enter(&dd->dd_lock);
+
+ /*
+ * Check against the dsl_dir's quota. We don't add in the delta
+ * when checking for over-quota because they get one free hit.
+ */
+ est_inflight = dsl_dir_space_towrite(dd);
+ for (i = 0; i < TXG_SIZE; i++)
+ est_inflight += dd->dd_tempreserved[i];
+ used_on_disk = dd->dd_used_bytes;
+
+ /*
+ * On the first iteration, fetch the dataset's used-on-disk and
+ * refreservation values. Also, if checkrefquota is set, test if
+ * allocating this space would exceed the dataset's refquota.
+ */
+ if (first && tx->tx_objset) {
+ int error;
+ dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+
+ error = dsl_dataset_check_quota(ds, checkrefquota,
+ asize, est_inflight, &used_on_disk, &ref_rsrv);
+ if (error) {
+ mutex_exit(&dd->dd_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * If this transaction will result in a net free of space,
+ * we want to let it through.
+ */
+ if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+ quota = UINT64_MAX;
+ else
+ quota = dd->dd_phys->dd_quota;
+
+ /*
+ * Adjust the quota against the actual pool size at the root.
+ * To ensure that it's possible to remove files from a full
+ * pool without inducing transient overcommits, we throttle
+ * netfree transactions against a quota that is slightly larger,
+ * but still within the pool's allocation slop. In cases where
+ * we're very close to full, this will allow a steady trickle of
+ * removes to get through.
+ */
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
+ if (poolsize < quota) {
+ quota = poolsize;
+ enospc = ENOSPC;
+ }
+ }
+
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
+ if (used_on_disk + est_inflight > quota) {
+ if (est_inflight > 0 || used_on_disk < quota)
+ enospc = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
+ "quota=%lluK tr=%lluK err=%d\n",
+ used_on_disk>>10, est_inflight>>10,
+ quota>>10, asize>>10, enospc);
+ mutex_exit(&dd->dd_lock);
+ return (enospc);
+ }
+
+ /* We need to up our estimated delta before dropping dd_lock */
+ dd->dd_tempreserved[txgidx] += asize;
+
+ parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
+ asize - ref_rsrv);
+ mutex_exit(&dd->dd_lock);
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = dd;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ /* see if it's OK with our parent */
+ if (dd->dd_parent && parent_rsrv) {
+ boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+
+ return (dsl_dir_tempreserve_impl(dd->dd_parent,
+ parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Temporarily reserve space in dd for use in tx's txg.
+ *
+ * Up to three layers are reserved in order: lsize bytes from the ARC,
+ * asize bytes from the pool, and asize bytes from the dsl_dir hierarchy
+ * (flagged as a net free when fsize >= asize, with the refquota check
+ * enabled when asize > usize).  Each successful step is recorded on a
+ * list that is returned through tr_cookiep on success; on failure
+ * everything recorded so far is released and the error is returned.
+ * A request with asize == 0 succeeds immediately with a NULL cookie.
+ *
+ * Once the space has been dirtied (and dsl_dir_willuse_space() has been
+ * called), cancel the reservation with dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
+{
+	list_t *reservations;
+	struct tempreserve *rec;
+	int error;
+
+	if (asize == 0) {
+		*tr_cookiep = NULL;
+		return (0);
+	}
+
+	reservations = kmem_alloc(sizeof (list_t), KM_SLEEP);
+	list_create(reservations, sizeof (struct tempreserve),
+	    offsetof(struct tempreserve, tr_node));
+	ASSERT3S(asize, >, 0);
+	ASSERT3S(fsize, >=, 0);
+
+	error = arc_tempreserve_space(lsize, tx->tx_txg);
+	if (error != 0) {
+		/* EAGAIN from the ARC becomes a delayed ERESTART. */
+		if (error == EAGAIN) {
+			txg_delay(dd->dd_pool, tx->tx_txg, 1);
+			error = ERESTART;
+		}
+		dsl_pool_memory_pressure(dd->dd_pool);
+	} else {
+		/* Neither tr_dp nor tr_ds set: this is the ARC record. */
+		rec = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+		rec->tr_size = lsize;
+		list_insert_tail(reservations, rec);
+
+		error = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
+	}
+
+	if (error == 0) {
+		rec = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+		rec->tr_dp = dd->dd_pool;
+		rec->tr_size = asize;
+		list_insert_tail(reservations, rec);
+
+		error = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+		    FALSE, asize > usize, reservations, tx, TRUE);
+	}
+
+	if (error != 0)
+		dsl_dir_tempreserve_clear(reservations, tx);
+	else
+		*tr_cookiep = reservations;
+
+	return (error);
+}
+
+/*
+ * Release a temporary reservation obtained from
+ * dsl_dir_tempreserve_space().  Each record on the cookie list is
+ * undone according to its kind (pool, dsl_dir or ARC), removed and
+ * freed, and finally the list itself is freed.  A NULL cookie is a
+ * no-op.
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+	list_t *reservations = tr_cookie;
+	struct tempreserve *rec;
+	int txgidx = tx->tx_txg & TXG_MASK;
+
+	ASSERT3U(tx->tx_txg, !=, 0);
+
+	if (tr_cookie == NULL)
+		return;
+
+	while ((rec = list_head(reservations)) != NULL) {
+		if (rec->tr_dp) {
+			dsl_pool_tempreserve_clear(rec->tr_dp,
+			    rec->tr_size, tx);
+		} else if (rec->tr_ds) {
+			dsl_dir_t *dd = rec->tr_ds;
+
+			mutex_enter(&dd->dd_lock);
+			ASSERT3U(dd->dd_tempreserved[txgidx], >=,
+			    rec->tr_size);
+			dd->dd_tempreserved[txgidx] -= rec->tr_size;
+			mutex_exit(&dd->dd_lock);
+		} else {
+			arc_tempreserve_clear(rec->tr_size);
+		}
+		list_remove(reservations, rec);
+		kmem_free(rec, sizeof (struct tempreserve));
+	}
+
+	kmem_free(reservations, sizeof (list_t));
+}
+
+/*
+ * Record in dd that "space" bytes are expected to be written (only
+ * positive amounts are added to this txg's space_towrite), mark dd
+ * dirty, and propagate the resulting change in accounted space, as
+ * computed by parent_delta(), up to the parent.
+ */
+static void
+dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+	int txgidx = tx->tx_txg & TXG_MASK;
+	uint64_t est_used;
+	int64_t to_parent;
+
+	mutex_enter(&dd->dd_lock);
+	if (space > 0)
+		dd->dd_space_towrite[txgidx] += space;
+
+	est_used = dsl_dir_space_towrite(dd) + dd->dd_used_bytes;
+	to_parent = parent_delta(dd, est_used, space);
+	mutex_exit(&dd->dd_lock);
+
+	/* Dirtying ensures dsl_dir_sync() later resets dd_space_to* */
+	dsl_dir_dirty(dd, tx);
+
+	/* XXX this is potentially expensive and unnecessary... */
+	if (dd->dd_parent && to_parent)
+		dsl_dir_willuse_space_impl(dd->dd_parent, to_parent, tx);
+}
+
+/*
+ * Open-context notification that we expect to write or free "space"
+ * bytes, e.g. when dirtying data.  The estimate must be conservative:
+ * it is fine to end up writing less or freeing more than announced,
+ * but not to write more or free less.
+ *
+ * The amount is recorded first at pool level and then along the
+ * dsl_dir hierarchy starting at dd.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+
+	dsl_pool_willuse_space(dp, space, tx);
+	dsl_dir_willuse_space_impl(dd, space, tx);
+}
+
+/*
+ * Syncing-context accounting: apply the given deltas to dd's used,
+ * compressed and uncompressed byte counts, then charge the parent.
+ * The parent receives the change in accounted space from
+ * parent_delta() as its "used" delta, and the compressed and
+ * uncompressed deltas unchanged.
+ */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd,
+    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+	dsl_dir_phys_t *ddphys;
+	int64_t to_parent;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dsl_dir_dirty(dd, tx);
+
+	mutex_enter(&dd->dd_lock);
+	ddphys = dd->dd_phys;
+	to_parent = parent_delta(dd, dd->dd_used_bytes, used);
+
+	/* A negative delta must not take any counter below zero. */
+	ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+	ASSERT(compressed >= 0 ||
+	    ddphys->dd_compressed_bytes >= -compressed);
+	ASSERT(uncompressed >= 0 ||
+	    ddphys->dd_uncompressed_bytes >= -uncompressed);
+
+	dd->dd_used_bytes += used;
+	ddphys->dd_uncompressed_bytes += uncompressed;
+	ddphys->dd_compressed_bytes += compressed;
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent == NULL)
+		return;
+
+	dsl_dir_diduse_space(dd->dd_parent,
+	    to_parent, compressed, uncompressed, tx);
+}
+
+/*
+ * Sync-task check callback for setting a quota (arg2 points at the new
+ * value) on the dsl_dir in arg1.  A quota of 0 is always accepted.
+ * Otherwise ENOSPC is returned when the new quota is below the
+ * reservation or below used + pending space, except that in open
+ * context the check is skipped while changes are pending.
+ */
+static int
+dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	uint64_t new_quota = *(uint64_t *)arg2;
+	uint64_t pending;
+	int error = 0;
+
+	if (new_quota == 0)
+		return (0);
+
+	mutex_enter(&dd->dd_lock);
+	pending = dsl_dir_space_towrite(dd);
+
+	/*
+	 * The preliminary (open context) check is not failed while
+	 * changes are pending, because those estimates could
+	 * under-state the amount of space about to be freed.
+	 */
+	if (dmu_tx_is_syncing(tx) || pending == 0) {
+		if (new_quota < dd->dd_phys->dd_reserved ||
+		    new_quota < dd->dd_used_bytes + pending)
+			error = ENOSPC;
+	}
+	mutex_exit(&dd->dd_lock);
+
+	return (error);
+}
+
+/*
+ * Sync-task callback that stores the new quota (pointed to by arg2) in
+ * the on-disk dsl_dir and records the change in the pool history.
+ */
+/* ARGSUSED */
+static void
+dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	uint64_t new_quota = *(uint64_t *)arg2;
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	dd->dd_phys->dd_quota = new_quota;
+	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu ",
+	    (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Set the quota of the dsl_dir named ddname.  Nothing is done if the
+ * quota already has the requested value.  Returns 0 or the error from
+ * dsl_dir_open() / dsl_sync_task_do().
+ */
+int
+dsl_dir_set_quota(const char *ddname, uint64_t quota)
+{
+	dsl_dir_t *dd;
+	int error;
+
+	error = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	if (error != 0)
+		return (error);
+
+	if (dd->dd_phys->dd_quota != quota) {
+		/*
+		 * Someone may have just removed a file and now wants
+		 * to lower the quota; wait so that the freeing has
+		 * taken effect before the check runs.
+		 */
+		txg_wait_open(dd->dd_pool, 0);
+
+		error = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+		    dsl_dir_set_quota_sync, dd, &quota, 0);
+	}
+
+	dsl_dir_close(dd, FTAG);
+	return (error);
+}
+
+/*
+ * Sync-task check callback for setting a reservation (arg2 points at
+ * the new value) on the dsl_dir in arg1.
+ *
+ * Returns EOVERFLOW if the value exceeds INT64_MAX.  In open context
+ * nothing else is checked.  In syncing context ENOSPC is returned when
+ * the reservation grows the accounted space by more than is available,
+ * or when it grows and would exceed a nonzero quota.
+ */
+int
+dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	uint64_t new_reservation = *(uint64_t *)arg2;
+	uint64_t used, avail;
+	int64_t delta;
+
+	if (new_reservation > INT64_MAX)
+		return (EOVERFLOW);
+
+	/*
+	 * The space estimates may be inaccurate during the preliminary
+	 * check in open context, so defer the real check to sync.
+	 */
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	mutex_enter(&dd->dd_lock);
+	used = dd->dd_used_bytes;
+	delta = MAX(used, new_reservation) -
+	    MAX(used, dd->dd_phys->dd_reserved);
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent == NULL) {
+		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+	} else {
+		avail = dsl_dir_space_available(dd->dd_parent,
+		    NULL, 0, FALSE);
+	}
+
+	/* Only a growing charge can run out of space or exceed quota. */
+	if (delta > 0) {
+		if (delta > avail)
+			return (ENOSPC);
+		if (dd->dd_phys->dd_quota > 0 &&
+		    new_reservation > dd->dd_phys->dd_quota)
+			return (ENOSPC);
+	}
+
+	return (0);
+}
+
+/*
+ * Sync-task callback that stores the new reservation (pointed to by
+ * arg2), charges the resulting change in accounted space to the parent
+ * directory, and records the change in the pool history.
+ */
+/* ARGSUSED */
+static void
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	uint64_t new_reservation = *(uint64_t *)arg2;
+	uint64_t used;
+	int64_t delta;
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	used = dd->dd_used_bytes;
+	/* accounted space is MAX(used, reservation), before and after */
+	delta = MAX(used, new_reservation) -
+	    MAX(used, dd->dd_phys->dd_reserved);
+	dd->dd_phys->dd_reserved = new_reservation;
+	mutex_exit(&dd->dd_lock);
+
+	/* Roll up this additional usage into our ancestors */
+	if (dd->dd_parent != NULL)
+		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+
+	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu",
+	    (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Set the reservation of the dsl_dir named ddname via a sync task.
+ * Returns 0 or the error from dsl_dir_open() / dsl_sync_task_do().
+ */
+int
+dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+{
+	dsl_dir_t *dd;
+	int error;
+
+	error = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	if (error != 0)
+		return (error);
+
+	error = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
+	    dsl_dir_set_reservation_sync, dd, &reservation, 0);
+
+	dsl_dir_close(dd, FTAG);
+	return (error);
+}
+
+/*
+ * Return the nearest directory that lies on both ds1's and ds2's
+ * parent chains (either argument itself counts), or NULL if the two
+ * chains share no directory.
+ */
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+	dsl_dir_t *left, *right;
+
+	/* Walk up from ds1; for each step scan ds2's whole chain. */
+	left = ds1;
+	while (left != NULL) {
+		right = ds2;
+		while (right != NULL) {
+			if (right == left)
+				return (right);
+			right = right->dd_parent;
+		}
+		left = left->dd_parent;
+	}
+
+	return (NULL);
+}
+
+/*
+ * Compute how much of a delta applied to dd would reach "ancestor".
+ * At every level between dd and ancestor the delta is filtered through
+ * parent_delta() under that directory's dd_lock.  Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+	while (dd != ancestor) {
+		mutex_enter(&dd->dd_lock);
+		delta = parent_delta(dd, dd->dd_used_bytes, delta);
+		mutex_exit(&dd->dd_lock);
+
+		dd = dd->dd_parent;
+	}
+
+	return (delta);
+}
+
+/*
+ * Argument bundle passed as arg2 to dsl_dir_rename_check() and
+ * dsl_dir_rename_sync(); filled in by dsl_dir_rename().
+ */
+struct renamearg {
+ dsl_dir_t *newparent; /* directory that will contain the renamed dir */
+ const char *mynewname; /* new name within newparent */
+};
+
+/*
+ * Sync-task check callback for renaming the dsl_dir in arg1 as
+ * described by the struct renamearg in arg2.
+ *
+ * Returns EBUSY if the directory's bonus buffer has more than two
+ * holds, EEXIST if the new name is already present in the new parent's
+ * child map, and any other zap_lookup() failure unchanged.  When the
+ * parent changes it also returns EINVAL if the new parent is the
+ * directory itself or one of its descendants, or the error from
+ * dsl_dir_transfer_possible().  Returns 0 otherwise.
+ */
+/*ARGSUSED*/
+static int
+dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	struct renamearg *ra = arg2;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	uint64_t existing;
+	uint64_t myspace;
+	int error;
+
+	/* There should be 2 references: the open and the dirty */
+	if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+		return (EBUSY);
+
+	/* The target name must not be taken: only ENOENT lets us go on. */
+	error = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
+	    ra->mynewname, 8, 1, &existing);
+	if (error == 0)
+		return (EEXIST);
+	if (error != ENOENT)
+		return (error);
+
+	/* A rename within the same parent moves no space around. */
+	if (ra->newparent == dd->dd_parent)
+		return (0);
+
+	/* is there enough space? */
+	myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+
+	/* no rename into our descendant */
+	if (closest_common_ancestor(dd, ra->newparent) == dd)
+		return (EINVAL);
+
+	error = dsl_dir_transfer_possible(dd->dd_parent,
+	    ra->newparent, myspace);
+	if (error != 0)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Sync-task callback that performs the rename described by the struct
+ * renamearg in arg2 on the dsl_dir in arg1.  When the parent changes,
+ * the directory's accounted space and its compressed/uncompressed byte
+ * counts are moved from the old parent to the new one.  The entry is
+ * then removed from the old parent's child map, the in-core name and
+ * parent are updated, the entry is added to the new parent's child
+ * map, and the rename is logged in the pool history.
+ */
+static void
+dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	struct renamearg *ra = arg2;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	int error;
+
+	ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
+
+	if (ra->newparent != dd->dd_parent) {
+		/* space charged to a parent is MAX(used, reserved) */
+		uint64_t accounted =
+		    MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+
+		dsl_dir_diduse_space(dd->dd_parent, -accounted,
+		    -dd->dd_phys->dd_compressed_bytes,
+		    -dd->dd_phys->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(ra->newparent, accounted,
+		    dd->dd_phys->dd_compressed_bytes,
+		    dd->dd_phys->dd_uncompressed_bytes, tx);
+	}
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	/* unlink from the old parent's child map */
+	error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, tx);
+	ASSERT3U(error, ==, 0);
+
+	/* switch the in-core name and swap the hold on the parent */
+	(void) strcpy(dd->dd_myname, ra->mynewname);
+	dsl_dir_close(dd->dd_parent, dd);
+	dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
+	VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
+	    ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
+
+	/* link into the new parent's child map */
+	error = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
+	ASSERT3U(error, ==, 0);
+
+	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+	    tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Rename dd to newname.  The new parent directory must already exist
+ * and be in the same pool (ENXIO otherwise); EEXIST is returned when
+ * dsl_dir_open() leaves no remaining name component, i.e. newname
+ * itself already exists.  The actual work is done by a sync task.
+ */
+int
+dsl_dir_rename(dsl_dir_t *dd, const char *newname)
+{
+	struct renamearg ra;
+	int error;
+
+	/* new parent should exist */
+	error = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
+	if (error != 0)
+		return (error);
+
+	if (dd->dd_pool != ra.newparent->dd_pool) {
+		/* can't rename to different pool */
+		error = ENXIO;
+	} else if (ra.mynewname == NULL) {
+		/* new name should not already exist */
+		error = EEXIST;
+	} else {
+		error = dsl_sync_task_do(dd->dd_pool,
+		    dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
+	}
+
+	dsl_dir_close(ra.newparent, FTAG);
+	return (error);
+}
+
+/*
+ * Decide whether "space" bytes can be moved from under sdd to under
+ * tdd.  The space that would be released at the closest common
+ * ancestor is taken into account when asking how much tdd has
+ * available.  Returns ENOSPC if that is less than "space", else 0.
+ */
+int
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+{
+	dsl_dir_t *common;
+	int64_t freed_at_common;
+	uint64_t room;
+
+	common = closest_common_ancestor(sdd, tdd);
+	freed_at_common = would_change(sdd, -space, common);
+	room = dsl_dir_space_available(tdd, common, freed_at_common, FALSE);
+
+	return (room < space ? ENOSPC : 0);
+}
diff --git a/zfs/lib/libzpool/dsl_pool.c b/zfs/lib/libzpool/dsl_pool.c
new file mode 100644
index 000000000..de8736800
--- /dev/null
+++ b/zfs/lib/libzpool/dsl_pool.c
@@ -0,0 +1,339 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dsl_pool.c 1.12 08/03/20 SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Write-throttle tunables read by dsl_pool_tempreserve_space():
+ * a nonzero zfs_no_write_throttle skips the per-txg limit check, and a
+ * nonzero zfs_write_limit_override is used in place of dp_write_limit.
+ */
+int zfs_no_write_throttle = 0;
+uint64_t zfs_write_limit_override = 0;
+
+/*
+ * Look up MOS_DIR_NAME in the root directory's child map and open the
+ * dsl_dir it refers to, tagged with dp.  Returns the zap_lookup() or
+ * dsl_dir_open_obj() result.
+ */
+static int
+dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
+{
+	uint64_t mos_dir_obj;
+	int error;
+
+	error = zap_lookup(dp->dp_meta_objset,
+	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+	    MOS_DIR_NAME, sizeof (mos_dir_obj), 1, &mos_dir_obj);
+	if (error != 0)
+		return (error);
+
+	return (dsl_dir_open_obj(dp, mos_dir_obj, MOS_DIR_NAME, dp, ddp));
+}
+
+/*
+ * Allocate a zeroed dsl_pool_t for spa and set up the state shared by
+ * dsl_pool_open() and dsl_pool_create(): the copy of the root block
+ * pointer, the config rwlock, the initial write limit, the txg state,
+ * the dirty/sync-task/synced lists and dp_lock.
+ */
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+	extern uint64_t zfs_write_limit_min;
+	blkptr_t *rootbp = spa_get_rootblkptr(spa);
+	dsl_pool_t *dp;
+
+	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+	dp->dp_spa = spa;
+	dp->dp_meta_rootbp = *rootbp;
+	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
+	dp->dp_write_limit = zfs_write_limit_min;
+	txg_init(dp, txg);
+
+	/* per-txg lists of dirty datasets, dirty dirs and sync tasks */
+	txg_list_create(&dp->dp_dirty_datasets,
+	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_dirs,
+	    offsetof(dsl_dir_t, dd_dirty_link));
+	txg_list_create(&dp->dp_sync_tasks,
+	    offsetof(dsl_sync_task_group_t, dstg_node));
+	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
+
+	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	return (dp);
+}
+
+/*
+ * Open the DSL pool state of an existing pool: open the meta objset
+ * from the root block pointer, find and open the root dsl_dir, then
+ * open the MOS directory.  On success the pool is returned through
+ * dpp; on any failure the partially built pool is closed and the error
+ * is returned.
+ */
+int
+dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+{
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	objset_impl_t *osi;
+	int error;
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+
+	/* Each step runs only if every earlier one succeeded. */
+	error = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+	if (error == 0) {
+		dp->dp_meta_objset = &osi->os;
+
+		error = zap_lookup(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+		    sizeof (uint64_t), 1, &dp->dp_root_dir_obj);
+	}
+	if (error == 0) {
+		error = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+		    NULL, dp, &dp->dp_root_dir);
+	}
+	if (error == 0)
+		error = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+
+	rw_exit(&dp->dp_config_rwlock);
+
+	if (error != 0)
+		dsl_pool_close(dp);
+	else
+		*dpp = dp;
+
+	return (error);
+}
+
+/*
+ * Tear down a dsl_pool_t.  Safe to call on a partially opened pool
+ * (as dsl_pool_open() does on failure): the MOS directory, the root
+ * directory and the meta objset are only released if they were set up.
+ * All lists and locks created by dsl_pool_open_impl() are destroyed and
+ * the structure is freed.
+ */
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+	/* drop our reference from dsl_pool_open() */
+	if (dp->dp_mos_dir)
+		dsl_dir_close(dp->dp_mos_dir, dp);
+	if (dp->dp_root_dir)
+		dsl_dir_close(dp->dp_root_dir, dp);
+
+	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+	if (dp->dp_meta_objset)
+		dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+
+	/*
+	 * Destroy every list that dsl_pool_open_impl() created.  This
+	 * includes dp_sync_tasks, which was previously created there
+	 * but never destroyed.
+	 */
+	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_sync_tasks);
+	txg_list_destroy(&dp->dp_dirty_dirs);
+	list_destroy(&dp->dp_synced_datasets);
+
+	arc_flush(dp->dp_spa);
+	txg_fini(dp);
+	rw_destroy(&dp->dp_config_rwlock);
+	mutex_destroy(&dp->dp_lock);
+	kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+/*
+ * Create the DSL state for a brand-new pool in transaction group txg:
+ * create the meta objset and the pool directory object, create and
+ * open the root directory, then create and open the MOS directory.
+ * Returns the new dsl_pool_t.
+ */
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, uint64_t txg)
+{
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+	int error;
+
+	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
+	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+
+	/* the pool directory lives at a fixed, claimed object number */
+	error = zap_create_claim(dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT3U(error, ==, 0);
+
+	/* root dir: create it, then take our long-lived hold on it */
+	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp, &dp->dp_root_dir));
+
+	/* meta-objset dir: created as a child of the root dir */
+	(void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
+	VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
+
+	dmu_tx_commit(tx);
+
+	return (dp);
+}
+
+/*
+ * Sync everything the DSL layer has dirty for transaction group txg.
+ * In order: sync the dirty datasets (waiting for their I/O), run the
+ * queued sync tasks, sync the dirty directories, and finally, if the
+ * meta objset has dirty or freed dnodes for this txg, sync the meta
+ * objset itself and publish its new root block pointer to the SPA.
+ */
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ zio_t *zio;
+ dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_sync_task_group_t *dstg;
+ objset_impl_t *mosi = dp->dp_meta_objset->os;
+ int err;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ /*
+ * A dataset not yet on dp_synced_datasets is queued there
+ * (dsl_pool_zil_clean() later releases its ds_dbuf hold); if
+ * it is already queued, the ds_dbuf hold is released here.
+ */
+ if (!list_link_active(&ds->ds_synced_link))
+ list_insert_tail(&dp->dp_synced_datasets, ds);
+ else
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+
+ while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+ dsl_sync_task_group_sync(dstg, tx);
+ while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+ dsl_dir_sync(dd, tx);
+
+ /* Only write the MOS if something in it changed during this txg. */
+ if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+ list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(mosi, zio, tx);
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Drain dp_synced_datasets: for every dataset on the list, clean the
+ * ZIL of the objset stored in its user pointer and release the
+ * ds_dbuf hold (tagged with the dataset).
+ */
+void
+dsl_pool_zil_clean(dsl_pool_t *dp)
+{
+	dsl_dataset_t *ds;
+
+	while ((ds = list_head(&dp->dp_synced_datasets)) != NULL) {
+		objset_impl_t *osi;
+
+		list_remove(&dp->dp_synced_datasets, ds);
+		ASSERT(ds->ds_user_ptr != NULL);
+		osi = (objset_impl_t *)ds->ds_user_ptr;
+		zil_clean(osi->os_zil);
+		dmu_buf_rele(ds->ds_dbuf, ds);
+	}
+}
+
+/*
+ * Returns nonzero when the calling thread is the pool's tx_sync_thread,
+ * or when the SPA has no dsl_pool attached yet, i.e. we are being
+ * called from SPA context during pool initialization.
+ */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+	if (curthread == dp->dp_tx.tx_sync_thread)
+		return (1);
+
+	return (spa_get_dsl(dp->dp_spa) == NULL);
+}
+
+/*
+ * Return the pool's dspace minus a slop reservation held back for
+ * allocation efficiency.
+ */
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+{
+	uint64_t dspace = spa_get_dspace(dp->dp_spa);
+	uint64_t slop;
+
+	/*
+	 * Hold back about 1.6% (1/64) of the pool, but never less than
+	 * half of SPA_MINDEVSIZE (32MB).
+	 * XXX The intent log is not accounted for, so it must fit
+	 * within this slop.
+	 */
+	slop = MAX(dspace >> 6, SPA_MINDEVSIZE >> 1);
+
+	/*
+	 * When judging whether a free may proceed, only hold back half
+	 * as much so that forward progress remains possible (e.g. so
+	 * that rm(1) works on a full pool).
+	 */
+	if (netfree)
+		slop >>= 1;
+
+	return (dspace - slop);
+}
+
+/*
+ * Reserve "space" bytes at pool level for tx's txg, applying the write
+ * throttle.
+ *
+ * The effective limit is zfs_write_limit_override when that is nonzero,
+ * otherwise dp_write_limit.  With zfs_no_write_throttle set the space
+ * is reserved unconditionally.  Otherwise ERESTART is returned (and
+ * nothing is reserved) when the txg's current estimate exceeds the
+ * limit; on success the caller is additionally delayed by one tick if
+ * the estimate is above 7/8ths of the limit.
+ *
+ * Returns 0 on success or ERESTART when the txg is over its limit.
+ */
+int
+dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+{
+	uint64_t reserved = 0;
+	uint64_t write_limit = (zfs_write_limit_override ?
+	    zfs_write_limit_override : dp->dp_write_limit);
+
+	if (zfs_no_write_throttle) {
+		/*
+		 * Use the same atomic update as the throttled path and
+		 * dsl_pool_tempreserve_clear(); dp_tempreserved is
+		 * modified without holding a lock.
+		 */
+		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
+		    space);
+		return (0);
+	}
+
+	/*
+	 * Check to see if we have exceeded the maximum allowed IO for
+	 * this transaction group.  We can do this without locks since
+	 * a little slop here is ok.  Note that we do the reserved check
+	 * with only half the requested reserve: this is because the
+	 * reserve requests are worst-case, and we really don't want to
+	 * throttle based off of worst-case estimates.
+	 */
+	if (write_limit > 0) {
+		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
+		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
+
+		if (reserved && reserved > write_limit)
+			return (ERESTART);
+	}
+
+	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
+
+	/*
+	 * If this transaction group is over 7/8ths capacity, delay
+	 * the caller 1 clock tick.  This will slow down the "fill"
+	 * rate until the sync process can catch up with us.
+	 *
+	 * The threshold is written with explicit parentheses: the
+	 * unparenthesized "write_limit - write_limit << 3" binds as
+	 * "(write_limit - write_limit) << 3", i.e. 0, which delayed
+	 * every caller whenever anything at all was reserved.
+	 */
+	if (reserved && reserved > (write_limit - (write_limit >> 3)))
+		txg_delay(dp, tx->tx_txg, 1);
+
+	return (0);
+}
+
+/*
+ * Give back "space" bytes of pool-level temporary reservation for tx's
+ * txg; the counterpart of dsl_pool_tempreserve_space().
+ */
+void
+dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+	int txgidx = tx->tx_txg & TXG_MASK;
+
+	/* never release more than is currently reserved */
+	ASSERT(dp->dp_tempreserved[txgidx] >= space);
+	atomic_add_64(&dp->dp_tempreserved[txgidx], -space);
+}
+
+/*
+ * Lower the pool's write limit in response to memory pressure.  The new
+ * limit is a quarter of the space currently in use across all txgs
+ * (to-write plus temporarily reserved); it is never raised above the
+ * current limit and never dropped below zfs_write_limit_min.  Nothing
+ * is done if the limit is already at the minimum.
+ */
+void
+dsl_pool_memory_pressure(dsl_pool_t *dp)
+{
+	extern uint64_t zfs_write_limit_min;
+	uint64_t inuse = 0;
+	uint64_t target;
+	int t;
+
+	if (dp->dp_write_limit == zfs_write_limit_min)
+		return;
+
+	for (t = 0; t < TXG_SIZE; t++)
+		inuse += dp->dp_space_towrite[t] + dp->dp_tempreserved[t];
+
+	target = MIN(dp->dp_write_limit, inuse / 4);
+	dp->dp_write_limit = MAX(zfs_write_limit_min, target);
+}
+
+/*
+ * Add a positive "space" estimate to the pool's space_towrite counter
+ * for tx's txg, under dp_lock.  Zero and negative amounts are ignored.
+ */
+void
+dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+	if (space <= 0)
+		return;
+
+	mutex_enter(&dp->dp_lock);
+	dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
+	mutex_exit(&dp->dp_lock);
+}
diff --git a/zfs/lib/libzpool/dsl_prop.c b/zfs/lib/libzpool/dsl_prop.c
new file mode 100644
index 000000000..364a713fe
--- /dev/null
+++ b/zfs/lib/libzpool/dsl_prop.c
@@ -0,0 +1,551 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dsl_prop.c 1.16 08/02/20 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+/*
+ * Store the built-in default value of 'propname' into 'buf'.
+ *
+ * Returns ENOENT if the name is not a known property or the property is
+ * read-only (and not set-once), EOVERFLOW if intsz/numint do not fit the
+ * property's type, and 0 on success.
+ */
+static int
+dodefault(const char *propname, int intsz, int numint, void *buf)
+{
+	zfs_prop_t prop = zfs_name_to_prop(propname);
+
+	if (prop == ZPROP_INVAL)
+		return (ENOENT);
+
+	/*
+	 * The setonce properties are read-only, BUT they still
+	 * have a default value that can be used as the initial
+	 * value.
+	 */
+	if (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))
+		return (ENOENT);
+
+	if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) {
+		/* Numeric default: needs room for one 8-byte integer. */
+		if (intsz != 8 || numint < 1)
+			return (EOVERFLOW);
+		*(uint64_t *)buf = zfs_prop_default_numeric(prop);
+		return (0);
+	}
+
+	/* String default: the buffer must be an array of bytes. */
+	if (intsz != 1)
+		return (EOVERFLOW);
+	/* strncpy leaves buf unterminated if the default fills it. */
+	(void) strncpy(buf, zfs_prop_default_string(prop), numint);
+	return (0);
+}
+
+/*
+ * Look up 'propname' in the props ZAP of 'dd', then of each ancestor via
+ * dd_parent, stopping at the first directory where zap_lookup() returns
+ * anything other than ENOENT.  For a known non-inheritable property only
+ * 'dd' itself is consulted.  If no directory supplies a value, fall back
+ * to the built-in default via dodefault().
+ *
+ * If 'setpoint' is non-NULL it is set to the empty string, and then to
+ * the name of the directory at which the lookup stopped (if any).
+ * Returns 0 or an errno from zap_lookup()/dodefault().
+ */
+static int
+dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
+ int intsz, int numint, void *buf, char *setpoint)
+{
+ int err = ENOENT;
+ zfs_prop_t prop;
+
+ if (setpoint)
+ setpoint[0] = '\0';
+
+ prop = zfs_name_to_prop(propname);
+
+ /*
+ * Note: dd may be NULL, therefore we shouldn't dereference it
+ * outside this loop.
+ */
+ for (; dd != NULL; dd = dd->dd_parent) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+ propname, intsz, numint, buf);
+ /* Found, or failed for a reason other than "not set here". */
+ if (err != ENOENT) {
+ if (setpoint)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
+ /*
+ * Break out of this loop for non-inheritable properties.
+ */
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ break;
+ }
+ if (err == ENOENT)
+ err = dodefault(propname, intsz, numint, buf);
+
+ return (err);
+}
+
+/*
+ * Register interest in the named property. We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ *
+ * The config rwlock is taken as reader here unless RW_WRITE_HELD()
+ * reports that it is already write-held.  On success a hold on the
+ * dataset's dsl_dir is taken with the new callback record as its tag;
+ * dsl_prop_unregister() releases it.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t value;
+ dsl_prop_cb_record_t *cbr;
+ int err;
+ int need_rwlock;
+
+ need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
+ if (need_rwlock)
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+
+ /* Fetch the current value as a single 8-byte integer. */
+ err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
+ if (err != 0) {
+ if (need_rwlock)
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ return (err);
+ }
+
+ /* Build the callback record with its own copy of the name. */
+ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+ cbr->cbr_ds = ds;
+ cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
+ (void) strcpy((char *)cbr->cbr_propname, propname);
+ cbr->cbr_func = callback;
+ cbr->cbr_arg = cbarg;
+ mutex_enter(&dd->dd_lock);
+ list_insert_head(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ /* Initial notification with the value read above. */
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ /* Take the hold tagged with cbr; 'dd' is overwritten with the result. */
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+ NULL, cbr, &dd));
+ if (need_rwlock)
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ /* Leave dataset open until this callback is unregistered */
+ return (0);
+}
+
+/*
+ * Get a property for 'dd', taking the pool config lock as reader around
+ * the lookup.  Arguments and return value are those of
+ * dsl_prop_get_impl().
+ */
+int
+dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+	int result;
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	result = dsl_prop_get_impl(dd, propname, intsz, numints, buf,
+	    setpoint);
+	rw_exit(&dp->dp_config_rwlock);
+
+	return (result);
+}
+
+/*
+ * Variant of dsl_prop_get_ds() for callers that already hold the pool
+ * config lock; the lock is asserted, not acquired.
+ */
+int
+dsl_prop_get_ds_locked(dsl_dir_t *dd, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint)
+{
+	int result;
+
+	ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+	result = dsl_prop_get_impl(dd, propname, intsz, numints, buf,
+	    setpoint);
+	return (result);
+}
+
+/*
+ * Get a property by dataset name.  The directory named by 'ddname' is
+ * opened for the duration of the lookup.  If the name has a trailing
+ * component that does not begin with '@', ENOENT is returned.
+ */
+int
+dsl_prop_get(const char *ddname, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint)
+{
+	dsl_dir_t *dir;
+	const char *rest;
+	int rc;
+
+	rc = dsl_dir_open(ddname, FTAG, &dir, &rest);
+	if (rc != 0)
+		return (rc);
+
+	if (rest != NULL && rest[0] != '@') {
+		rc = ENOENT;
+	} else {
+		rc = dsl_prop_get_ds(dir, propname, intsz, numints, buf,
+		    setpoint);
+	}
+
+	dsl_dir_close(dir, FTAG);
+	return (rc);
+}
+
+/*
+ * Fetch the current value of a property as one 64-bit integer.
+ *
+ * The value may already be stale when this returns, so it is NOT safe
+ * to follow up with dsl_prop_register() and assume nothing changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+    uint64_t *valuep, char *setpoint)
+{
+	int rc;
+
+	/* One integer of eight bytes. */
+	rc = dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint);
+	return (rc);
+}
+
+/*
+ * Unregister a callback added by dsl_prop_register().
+ *
+ * A record matches when its dataset, callback function, callback
+ * argument and property name all equal the arguments.  Return 0 on
+ * success, ENOMSG if no matching callback is registered.
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+    dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_cb_record_t *cbr;
+
+	mutex_enter(&dd->dd_lock);
+	for (cbr = list_head(&dd->dd_prop_cbs);
+	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+		if (cbr->cbr_ds == ds &&
+		    cbr->cbr_func == callback &&
+		    cbr->cbr_arg == cbarg &&
+		    strcmp(cbr->cbr_propname, propname) == 0)
+			break;
+	}
+
+	if (cbr == NULL) {
+		mutex_exit(&dd->dd_lock);
+		return (ENOMSG);
+	}
+
+	list_remove(&dd->dd_prop_cbs, cbr);
+	mutex_exit(&dd->dd_lock);
+
+	/*
+	 * Clean up from dsl_prop_register: drop the hold whose tag is
+	 * cbr while cbr is still a live allocation, and only then free
+	 * the record.  (Previously the freed pointer was used as the tag.)
+	 */
+	dsl_dir_close(dd, cbr);
+
+	kmem_free((void *)cbr->cbr_propname, strlen(cbr->cbr_propname) + 1);
+	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+	return (0);
+}
+
+/*
+ * Count the callback records on this dataset's dsl_dir that belong to
+ * the dataset 'ds'.  The list is walked under dd_lock.
+ */
+int
+dsl_prop_numcb(dsl_dataset_t *ds)
+{
+	dsl_dir_t *dir = ds->ds_dir;
+	dsl_prop_cb_record_t *rec;
+	int count = 0;
+
+	mutex_enter(&dir->dd_lock);
+	rec = list_head(&dir->dd_prop_cbs);
+	while (rec != NULL) {
+		if (rec->cbr_ds == ds)
+			count++;
+		rec = list_next(&dir->dd_prop_cbs, rec);
+	}
+	mutex_exit(&dir->dd_lock);
+
+	return (count);
+}
+
+/*
+ * Invoke every callback registered for 'propname' on directory object
+ * 'ddobj' with the new 'value', then recurse into each child directory.
+ *
+ * 'first' is TRUE only for the directory where the change was made.
+ * For descendants (first == FALSE) the recursion stops at any directory
+ * that has its own setting of the property.  The config lock must be
+ * held as writer.  A directory that fails to open is skipped silently.
+ */
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+ const char *propname, uint64_t value, int first)
+{
+ dsl_dir_t *dd;
+ dsl_prop_cb_record_t *cbr;
+ objset_t *mos = dp->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
+
+ if (!first) {
+ /*
+ * If the prop is set here, then this change is not
+ * being inherited here or below; stop the recursion.
+ */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+ 8, 1, &value);
+ if (err == 0) {
+ dsl_dir_close(dd, FTAG);
+ return;
+ }
+ ASSERT3U(err, ==, ENOENT);
+ }
+
+ /* Callbacks are invoked with dd_lock held. */
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_propname, propname) == 0) {
+ cbr->cbr_func(cbr->cbr_arg, value);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ /* Recurse into every entry of the child-directory ZAP. */
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, mos,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_prop_changed_notify(dp, za->za_first_integer,
+ propname, value, FALSE);
+ }
+ kmem_free(za, sizeof (zap_attribute_t));
+ zap_cursor_fini(&zc);
+ dsl_dir_close(dd, FTAG);
+}
+
+/*
+ * Arguments handed from dsl_prop_set_dd() to dsl_prop_set_sync().
+ */
+struct prop_set_arg {
+ const char *name; /* property name */
+ int intsz; /* size in bytes of each integer in buf */
+ int numints; /* number of integers; 0 removes the setting */
+ const void *buf; /* new value */
+};
+
+
+/*
+ * Sync task that applies a property change to directory 'arg1' as
+ * described by the struct prop_set_arg in 'arg2'.
+ *
+ * numints == 0 removes the local setting (inherit); otherwise the value
+ * is written to the directory's props ZAP.  For numeric properties the
+ * registered callbacks are notified of the effective value.  The change
+ * is recorded in the pool history.
+ */
+static void
+dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = arg1;
+	struct prop_set_arg *psa = arg2;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
+	uint64_t intval;
+	int isint;
+	char valbuf[32];
+	char *valstr;
+
+	/*
+	 * dodefault() succeeds with an 8-byte integer buffer only for
+	 * known, settable, non-string properties; it also seeds intval
+	 * with the property's default.
+	 */
+	isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+
+	if (psa->numints == 0) {
+		int err = zap_remove(mos, zapobj, psa->name, tx);
+		ASSERT(err == 0 || err == ENOENT);
+		if (isint) {
+			/* The effective value is now the inherited one. */
+			VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
+			    psa->name, 8, 1, &intval, NULL));
+		}
+	} else {
+		VERIFY(0 == zap_update(mos, zapobj, psa->name,
+		    psa->intsz, psa->numints, psa->buf, tx));
+		if (isint)
+			intval = *(uint64_t *)psa->buf;
+	}
+
+	if (isint) {
+		dsl_prop_changed_notify(dd->dd_pool,
+		    dd->dd_object, psa->name, intval, TRUE);
+		/*
+		 * intval is unsigned; format it as such, matching
+		 * dsl_prop_set_uint64_sync().
+		 */
+		(void) snprintf(valbuf, sizeof (valbuf),
+		    "%llu", (u_longlong_t)intval);
+		valstr = valbuf;
+	} else {
+		/* NOTE(review): buf may be NULL here when numints == 0. */
+		valstr = (char *)psa->buf;
+	}
+	spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT :
+	    LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+	    "%s=%s dataset = %llu", psa->name, valstr,
+	    dd->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Set a 64-bit property on 'dd' directly from syncing context: write it
+ * to the directory's props ZAP, notify registered callbacks, and log
+ * the change to the pool history.
+ */
+void
+dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+    cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY(0 == zap_update(dp->dp_meta_objset,
+	    dd->dd_phys->dd_props_zapobj, name, sizeof (val), 1, &val, tx));
+
+	dsl_prop_changed_notify(dp, dd->dd_object, name, val, TRUE);
+
+	spa_history_internal_log(LOG_DS_PROPSET, dp->dp_spa, tx, cr,
+	    "%s=%llu dataset = %llu", name, (u_longlong_t)val,
+	    dd->dd_phys->dd_head_dataset_obj);
+}
+
+/*
+ * Set a property on an open dsl_dir by running dsl_prop_set_sync() as a
+ * sync task (no check function) and returning that task's result.
+ */
+int
+dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
+    int intsz, int numints, const void *buf)
+{
+	struct prop_set_arg args;
+	int rc;
+
+	args.buf = buf;
+	args.numints = numints;
+	args.intsz = intsz;
+	args.name = propname;
+
+	rc = dsl_sync_task_do(dd->dd_pool,
+	    NULL, dsl_prop_set_sync, dd, &args, 2);
+	return (rc);
+}
+
+/*
+ * Set a property on the directory named 'ddname'.
+ *
+ * Returns ENAMETOOLONG or E2BIG if the name or value is too large for a
+ * ZAP entry, an error from dsl_dir_open(), or the result of
+ * dsl_prop_set_dd().
+ */
+int
+dsl_prop_set(const char *ddname, const char *propname,
+    int intsz, int numints, const void *buf)
+{
+	dsl_dir_t *dir;
+	int rc;
+
+	/*
+	 * We must do these checks before we get to the syncfunc, since
+	 * it can't fail.
+	 */
+	if (strlen(propname) >= ZAP_MAXNAMELEN)
+		return (ENAMETOOLONG);
+	if (intsz * numints >= ZAP_MAXVALUELEN)
+		return (E2BIG);
+
+	rc = dsl_dir_open(ddname, FTAG, &dir, NULL);
+	if (rc == 0) {
+		rc = dsl_prop_set_dd(dir, propname, intsz, numints, buf);
+		dsl_dir_close(dir, FTAG);
+	}
+	return (rc);
+}
+
+/*
+ * Iterate over all properties for this dataset and return them in an nvlist.
+ *
+ * Walks the dataset's dsl_dir and each ancestor, adding every property
+ * found in a props ZAP that is not already in the list.  Each entry is a
+ * nested nvlist holding ZPROP_VALUE and ZPROP_SOURCE (the name of the
+ * directory where the value was found).  Known non-inheritable
+ * properties are taken only from the dataset's own directory.
+ *
+ * *nvp is always allocated, even when an error is returned.  Returns 0
+ * on success or the error that stopped the iteration.
+ */
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	dsl_dir_t *dd = ds->ds_dir;
+	boolean_t snapshot;
+	int err = 0;
+	dsl_pool_t *dp;
+	objset_t *mos;
+
+	snapshot = dsl_dataset_is_snapshot(ds);
+
+	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	dp = dd->dd_pool;
+	mos = dp->dp_meta_objset;
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	for (; dd != NULL; dd = dd->dd_parent) {
+		char setpoint[MAXNAMELEN];
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		dsl_dir_name(dd, setpoint);
+
+		for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
+		    (err = zap_cursor_retrieve(&zc, &za)) == 0;
+		    zap_cursor_advance(&zc)) {
+			nvlist_t *propval;
+			zfs_prop_t prop;
+			/*
+			 * Skip non-inheritable properties.
+			 */
+			if ((prop = zfs_name_to_prop(za.za_name)) !=
+			    ZPROP_INVAL && !zfs_prop_inheritable(prop) &&
+			    dd != ds->ds_dir)
+				continue;
+
+			if (snapshot &&
+			    !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+				continue;
+
+			/* A closer directory already supplied this one. */
+			if (nvlist_lookup_nvlist(*nvp, za.za_name,
+			    &propval) == 0)
+				continue;
+
+			VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
+			    KM_SLEEP) == 0);
+			if (za.za_integer_length == 1) {
+				/*
+				 * String property
+				 */
+				char *tmp = kmem_alloc(za.za_num_integers,
+				    KM_SLEEP);
+				err = zap_lookup(mos,
+				    dd->dd_phys->dd_props_zapobj,
+				    za.za_name, 1, za.za_num_integers,
+				    tmp);
+				if (err != 0) {
+					kmem_free(tmp, za.za_num_integers);
+					/* don't leak the entry being built */
+					nvlist_free(propval);
+					break;
+				}
+				VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+				    tmp) == 0);
+				kmem_free(tmp, za.za_num_integers);
+			} else {
+				/*
+				 * Integer property
+				 */
+				ASSERT(za.za_integer_length == 8);
+				(void) nvlist_add_uint64(propval, ZPROP_VALUE,
+				    za.za_first_integer);
+			}
+
+			VERIFY(nvlist_add_string(propval, ZPROP_SOURCE,
+			    setpoint) == 0);
+			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
+			    propval) == 0);
+			nvlist_free(propval);
+		}
+		zap_cursor_fini(&zc);
+
+		/* ENOENT means the cursor ran out; anything else is fatal. */
+		if (err != ENOENT)
+			break;
+		err = 0;
+	}
+	rw_exit(&dp->dp_config_rwlock);
+
+	return (err);
+}
+
+/*
+ * Add property 'prop' to 'nv' as a nested nvlist whose ZPROP_VALUE is
+ * the 64-bit integer 'value'.
+ */
+void
+dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
+{
+	const char *name = zfs_prop_to_name(prop);
+	nvlist_t *entry;
+
+	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, value) == 0);
+	VERIFY(nvlist_add_nvlist(nv, name, entry) == 0);
+	/* The temporary is freed once it has been added to 'nv'. */
+	nvlist_free(entry);
+}
+
+/*
+ * Add property 'prop' to 'nv' as a nested nvlist whose ZPROP_VALUE is
+ * the string 'value'.
+ */
+void
+dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
+{
+	const char *name = zfs_prop_to_name(prop);
+	nvlist_t *entry;
+
+	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_string(entry, ZPROP_VALUE, value) == 0);
+	VERIFY(nvlist_add_nvlist(nv, name, entry) == 0);
+	/* The temporary is freed once it has been added to 'nv'. */
+	nvlist_free(entry);
+}
diff --git a/zfs/lib/libzpool/dsl_synctask.c b/zfs/lib/libzpool/dsl_synctask.c
new file mode 100644
index 000000000..305a23bf2
--- /dev/null
+++ b/zfs/lib/libzpool/dsl_synctask.c
@@ -0,0 +1,225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)dsl_synctask.c 1.5 07/10/29 SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/cred.h>
+
+/*
+ * Shift applied to a task's blocks_modified count to estimate the bytes
+ * it will dirty (1 << 14 bytes per block).
+ */
+#define DST_AVG_BLKSHIFT 14
+
+/*
+ * Check function that always succeeds; dsl_sync_task_create() installs
+ * it when the caller passes a NULL checkfunc.
+ */
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+/*
+ * Allocate an empty, zeroed sync task group for pool 'dp', recording
+ * the caller's credentials.  Free with dsl_sync_task_group_destroy().
+ */
+dsl_sync_task_group_t *
+dsl_sync_task_group_create(dsl_pool_t *dp)
+{
+	dsl_sync_task_group_t *grp;
+
+	grp = kmem_zalloc(sizeof (*grp), KM_SLEEP);
+	grp->dstg_pool = dp;
+	grp->dstg_cr = CRED();
+	list_create(&grp->dstg_tasks, sizeof (dsl_sync_task_t),
+	    offsetof(dsl_sync_task_t, dst_node));
+
+	return (grp);
+}
+
+/*
+ * Append a task to 'dstg'.  A NULL checkfunc is replaced by one that
+ * always succeeds.  'blocks_modified' feeds the group's space estimate.
+ */
+void
+dsl_sync_task_create(dsl_sync_task_group_t *dstg,
+    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+    void *arg1, void *arg2, int blocks_modified)
+{
+	dsl_sync_task_t *task;
+
+	task = kmem_zalloc(sizeof (*task), KM_SLEEP);
+	task->dst_checkfunc =
+	    (checkfunc != NULL) ? checkfunc : dsl_null_checkfunc;
+	task->dst_syncfunc = syncfunc;
+	task->dst_arg1 = arg1;
+	task->dst_arg2 = arg2;
+	list_insert_tail(&dstg->dstg_tasks, task);
+
+	dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
+}
+
+/*
+ * Queue the task group for execution in syncing context and block until
+ * that txg has synced.
+ *
+ * The check functions are first run here, in open context, under the
+ * config lock as reader; if any fails the group is not queued and that
+ * error is returned.  If the sync-context run leaves dstg_err == EAGAIN
+ * the whole sequence is retried in a new transaction.  Returns the
+ * group's final dstg_err.
+ */
+int
+dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
+{
+ dmu_tx_t *tx;
+ uint64_t txg;
+ dsl_sync_task_t *dst;
+
+top:
+ tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+
+ /* Do a preliminary error check. */
+ dstg->dstg_err = 0;
+ rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+#ifdef ZFS_DEBUG
+ /*
+ * Only check half the time, otherwise, the sync-context
+ * check will almost never fail.
+ */
+ if (spa_get_random(2) == 0)
+ continue;
+#endif
+ dst->dst_err =
+ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ /* The group error ends up as the last failing task's error. */
+ if (dst->dst_err)
+ dstg->dstg_err = dst->dst_err;
+ }
+ rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+
+ if (dstg->dstg_err) {
+ dmu_tx_commit(tx);
+ return (dstg->dstg_err);
+ }
+
+ /* Hand the group to the sync thread for this txg. */
+ VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+
+ dmu_tx_commit(tx);
+
+ txg_wait_synced(dstg->dstg_pool, txg);
+
+ if (dstg->dstg_err == EAGAIN)
+ goto top;
+
+ return (dstg->dstg_err);
+}
+
+/*
+ * Queue the task group on the txg of 'tx' without waiting for it.  The
+ * group is flagged as having no waiter, so dsl_sync_task_group_sync()
+ * destroys it after running it.
+ */
+void
+dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+{
+	uint64_t target_txg;
+
+	dstg->dstg_nowaiter = B_TRUE;
+	target_txg = dmu_tx_get_txg(tx);
+	VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg,
+	    target_txg));
+}
+
+/*
+ * Free every task in the group, then the group itself.
+ */
+void
+dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
+{
+	dsl_sync_task_t *task;
+
+	while ((task = list_head(&dstg->dstg_tasks)) != NULL) {
+		list_remove(&dstg->dstg_tasks, task);
+		kmem_free(task, sizeof (dsl_sync_task_t));
+	}
+	kmem_free(dstg, sizeof (dsl_sync_task_group_t));
+}
+
+/*
+ * Run a task group in syncing context.
+ *
+ * Space is temp-reserved against the MOS directory first; on failure
+ * the error is left in dstg_err (ERESTART is mapped to EAGAIN so a
+ * waiter retries) and nothing runs.  Otherwise every check function is
+ * called under the config lock as writer, and only if all pass are the
+ * sync functions executed.  A group queued with
+ * dsl_sync_task_group_nowait() is destroyed here on every path, since
+ * no one else will reap it.
+ */
+void
+dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+{
+	dsl_sync_task_t *dst;
+	void *tr_cookie;
+
+	ASSERT3U(dstg->dstg_err, ==, 0);
+
+	/*
+	 * Check for sufficient space.
+	 */
+	dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
+	    dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
+	/* don't bother trying again */
+	if (dstg->dstg_err == ERESTART)
+		dstg->dstg_err = EAGAIN;
+	if (dstg->dstg_err) {
+		/* Previously a nowait group was leaked on this path. */
+		if (dstg->dstg_nowaiter)
+			dsl_sync_task_group_destroy(dstg);
+		return;
+	}
+
+	/*
+	 * Check for errors by calling checkfuncs.
+	 */
+	rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+	for (dst = list_head(&dstg->dstg_tasks); dst;
+	    dst = list_next(&dstg->dstg_tasks, dst)) {
+		dst->dst_err =
+		    dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
+		if (dst->dst_err)
+			dstg->dstg_err = dst->dst_err;
+	}
+
+	if (dstg->dstg_err == 0) {
+		/*
+		 * Execute sync tasks.
+		 */
+		for (dst = list_head(&dstg->dstg_tasks); dst;
+		    dst = list_next(&dstg->dstg_tasks, dst)) {
+			dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
+			    dstg->dstg_cr, tx);
+		}
+	}
+	rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+
+	dsl_dir_tempreserve_clear(tr_cookie, tx);
+
+	if (dstg->dstg_nowaiter)
+		dsl_sync_task_group_destroy(dstg);
+}
+
+/*
+ * Convenience wrapper: build a one-task group, run it synchronously via
+ * dsl_sync_task_group_wait(), destroy the group, and return the result.
+ */
+int
+dsl_sync_task_do(dsl_pool_t *dp,
+    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+    void *arg1, void *arg2, int blocks_modified)
+{
+	dsl_sync_task_group_t *grp = dsl_sync_task_group_create(dp);
+	int rc;
+
+	dsl_sync_task_create(grp, checkfunc, syncfunc,
+	    arg1, arg2, blocks_modified);
+	rc = dsl_sync_task_group_wait(grp);
+	dsl_sync_task_group_destroy(grp);
+
+	return (rc);
+}
+
+/*
+ * Convenience wrapper: build a one-task group and queue it on the txg
+ * of 'tx' without waiting for it to run.
+ */
+void
+dsl_sync_task_do_nowait(dsl_pool_t *dp,
+    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+    void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx)
+{
+	dsl_sync_task_group_t *grp = dsl_sync_task_group_create(dp);
+
+	dsl_sync_task_create(grp, checkfunc, syncfunc,
+	    arg1, arg2, blocks_modified);
+	dsl_sync_task_group_nowait(grp, tx);
+}
diff --git a/zfs/lib/libzpool/fletcher.c b/zfs/lib/libzpool/fletcher.c
new file mode 100644
index 000000000..299d70c11
--- /dev/null
+++ b/zfs/lib/libzpool/fletcher.c
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)fletcher.c 1.2 06/03/03 SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+
+/*
+ * Fletcher-2 checksum of 'size' bytes at 'buf' in native byte order.
+ * The data is consumed as pairs of 64-bit words feeding two independent
+ * (a, b) accumulator lanes; the result is stored in *zcp.
+ */
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint64_t *word = buf;
+	const uint64_t *end = word + (size / sizeof (uint64_t));
+	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
+
+	while (word < end) {
+		a0 += word[0];
+		a1 += word[1];
+		b0 += a0;
+		b1 += a1;
+		word += 2;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+/*
+ * Fletcher-2 checksum of 'size' bytes at 'buf', byte-swapping each
+ * 64-bit word before it is accumulated.  The result is stored in *zcp.
+ */
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint64_t *word = buf;
+	const uint64_t *end = word + (size / sizeof (uint64_t));
+	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
+
+	while (word < end) {
+		a0 += BSWAP_64(word[0]);
+		a1 += BSWAP_64(word[1]);
+		b0 += a0;
+		b1 += a1;
+		word += 2;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+/*
+ * Fletcher-4 checksum of 'size' bytes at 'buf' in native byte order.
+ * Each 32-bit word is folded into four cascaded 64-bit running sums;
+ * the result is stored in *zcp.
+ */
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *end = word + (size / sizeof (uint32_t));
+	uint64_t a = 0, b = 0, c = 0, d = 0;
+
+	while (word < end) {
+		a += *word++;
+		b += a;
+		c += b;
+		d += c;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+/*
+ * Fletcher-4 checksum of 'size' bytes at 'buf', byte-swapping each
+ * 32-bit word before it is accumulated.  The result is stored in *zcp.
+ */
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *end = word + (size / sizeof (uint32_t));
+	uint64_t a = 0, b = 0, c = 0, d = 0;
+
+	while (word < end) {
+		a += BSWAP_32(word[0]);
+		b += a;
+		c += b;
+		d += c;
+		word++;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+/*
+ * Continue a native-order Fletcher-4 checksum: the running sums are
+ * loaded from *zcp, updated with 'size' bytes at 'buf', and stored back.
+ */
+void
+fletcher_4_incremental_native(const void *buf, uint64_t size,
+    zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *end = word + (size / sizeof (uint32_t));
+	uint64_t a = zcp->zc_word[0];
+	uint64_t b = zcp->zc_word[1];
+	uint64_t c = zcp->zc_word[2];
+	uint64_t d = zcp->zc_word[3];
+
+	while (word < end) {
+		a += *word++;
+		b += a;
+		c += b;
+		d += c;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+/*
+ * Continue a byte-swapped Fletcher-4 checksum: the running sums are
+ * loaded from *zcp, updated with the byte-swapped 32-bit words of
+ * 'buf', and stored back.
+ */
+void
+fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
+    zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *end = word + (size / sizeof (uint32_t));
+	uint64_t a = zcp->zc_word[0];
+	uint64_t b = zcp->zc_word[1];
+	uint64_t c = zcp->zc_word[2];
+	uint64_t d = zcp->zc_word[3];
+
+	while (word < end) {
+		a += BSWAP_32(word[0]);
+		b += a;
+		c += b;
+		d += c;
+		word++;
+	}
+
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
diff --git a/zfs/lib/libzpool/gzip.c b/zfs/lib/libzpool/gzip.c
new file mode 100644
index 000000000..94c76042d
--- /dev/null
+++ b/zfs/lib/libzpool/gzip.c
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)gzip.c 1.1 07/03/22 SMI"
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/zmod.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+
+/*
+ * Compress s_len bytes from s_start into d_start (capacity d_len) at
+ * compression level n.
+ *
+ * Returns the compressed length on success.  On failure returns s_len;
+ * in that case the source is copied verbatim to the destination only
+ * when the two buffers are the same size.
+ */
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	size_t outlen = d_len;
+
+	ASSERT(d_len <= s_len);
+
+	if (z_compress_level(d_start, &outlen, s_start, s_len, n) == Z_OK)
+		return (outlen);
+
+	if (d_len == s_len)
+		bcopy(s_start, d_start, s_len);
+
+	return (s_len);
+}
+
+/*
+ * Decompress s_len bytes from s_start into d_start (capacity d_len).
+ * The level argument n is unused.  Returns 0 on success, -1 on failure.
+ */
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	size_t outlen = d_len;
+
+	ASSERT(d_len >= s_len);
+
+	return (z_uncompress(d_start, &outlen, s_start, s_len) == Z_OK ?
+	    0 : -1);
+}
diff --git a/zfs/lib/libzpool/kernel.c b/zfs/lib/libzpool/kernel.c
new file mode 100644
index 000000000..71317446d
--- /dev/null
+++ b/zfs/lib/libzpool/kernel.c
@@ -0,0 +1,894 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <assert.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/spa.h>
+#include <sys/stat.h>
+#include <sys/processor.h>
+#include <sys/zfs_context.h>
+#include <sys/zmod.h>
+#include <sys/utsname.h>
+
+/*
+ * Emulation of kernel services in userland.
+ */
+
+/* Userland definitions of globals that kernel code references. */
+uint64_t physmem;
+/* Fixed non-NULL sentinel; NOTE(review): presumably never dereferenced. */
+vnode_t *rootdir = (vnode_t *)0xabcd1234;
+char hw_serial[11];
+
+/* Static identity reported for the userland environment. */
+struct utsname utsname = {
+ "userland", "libzpool", "1", "1", "na"
+};
+
+/*
+ * =========================================================================
+ * threads
+ * =========================================================================
+ */
+/*
+ * Start a detached thread running func(arg) via thr_create().  Creation
+ * failure trips VERIFY.  The thread id is returned cast to a pointer.
+ */
+/*ARGSUSED*/
+kthread_t *
+zk_thread_create(void (*func)(), void *arg)
+{
+ thread_t tid;
+
+ VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
+ &tid) == 0);
+
+ return ((void *)(uintptr_t)tid);
+}
+
+/*
+ * =========================================================================
+ * kstats
+ * =========================================================================
+ */
+/*
+ * Stub: no kstat is created; always returns NULL.
+ */
+/*ARGSUSED*/
+kstat_t *
+kstat_create(char *module, int instance, char *name, char *class,
+ uchar_t type, ulong_t ndata, uchar_t ks_flag)
+{
+ return (NULL);
+}
+
+/*
+ * Stub: does nothing.
+ */
+/*ARGSUSED*/
+void
+kstat_install(kstat_t *ksp)
+{}
+
+/*
+ * Stub: does nothing.
+ */
+/*ARGSUSED*/
+void
+kstat_delete(kstat_t *ksp)
+{}
+
+/*
+ * =========================================================================
+ * mutexes
+ * =========================================================================
+ */
+/*
+ * Initialize an emulated kernel mutex: no owner, marked initialized,
+ * with the underlying lock set up as USYNC_THREAD.
+ */
+void
+zmutex_init(kmutex_t *mp)
+{
+ mp->m_owner = NULL;
+ mp->initialized = B_TRUE;
+ (void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
+}
+
+/*
+ * Destroy an emulated kernel mutex.  It must be initialized and
+ * unowned.  Afterwards m_owner holds the poison value -1, which the
+ * enter routines assert against.
+ */
+void
+zmutex_destroy(kmutex_t *mp)
+{
+ ASSERT(mp->initialized == B_TRUE);
+ ASSERT(mp->m_owner == NULL);
+ (void) _mutex_destroy(&(mp)->m_lock);
+ mp->m_owner = (void *)-1UL;
+ mp->initialized = B_FALSE;
+}
+
+/*
+ * Acquire the mutex, blocking if necessary, and record the calling
+ * thread as owner.  Asserts that the mutex is initialized, not
+ * destroyed, and not already owned by the caller.
+ */
+void
+mutex_enter(kmutex_t *mp)
+{
+ ASSERT(mp->initialized == B_TRUE);
+ ASSERT(mp->m_owner != (void *)-1UL);
+ ASSERT(mp->m_owner != curthread);
+ VERIFY(mutex_lock(&mp->m_lock) == 0);
+ /* With the lock held, no owner may still be recorded. */
+ ASSERT(mp->m_owner == NULL);
+ mp->m_owner = curthread;
+}
+
+/*
+ * Try to acquire the mutex without blocking.  Returns 1 and records
+ * the caller as owner on success, 0 if the lock could not be taken.
+ */
+int
+mutex_tryenter(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner != (void *)-1UL);
+
+	if (mutex_trylock(&mp->m_lock) != 0)
+		return (0);
+
+	ASSERT(mp->m_owner == NULL);
+	mp->m_owner = curthread;
+	return (1);
+}
+
+/*
+ * Release the mutex.  The caller must be the recorded owner; ownership
+ * is cleared before the underlying lock is dropped.
+ */
+void
+mutex_exit(kmutex_t *mp)
+{
+ ASSERT(mp->initialized == B_TRUE);
+ ASSERT(mutex_owner(mp) == curthread);
+ mp->m_owner = NULL;
+ VERIFY(mutex_unlock(&mp->m_lock) == 0);
+}
+
+/*
+ * Return the recorded owner of the mutex (NULL when unowned).
+ */
+void *
+mutex_owner(kmutex_t *mp)
+{
+ ASSERT(mp->initialized == B_TRUE);
+ return (mp->m_owner);
+}
+
+/*
+ * =========================================================================
+ * rwlocks
+ * =========================================================================
+ */
+/*
+ * Initialize an emulated kernel rwlock.  The name, type and arg
+ * arguments are ignored.
+ */
+/*ARGSUSED*/
+void
+rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
+{
+ rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
+ rwlp->rw_owner = NULL;
+ rwlp->initialized = B_TRUE;
+}
+
+/*
+ * Destroy an emulated kernel rwlock and poison its owner field with -1.
+ */
+void
+rw_destroy(krwlock_t *rwlp)
+{
+ rwlock_destroy(&rwlp->rw_lock);
+ rwlp->rw_owner = (void *)-1UL;
+ rwlp->initialized = B_FALSE;
+}
+
+/*
+ * Acquire the rwlock as reader (rw == RW_READER) or writer (any other
+ * value), blocking if necessary.  The calling thread is recorded in
+ * rw_owner in both modes.
+ */
+void
+rw_enter(krwlock_t *rwlp, krw_t rw)
+{
+ ASSERT(!RW_LOCK_HELD(rwlp));
+ ASSERT(rwlp->initialized == B_TRUE);
+ ASSERT(rwlp->rw_owner != (void *)-1UL);
+ ASSERT(rwlp->rw_owner != curthread);
+
+ if (rw == RW_READER)
+ (void) rw_rdlock(&rwlp->rw_lock);
+ else
+ (void) rw_wrlock(&rwlp->rw_lock);
+
+ rwlp->rw_owner = curthread;
+}
+
+/*
+ * Release the rwlock, clearing the recorded owner first.
+ */
+void
+rw_exit(krwlock_t *rwlp)
+{
+ ASSERT(rwlp->initialized == B_TRUE);
+ ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+ rwlp->rw_owner = NULL;
+ (void) rw_unlock(&rwlp->rw_lock);
+}
+
+/*
+ * Try to acquire the rwlock as reader or writer without blocking.
+ * Returns 1 and records the caller in rw_owner on success, 0 otherwise.
+ */
+int
+rw_tryenter(krwlock_t *rwlp, krw_t rw)
+{
+	int failed;
+
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	failed = (rw == RW_READER) ?
+	    rw_tryrdlock(&rwlp->rw_lock) : rw_trywrlock(&rwlp->rw_lock);
+	if (failed != 0)
+		return (0);
+
+	rwlp->rw_owner = curthread;
+	return (1);
+}
+
+/*
+ * Reader-to-writer upgrade is not emulated: after sanity checks this
+ * always returns 0 (upgrade failed).
+ */
+/*ARGSUSED*/
+int
+rw_tryupgrade(krwlock_t *rwlp)
+{
+ ASSERT(rwlp->initialized == B_TRUE);
+ ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+ return (0);
+}
+
+/*
+ * =========================================================================
+ * condition variables
+ * =========================================================================
+ */
+/*
+ * Initialize a condition variable.  'type' is passed to cond_init();
+ * 'name' and 'arg' are ignored.  Failure trips VERIFY.
+ */
+/*ARGSUSED*/
+void
+cv_init(kcondvar_t *cv, char *name, int type, void *arg)
+{
+ VERIFY(cond_init(cv, type, NULL) == 0);
+}
+
+/*
+ * Destroy a condition variable.  Failure trips VERIFY.
+ */
+void
+cv_destroy(kcondvar_t *cv)
+{
+ VERIFY(cond_destroy(cv) == 0);
+}
+
+/*
+ * Wait on 'cv'.  The caller must own 'mp'.  The recorded owner is
+ * cleared for the duration of the wait and restored afterwards.
+ * EINTR from cond_wait() is tolerated, so the wait can return without
+ * a signal.
+ */
+void
+cv_wait(kcondvar_t *cv, kmutex_t *mp)
+{
+ ASSERT(mutex_owner(mp) == curthread);
+ mp->m_owner = NULL;
+ int ret = cond_wait(cv, &mp->m_lock);
+ VERIFY(ret == 0 || ret == EINTR);
+ mp->m_owner = curthread;
+}
+
+/*
+ * Wait on 'cv' until signalled or until the tick count lbolt reaches
+ * 'abstime'.  The caller must own 'mp'.
+ *
+ * Returns -1 if the deadline has already passed or the wait timed out,
+ * 1 if the wait completed otherwise.  An interrupted wait (EINTR) is
+ * restarted with a freshly computed remaining time.
+ */
+clock_t
+cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
+{
+	int error;
+	timestruc_t ts;
+	clock_t delta;
+
+	for (;;) {
+		delta = abstime - lbolt;
+		if (delta <= 0)
+			return (-1);
+
+		/* Convert the remaining ticks to seconds + nanoseconds. */
+		ts.tv_sec = delta / hz;
+		ts.tv_nsec = (delta % hz) * (NANOSEC / hz);
+
+		ASSERT(mutex_owner(mp) == curthread);
+		mp->m_owner = NULL;
+		error = cond_reltimedwait(cv, &mp->m_lock, &ts);
+		mp->m_owner = curthread;
+
+		if (error != EINTR)
+			break;
+	}
+
+	if (error == ETIME)
+		return (-1);
+
+	ASSERT(error == 0);
+
+	return (1);
+}
+
+/*
+ * Signal the condition variable via cond_signal().  Failure trips
+ * VERIFY.
+ */
+void
+cv_signal(kcondvar_t *cv)
+{
+ VERIFY(cond_signal(cv) == 0);
+}
+
+/*
+ * Broadcast on 'cv' via cond_broadcast(); failure is fatal.
+ */
+void
+cv_broadcast(kcondvar_t *cv)
+{
+	VERIFY(cond_broadcast(cv) == 0);
+}
+
+/*
+ * =========================================================================
+ * vnode operations
+ * =========================================================================
+ */
+/*
+ * Note: for the xxxat() versions of these functions, we assume that the
+ * starting vp is always rootdir (which is true for spa_directory.c, the only
+ * ZFS consumer of these interfaces). We assert this is true, and then emulate
+ * them by adding '/' in front of the path.
+ */
+
+/*
+ * Open 'path' and hand back a newly allocated vnode in *vpp.
+ *
+ * Returns 0 on success or an errno value on failure (ENAMETOOLONG if the
+ * name to open does not fit in MAXPATHLEN).  x1, x2 and x3 are unused.
+ */
+/*ARGSUSED*/
+int
+vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
+{
+	int fd;
+	int err;
+	int len;
+	vnode_t *vp;
+	int old_umask = 0;
+	char realpath[MAXPATHLEN];
+	struct stat64 st;
+
+	/*
+	 * If we're accessing a real disk from userland, we need to use
+	 * the character interface to avoid caching.  This is particularly
+	 * important if we're trying to look at a real in-kernel storage
+	 * pool from userland, e.g. via zdb, because otherwise we won't
+	 * see the changes occurring under the segmap cache.
+	 * On the other hand, the stupid character device returns zero
+	 * for its size.  So -- gag -- we open the block device to get
+	 * its size, and remember it for subsequent VOP_GETATTR().
+	 *
+	 * NOTE(review): 'st' is overwritten by the fstat64() of the final
+	 * descriptor below, so v_size is the size reported for realpath,
+	 * not for the block device -- confirm this is intended.
+	 */
+	if (strncmp(path, "/dev/", 5) == 0) {
+		char *dsk;
+
+		fd = open64(path, O_RDONLY);
+		if (fd == -1)
+			return (errno);
+		if (fstat64(fd, &st) == -1) {
+			err = errno;	/* close() may overwrite errno */
+			(void) close(fd);
+			return (err);
+		}
+		(void) close(fd);
+
+		/* Rewrite ".../dsk/..." as the raw device ".../rdsk/...". */
+		dsk = strstr(path, "/dsk/");
+		if (dsk != NULL) {
+			len = snprintf(realpath, sizeof (realpath),
+			    "%.*s/r%s", (int)(dsk - path), path, dsk + 1);
+		} else {
+			len = snprintf(realpath, sizeof (realpath), "%s",
+			    path);
+		}
+		/* Refuse names that do not fit in realpath[]. */
+		if (len < 0 || (size_t)len >= sizeof (realpath))
+			return (ENAMETOOLONG);
+	} else {
+		len = snprintf(realpath, sizeof (realpath), "%s", path);
+		if (len < 0 || (size_t)len >= sizeof (realpath))
+			return (ENAMETOOLONG);
+		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
+			return (errno);
+	}
+
+	/* When creating, apply 'mode' exactly: temporarily clear the umask. */
+	if (flags & FCREAT)
+		old_umask = umask(0);
+
+	/*
+	 * The construct 'flags - FREAD' conveniently maps combinations of
+	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
+	 */
+	fd = open64(realpath, flags - FREAD, mode);
+	err = errno;		/* umask() below must not disturb it */
+
+	if (flags & FCREAT)
+		(void) umask(old_umask);
+
+	if (fd == -1)
+		return (err);
+
+	if (fstat64(fd, &st) == -1) {
+		err = errno;
+		(void) close(fd);
+		return (err);
+	}
+
+	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
+
+	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
+
+	vp->v_fd = fd;
+	vp->v_size = st.st_size;
+	vp->v_path = spa_strdup(path);
+
+	return (0);
+}
+
+/*
+ * Emulate an open relative to 'startvp', which must be rootdir: the path
+ * is prefixed with '/' and handed to vn_open().  Returns vn_open()'s
+ * result.
+ */
+/*ARGSUSED*/
+int
+vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
+    int x3, vnode_t *startvp, int fd)
+{
+	/* Room for the leading '/' and the terminating NUL. */
+	size_t buflen = strlen(path) + 2;
+	char *realpath = umem_alloc(buflen, UMEM_NOFAIL);
+	int ret;
+
+	ASSERT(startvp == rootdir);
+	(void) sprintf(realpath, "/%s", path);
+
+	/* fd ignored for now, need if want to simulate nbmand support */
+	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
+
+	umem_free(realpath, buflen);
+
+	return (ret);
+}
+
+/*
+ * Read or write 'len' bytes at 'offset' through the vnode's descriptor.
+ *
+ * Returns 0 on success or an errno value if a system call failed.  When
+ * 'residp' is non-NULL the number of bytes not transferred is stored
+ * there; otherwise a short transfer is reported as EIO.
+ * x1 through x4 are unused.
+ */
+/*ARGSUSED*/
+int
+vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
+    int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
+{
+	ssize_t iolen, split;
+
+	if (uio == UIO_READ) {
+		iolen = pread64(vp->v_fd, addr, len, offset);
+	} else {
+		/*
+		 * To simulate partial disk writes, we split writes into two
+		 * system calls so that the process can be killed in between.
+		 */
+		split = (len > 0 ? rand() % len : 0);
+		iolen = pwrite64(vp->v_fd, addr, split, offset);
+		if (iolen >= 0) {
+			ssize_t tail = pwrite64(vp->v_fd,
+			    (char *)addr + split, len - split, offset + split);
+
+			/*
+			 * Check each half on its own: adding a -1 to a
+			 * positive count would hide the failure.
+			 */
+			if (tail < 0)
+				iolen = -1;
+			else
+				iolen += tail;
+		}
+	}
+
+	if (iolen < 0)
+		return (errno);
+	if (residp)
+		*residp = len - iolen;
+	else if (iolen != len)
+		return (EIO);
+	return (0);
+}
+
+/*
+ * Release a vnode created by vn_open(): close its descriptor, then free
+ * the saved path and the vnode itself.
+ */
+void
+vn_close(vnode_t *vp)
+{
+	close(vp->v_fd);
+
+	spa_strfree(vp->v_path);
+	umem_free(vp, sizeof (vnode_t));
+}
+
+#ifdef ZFS_DEBUG
+
+/*
+ * =========================================================================
+ * Figure out which debugging statements to print
+ * =========================================================================
+ */
+
+static char *dprintf_string;
+static int dprintf_print_all;
+
+/*
+ * Return 1 if 'string' appears as a complete element of the
+ * comma-separated dprintf_string list, 0 otherwise (including when no
+ * list has been configured).
+ *
+ * List format: file1.c,function_name1,file2.c,file3.c
+ */
+int
+dprintf_find_string(const char *string)
+{
+	const char *elem;
+	size_t len = strlen(string);
+
+	for (elem = dprintf_string; elem != NULL; ) {
+		/* Match only whole elements, not prefixes of longer ones. */
+		if (strncmp(elem, string, len) == 0 &&
+		    (elem[len] == ',' || elem[len] == '\0'))
+			return (1);
+
+		elem = strchr(elem, ',');
+		if (elem != NULL)
+			elem++;		/* step over the ',' */
+	}
+
+	return (0);
+}
+
+/*
+ * Pick the debug selection string and strip any "debug=..." arguments
+ * out of argv, adjusting *argc to match.
+ *
+ * Debugging can be specified two ways: by setting the environment
+ * variable ZFS_DEBUG, or by including a "debug=..." argument on the
+ * command line.  The command line setting overrides the environment
+ * variable; if several "debug=" arguments are given, the last one wins.
+ */
+void
+dprintf_setup(int *argc, char **argv)
+{
+	const size_t len = strlen("debug=");
+	int i, j;
+
+	/* First look for command line arguments */
+	i = 1;
+	while (i < *argc) {
+		if (strncmp("debug=", argv[i], len) != 0) {
+			i++;
+			continue;
+		}
+
+		dprintf_string = argv[i] + len;
+
+		/* Remove from args */
+		for (j = i; j < *argc; j++)
+			argv[j] = argv[j+1];
+		argv[j] = NULL;
+		(*argc)--;
+
+		/*
+		 * Do not advance i: the next argument has just been shifted
+		 * into this slot and must be examined too.
+		 */
+	}
+
+	if (dprintf_string == NULL) {
+		/* Look for ZFS_DEBUG environment variable */
+		dprintf_string = getenv("ZFS_DEBUG");
+	}
+
+	/*
+	 * Are we just turning on all debugging?
+	 */
+	if (dprintf_find_string("on"))
+		dprintf_print_all = 1;
+}
+
+/*
+ * =========================================================================
+ * debug printfs
+ * =========================================================================
+ */
+/*
+ * Print a debug message to stdout if debugging is enabled globally or for
+ * the given source file or function.  Optional prefixes (pid, tid, cpu,
+ * time, file and line) are emitted when the matching keyword is present
+ * in the debug string.
+ */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	const char *base;
+	va_list adx;
+
+	/*
+	 * Keep only the last path component (drops "../common/" and the
+	 * like).
+	 */
+	base = strrchr(file, '/');
+	base = (base != NULL) ? base + 1 : file;
+
+	if (!dprintf_print_all &&
+	    !dprintf_find_string(base) &&
+	    !dprintf_find_string(func))
+		return;
+
+	/* Emit the whole message under the stdout stream lock. */
+	flockfile(stdout);
+	if (dprintf_find_string("pid"))
+		(void) printf("%d ", getpid());
+	if (dprintf_find_string("tid"))
+		(void) printf("%u ", thr_self());
+	if (dprintf_find_string("cpu"))
+		(void) printf("%u ", getcpuid());
+	if (dprintf_find_string("time"))
+		(void) printf("%llu ", gethrtime());
+	if (dprintf_find_string("long"))
+		(void) printf("%s, line %d: ", base, line);
+	(void) printf("%s: ", func);
+	va_start(adx, fmt);
+	(void) vprintf(fmt, adx);
+	va_end(adx);
+	funlockfile(stdout);
+}
+
+#endif /* ZFS_DEBUG */
+
+/*
+ * =========================================================================
+ * cmn_err() and panic()
+ * =========================================================================
+ */
+/* Per-level message prefix and suffix, indexed by the cmn_err() level. */
+static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
+static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
+
+/*
+ * Print "error: <message>" followed by a newline on stderr, then abort
+ * the process.
+ */
+void
+vpanic(const char *fmt, va_list adx)
+{
+	(void) fprintf(stderr, "error: ");
+	(void) vfprintf(stderr, fmt, adx);
+	(void) fprintf(stderr, "\n");
+
+	/* think of it as a "user-level crash dump" */
+	abort();
+}
+
+/*
+ * Variadic front end for vpanic().
+ */
+void
+panic(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vpanic(fmt, args);
+	va_end(args);
+}
+
+/*
+ * Report a message at level 'ce' on stderr.  CE_PANIC goes to vpanic(),
+ * CE_NOTE is suppressed, and CE_IGNORE or any other level outside the
+ * prefix/suffix tables is dropped.
+ */
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+	if (ce == CE_PANIC)
+		vpanic(fmt, adx);
+
+	/*
+	 * ce_prefix[] and ce_suffix[] have exactly CE_IGNORE rows, so only
+	 * levels in [0, CE_IGNORE) may be used as an index.
+	 */
+	if (ce < 0 || ce >= CE_IGNORE)
+		return;
+
+	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
+		(void) fprintf(stderr, "%s", ce_prefix[ce]);
+		(void) vfprintf(stderr, fmt, adx);
+		(void) fprintf(stderr, "%s", ce_suffix[ce]);
+	}
+}
+
+/*
+ * Variadic front end for vcmn_err().
+ */
+/*PRINTFLIKE2*/
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vcmn_err(ce, fmt, args);
+	va_end(args);
+}
+
+/*
+ * =========================================================================
+ * kobj interfaces
+ * =========================================================================
+ */
+/*
+ * Open 'name' (relative to rootdir) for reading.  Returns a newly
+ * allocated struct _buf whose _fd field holds the vnode pointer, or
+ * (void *)-1UL if the open fails.
+ */
+struct _buf *
+kobj_open_file(char *name)
+{
+	vnode_t *vp;
+	struct _buf *file;
+	int err;
+
+	err = vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1);
+	if (err != 0)
+		return ((void *)-1UL);
+
+	/* set vp as the _fd field of the file */
+	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
+	file->_fd = (intptr_t)vp;
+
+	return (file);
+}
+
+/*
+ * Read up to 'size' bytes at offset 'off' from a file opened with
+ * kobj_open_file().  Returns the number of bytes read, or -1 if the
+ * underlying read failed.
+ */
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+	ssize_t resid = 0;
+
+	/*
+	 * vn_rdwr() returns before storing *residp when the read fails, so
+	 * its result must be checked before resid is used.
+	 */
+	if (vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
+	    UIO_SYSSPACE, 0, 0, 0, &resid) != 0)
+		return (-1);
+
+	return (size - resid);
+}
+
+/*
+ * Close a file opened with kobj_open_file(): release the vnode stored in
+ * _fd, then free the struct _buf.
+ */
+void
+kobj_close_file(struct _buf *file)
+{
+	vnode_t *vp = (vnode_t *)file->_fd;
+
+	vn_close(vp);
+	umem_free(file, sizeof (struct _buf));
+}
+
+/*
+ * Store the current size of 'file' in *size.  Returns 0 on success or the
+ * errno from fstat64() on failure; *size is untouched on failure.
+ *
+ * The file stays open in either case: 'file' still refers to the vnode,
+ * and the caller releases it with kobj_close_file().
+ */
+int
+kobj_get_filesize(struct _buf *file, uint64_t *size)
+{
+	struct stat64 st;
+	vnode_t *vp = (vnode_t *)file->_fd;
+
+	if (fstat64(vp->v_fd, &st) == -1)
+		return (errno);
+
+	*size = st.st_size;
+	return (0);
+}
+
+/*
+ * =========================================================================
+ * misc routines
+ * =========================================================================
+ */
+
+/*
+ * Sleep for 'ticks' clock ticks by calling poll() with no descriptors
+ * and a timeout of ticks * (1000 / hz) milliseconds (integer division).
+ */
+void
+delay(clock_t ticks)
+{
+	poll(0, 0, ticks * (1000 / hz));
+}
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+int
+highbit(ulong_t i)
+{
+	int h = 1;
+	int shift;
+
+	if (i == 0)
+		return (0);
+
+	/*
+	 * Binary search for the top set bit: at each step, if anything is
+	 * set above 'shift' bits, discard the low 'shift' bits and count
+	 * them.
+	 */
+#ifdef _LP64
+	shift = 32;
+#else
+	shift = 16;
+#endif
+	for (; shift != 0; shift >>= 1) {
+		if ((i >> shift) != 0) {
+			h += shift;
+			i >>= shift;
+		}
+	}
+
+	return (h);
+}
+
+static int random_fd = -1, urandom_fd = -1;
+
+/*
+ * Fill 'ptr' with 'len' bytes read from descriptor 'fd', looping until
+ * the whole buffer has been filled.  Always returns 0; a read error other
+ * than EINTR is fatal.
+ */
+static int
+random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
+{
+	size_t resid = len;
+	ssize_t bytes;
+
+	ASSERT(fd != -1);
+
+	while (resid != 0) {
+		bytes = read(fd, ptr, resid);
+		if (bytes == -1 && errno == EINTR)
+			continue;	/* interrupted: just retry */
+
+		/*
+		 * Must hold in every build: a -1 here would otherwise move
+		 * ptr backwards and grow resid.
+		 */
+		VERIFY(bytes >= 0);
+		ptr += bytes;
+		resid -= bytes;
+	}
+
+	return (0);
+}
+
+/*
+ * Fill 'ptr' with 'len' bytes read from random_fd.
+ */
+int
+random_get_bytes(uint8_t *ptr, size_t len)
+{
+	int fd = random_fd;
+
+	return (random_get_bytes_common(ptr, len, fd));
+}
+
+/*
+ * Fill 'ptr' with 'len' bytes read from urandom_fd.
+ */
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+	int fd = urandom_fd;
+
+	return (random_get_bytes_common(ptr, len, fd));
+}
+
+/*
+ * Convert the string 'hw_serial' to an unsigned long in the given base.
+ *
+ * The value is stored in *result.  If 'nptr' is non-NULL it receives a
+ * pointer to the first character that was not converted.  Returns 0 on
+ * success, or the errno set by strtoul() (e.g. ERANGE on overflow).
+ */
+int
+ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
+{
+	char *end;
+
+	/*
+	 * strtoul() reports failure only through errno, so clear it first;
+	 * otherwise a legitimate result of 0 would return a stale error.
+	 */
+	errno = 0;
+	*result = strtoul(hw_serial, &end, base);
+	if (nptr != NULL)
+		*nptr = end;
+
+	return (errno);
+}
+
+/*
+ * =========================================================================
+ * kernel emulation setup & teardown
+ * =========================================================================
+ */
+/*
+ * Allocation-failure callback registered with umem_nofail_callback():
+ * write a short message to stderr and abort.  The return statement is
+ * never reached.
+ */
+static int
+umem_out_of_memory(void)
+{
+	char errmsg[] = "out of memory -- generating core dump\n";
+
+	/* sizeof - 1: do not write the terminating NUL to stderr. */
+	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
+	abort();
+	return (0);
+}
+
+/*
+ * Set up the userland kernel emulation: install the out-of-memory
+ * callback, record physical memory size, host name information and the
+ * host id, open the random devices, and finally call spa_init(mode).
+ * Failure to open either random device is fatal.
+ */
+void
+kernel_init(int mode)
+{
+	umem_nofail_callback(umem_out_of_memory);
+
+	physmem = sysconf(_SC_PHYS_PAGES);
+	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
+	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
+
+	uname(&utsname);
+	snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
+
+	random_fd = open("/dev/random", O_RDONLY);
+	VERIFY(random_fd != -1);
+	urandom_fd = open("/dev/urandom", O_RDONLY);
+	VERIFY(urandom_fd != -1);
+
+	spa_init(mode);
+}
+
+/*
+ * Undo kernel_init(): call spa_fini(), then close both random devices
+ * and mark their descriptors as unused.
+ */
+void
+kernel_fini(void)
+{
+	spa_fini();
+
+	close(random_fd);
+	random_fd = -1;
+
+	close(urandom_fd);
+	urandom_fd = -1;
+}
+
+/*
+ * Wrapper around uncompress().  On entry *dstlen is the size of 'dst';
+ * when uncompress() returns Z_OK it is updated to the length it
+ * reported.  Returns uncompress()'s result code.
+ */
+int
+z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+	uLongf len = *dstlen;
+	int ret;
+
+	ret = uncompress(dst, &len, src, srclen);
+	if (ret == Z_OK)
+		*dstlen = (size_t)len;
+
+	return (ret);
+}
+
+/*
+ * Wrapper around compress2().  On entry *dstlen is the size of 'dst';
+ * when compress2() returns Z_OK it is updated to the length it reported.
+ * Returns compress2()'s result code.
+ */
+int
+z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
+    int level)
+{
+	uLongf len = *dstlen;
+	int ret;
+
+	ret = compress2(dst, &len, src, srclen, level);
+	if (ret == Z_OK)
+		*dstlen = (size_t)len;
+
+	return (ret);
+}
+
+/*
+ * Stub: text preparation is not provided here.  Always sets *err to
+ * EINVAL and returns (size_t)-1; all other arguments are ignored.
+ */
+/*ARGSUSED*/
+size_t u8_textprep_str(char *i, size_t *il, char *o, size_t *ol, int nf,
+    size_t vers, int *err)
+{
+	*err = EINVAL;
+
+	return ((size_t)-1);
+}
+
+/*
+ * Stub: every credential is reported as uid 0.
+ */
+/*ARGSUSED*/
+uid_t
+crgetuid(cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Stub: every credential is reported as gid 0.
+ */
+/*ARGSUSED*/
+gid_t
+crgetgid(cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Stub: every credential is reported as having no supplementary groups.
+ */
+/*ARGSUSED*/
+int
+crgetngroups(cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Stub: there is no group list, so this always returns NULL.
+ */
+/*ARGSUSED*/
+gid_t *
+crgetgroups(cred_t *cr)
+{
+	return (NULL);
+}
+
+/*
+ * Stub policy check: always returns 0.
+ */
+/*ARGSUSED*/
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Stub policy check: always returns 0.
+ */
+/*ARGSUSED*/
+int
+zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Stub policy check: always returns 0.
+ */
+/*ARGSUSED*/
+int
+zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Allocate a zeroed ksiddomain_t whose kd_name is a private copy of
+ * 'dom'.  Release it with ksiddomain_rele().
+ */
+ksiddomain_t *
+ksid_lookupdomain(const char *dom)
+{
+	ksiddomain_t *domain = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
+
+	domain->kd_name = spa_strdup(dom);
+
+	return (domain);
+}
+
+/*
+ * Free a ksiddomain_t from ksid_lookupdomain(): first its name copy,
+ * then the structure itself.
+ */
+void
+ksiddomain_rele(ksiddomain_t *ksid)
+{
+	spa_strfree(ksid->kd_name);
+
+	umem_free(ksid, sizeof (ksiddomain_t));
+}
diff --git a/zfs/lib/libzpool/lzjb.c b/zfs/lib/libzpool/lzjb.c
new file mode 100644
index 000000000..22f9c2b1b
--- /dev/null
+++ b/zfs/lib/libzpool/lzjb.c
@@ -0,0 +1,128 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)lzjb.c 1.3 07/03/22 SMI"
+
+/*
+ * We keep our own copy of this algorithm for 2 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return -1 if the data will not
+ * compress to d_len or less.
+ */
+
+#include <sys/types.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 256
+
+/*
+ * Compress s_len bytes at s_start into the buffer at d_start.
+ *
+ * Returns the number of bytes written to d_start.  If the output would
+ * not fit, returns s_len instead; in that case the input is copied
+ * verbatim to d_start only when d_len == s_len.  'n' is unused.
+ *
+ * Output format: each group of up to NBBY items is preceded by a copymap
+ * byte.  A clear bit means the item is one literal byte; a set bit means
+ * it is a two-byte reference holding (match length - MATCH_MIN) in the
+ * top MATCH_BITS bits and a backward offset in the remaining bits.
+ */
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *cpy, *copymap;
+ int copymask = 1 << (NBBY - 1);
+ int mlen, offset;
+ uint16_t *hp;
+ uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
+
+ while (src < (uchar_t *)s_start + s_len) {
+ /* Every NBBY items, start a new copymap byte. */
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ /* Need room for the copymap byte plus NBBY two-byte items. */
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+ if (d_len != s_len)
+ return (s_len);
+ mlen = s_len;
+ for (src = s_start, dst = d_start; mlen; mlen--)
+ *dst++ = *src++;
+ return (s_len);
+ }
+ copymask = 1;
+ copymap = dst;
+ *dst++ = 0;
+ }
+ /*
+ * Within MATCH_MAX bytes of the end, emit literals only, so the
+ * match scan below never reads past s_start + s_len.
+ */
+ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+ *dst++ = *src++;
+ continue;
+ }
+ /* Hash the next three bytes to pick a lempel[] slot. */
+ hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
+ (LEMPEL_SIZE - 1)];
+ /* The slot keeps only the low 16 bits of a source address. */
+ offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+ *hp = (uint16_t)(uintptr_t)src;
+ cpy = src - offset;
+ /* Accept the candidate only if it really matches MATCH_MIN bytes. */
+ if (cpy >= (uchar_t *)s_start && cpy != src &&
+ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+ *copymap |= copymask;
+ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+ if (src[mlen] != cpy[mlen])
+ break;
+ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+ (offset >> NBBY);
+ *dst++ = (uchar_t)offset;
+ src += mlen;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (dst - (uchar_t *)d_start);
+}
+
+/*
+ * Decompress data produced by lzjb_compress() until exactly d_len bytes
+ * have been written to d_start.
+ *
+ * Returns 0 on success, or -1 if a reference points before d_start.
+ * 's_len' and 'n' are unused, so reads from the source are not
+ * bounds-checked here.
+ */
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *d_end = (uchar_t *)d_start + d_len;
+ uchar_t *cpy, copymap;
+ int copymask = 1 << (NBBY - 1);
+
+ while (dst < d_end) {
+ /* Fetch a new copymap byte every NBBY items (and on the first pass). */
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ copymask = 1;
+ copymap = *src++;
+ }
+ if (copymap & copymask) {
+ /* Two-byte reference: length in the top bits, then the offset. */
+ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+ src += 2;
+ if ((cpy = dst - offset) < (uchar_t *)d_start)
+ return (-1);
+ /* Byte-by-byte copy; stops early at the end of the output. */
+ while (--mlen >= 0 && dst < d_end)
+ *dst++ = *cpy++;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (0);
+}
diff --git a/zfs/lib/libzpool/metaslab.c b/zfs/lib/libzpool/metaslab.c
new file mode 100644
index 000000000..00533efa8
--- /dev/null
+++ b/zfs/lib/libzpool/metaslab.c
@@ -0,0 +1,1053 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)metaslab.c 1.17 07/11/27 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+/*
+ * Allocate a zeroed metaslab class with an empty rotor (no groups).
+ */
+metaslab_class_t *
+metaslab_class_create(void)
+{
+	metaslab_class_t *mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+	mc->mc_rotor = NULL;
+
+	return (mc);
+}
+
+/*
+ * Remove and destroy every group still linked into the class, then free
+ * the class itself.
+ */
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+	for (;;) {
+		metaslab_group_t *mg = mc->mc_rotor;
+
+		if (mg == NULL)
+			break;
+
+		metaslab_class_remove(mc, mg);
+		metaslab_group_destroy(mg);
+	}
+
+	kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+/*
+ * Link 'mg' into the class's circular, doubly linked list of groups,
+ * directly after the current rotor, and make it the new rotor.  The
+ * group must not already belong to a class.
+ */
+void
+metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+	metaslab_group_t *rotor;
+
+	ASSERT(mg->mg_class == NULL);
+
+	rotor = mc->mc_rotor;
+	if (rotor == NULL) {
+		/* First group: it is its own neighbour. */
+		mg->mg_prev = mg;
+		mg->mg_next = mg;
+	} else {
+		metaslab_group_t *after = rotor->mg_next;
+
+		mg->mg_prev = rotor;
+		mg->mg_next = after;
+		rotor->mg_next = mg;
+		after->mg_prev = mg;
+	}
+
+	mc->mc_rotor = mg;
+	mg->mg_class = mc;
+}
+
+/*
+ * Unlink 'mg' from the class's circular list.  The rotor moves to the
+ * group's successor, or becomes NULL when 'mg' was the only group.
+ */
+void
+metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+	metaslab_group_t *before, *after;
+
+	ASSERT(mg->mg_class == mc);
+
+	before = mg->mg_prev;
+	after = mg->mg_next;
+
+	if (after == mg) {
+		/* Last remaining group. */
+		mc->mc_rotor = NULL;
+	} else {
+		mc->mc_rotor = after;
+		before->mg_next = after;
+		after->mg_prev = before;
+	}
+
+	mg->mg_class = NULL;
+	mg->mg_next = NULL;
+	mg->mg_prev = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
+ * AVL comparator for a group's metaslab tree: heavier metaslabs sort
+ * first; equal weights are ordered by ascending space map start offset.
+ */
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+	const metaslab_t *m1 = x1;
+	const metaslab_t *m2 = x2;
+
+	if (m1->ms_weight != m2->ms_weight)
+		return (m1->ms_weight < m2->ms_weight ? 1 : -1);
+
+	/*
+	 * If the weights are identical, use the offset to force uniqueness.
+	 */
+	if (m1->ms_map.sm_start != m2->ms_map.sm_start)
+		return (m1->ms_map.sm_start < m2->ms_map.sm_start ? -1 : 1);
+
+	ASSERT3P(m1, ==, m2);
+
+	return (0);
+}
+
+/*
+ * Allocate a metaslab group for vdev 'vd', initialize its lock and
+ * metaslab tree, and add it to class 'mc'.  The group's aliquot is
+ * metaslab_aliquot scaled by the vdev's child count (at least 1).
+ */
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+{
+	metaslab_group_t *mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+
+	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+
+	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
+	mg->mg_vd = vd;
+
+	metaslab_class_add(mc, mg);
+
+	return (mg);
+}
+
+/*
+ * Destroy the group's metaslab tree and lock, then free the group.
+ */
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+	avl_destroy(&mg->mg_metaslab_tree);
+	mutex_destroy(&mg->mg_lock);
+
+	kmem_free(mg, sizeof (metaslab_group_t));
+}
+
+/*
+ * Attach 'msp' to group 'mg' with an initial weight of 0 and insert it
+ * into the group's tree, all under mg_lock.
+ */
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+	mutex_enter(&mg->mg_lock);
+
+	ASSERT(msp->ms_group == NULL);
+	msp->ms_group = mg;
+	msp->ms_weight = 0;
+	avl_add(&mg->mg_metaslab_tree, msp);
+
+	mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Detach 'msp' from group 'mg': remove it from the group's tree and
+ * clear its back pointer, all under mg_lock.
+ */
+static void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+	mutex_enter(&mg->mg_lock);
+
+	ASSERT(msp->ms_group == mg);
+	avl_remove(&mg->mg_metaslab_tree, msp);
+	msp->ms_group = NULL;
+
+	mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Give 'msp' a new weight and reposition it in the group's tree.  The
+ * tree is keyed on weight, so the metaslab is removed before the weight
+ * changes and re-added afterwards.  The caller must hold ms_lock.
+ */
+static void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+	/*
+	 * Although in principle the weight can be any value, in
+	 * practice we do not use values in the range [1, 510].
+	 */
+	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	mutex_enter(&mg->mg_lock);
+
+	ASSERT(msp->ms_group == mg);
+	avl_remove(&mg->mg_metaslab_tree, msp);
+	msp->ms_weight = weight;
+	avl_add(&mg->mg_metaslab_tree, msp);
+
+	mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+/*
+ * Allocate the first-fit allocator's private data for 'sm': a zeroed
+ * array of 64 uint64_t cursors, stored in sm_ppd.
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+	ASSERT(sm->sm_ppd == NULL);
+
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+}
+
+/*
+ * Free the cursor array allocated by metaslab_ff_load() and clear
+ * sm_ppd.
+ */
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+
+	sm->sm_ppd = NULL;
+}
+
+/*
+ * First-fit allocation of 'size' bytes from space map 'sm'.
+ *
+ * A separate cursor is kept per alignment (the lowest set bit of
+ * 'size').  The search starts at that cursor and takes the first segment
+ * that can hold an aligned block; if nothing is found, the cursor is
+ * reset to 0 and the search is repeated once from the beginning.
+ * Returns the chosen offset, or -1ULL if no segment fits.
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *tree = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+	for (;;) {
+		space_seg_t *seg, key;
+		avl_index_t where;
+
+		key.ss_start = *cursor;
+		key.ss_end = *cursor + size;
+
+		seg = avl_find(tree, &key, &where);
+		if (seg == NULL)
+			seg = avl_nearest(tree, where, AVL_AFTER);
+
+		for (; seg != NULL; seg = AVL_NEXT(tree, seg)) {
+			uint64_t offset = P2ROUNDUP(seg->ss_start, align);
+
+			if (offset + size <= seg->ss_end) {
+				*cursor = offset + size;
+				return (offset);
+			}
+		}
+
+		/*
+		 * If we know we've searched the whole map (*cursor == 0),
+		 * give up.  Otherwise, reset the cursor to the beginning
+		 * and try again.
+		 */
+		if (*cursor == 0)
+			return (-1ULL);
+
+		*cursor = 0;
+	}
+}
+
+/*
+ * Claim callback for the first-fit allocator: intentionally empty.
+ */
+/* ARGSUSED */
+static void
+metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+/*
+ * Free callback for the first-fit allocator: intentionally empty.
+ */
+/* ARGSUSED */
+static void
+metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_ff_ops = { /* first-fit allocator callbacks */
+ metaslab_ff_load,
+ metaslab_ff_unload,
+ metaslab_ff_alloc,
+ metaslab_ff_claim,
+ metaslab_ff_free
+};
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+/*
+ * Create the in-core metaslab covering [start, start + size) on the vdev
+ * behind group 'mg', seeded from the space map object 'smo', and add it
+ * to the group.  'txg' says how the metaslab comes into being: 0 when
+ * opening an existing pool, TXG_INITIAL when creating one, and a later
+ * txg when space is being added to an existing pool.
+ */
+metaslab_t *
+metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+    uint64_t start, uint64_t size, uint64_t txg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_t *msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+
+	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+	msp->ms_smo_syncing = *smo;
+
+	/*
+	 * We create the main space map here, but we don't create the
+	 * allocmaps and freemaps until metaslab_sync_done().  This serves
+	 * two purposes: it allows metaslab_sync_done() to detect the
+	 * addition of new space; and for debugging, it ensures that we'd
+	 * data fault on any attempt to use this metaslab before it's ready.
+	 */
+	space_map_create(&msp->ms_map, start, size,
+	    vd->vdev_ashift, &msp->ms_lock);
+
+	metaslab_group_add(mg, msp);
+
+	/*
+	 * If we're opening an existing pool (txg == 0) or creating
+	 * a new one (txg == TXG_INITIAL), all space is available now.
+	 * If we're adding space to an existing pool, the new space
+	 * does not become available until after this txg has synced.
+	 */
+	if (txg <= TXG_INITIAL)
+		metaslab_sync_done(msp, 0);
+
+	if (txg == 0)
+		return (msp);
+
+	/*
+	 * The vdev is dirty, but the metaslab isn't -- it just needs
+	 * to have metaslab_sync_done() invoked from vdev_sync_done().
+	 * [We could just dirty the metaslab, but that would cause us
+	 * to allocate a space map object for it, which is wasteful
+	 * and would mess up the locality logic in metaslab_weight().]
+	 */
+	ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
+	vdev_dirty(vd, 0, NULL, txg);
+	vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+
+	return (msp);
+}
+
+/*
+ * Tear down a metaslab: subtract its size and allocated space from the
+ * vdev's accounting, remove it from its group, destroy its space maps
+ * and lock, and free it.
+ */
+void
+metaslab_fini(metaslab_t *msp)
+{
+	metaslab_group_t *mg = msp->ms_group;
+	int i;
+
+	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
+	    -msp->ms_smo.smo_alloc, B_TRUE);
+
+	metaslab_group_remove(mg, msp);
+
+	mutex_enter(&msp->ms_lock);
+
+	space_map_unload(&msp->ms_map);
+	space_map_destroy(&msp->ms_map);
+
+	/* One allocmap and one freemap per txg slot. */
+	for (i = 0; i < TXG_SIZE; i++) {
+		space_map_destroy(&msp->ms_allocmap[i]);
+		space_map_destroy(&msp->ms_freemap[i]);
+	}
+
+	mutex_exit(&msp->ms_lock);
+	mutex_destroy(&msp->ms_lock);
+
+	kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define METASLAB_SMO_BONUS_MULTIPLIER 2
+
+/*
+ * Compute the sort weight of a metaslab from its free space, its
+ * position on the vdev, and whether it already has a space map object;
+ * any active bits already set in ms_weight are carried over.  The caller
+ * must hold ms_lock.
+ */
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	vdev_t *vd = msp->ms_group->mg_vd;
+	uint64_t space, weight;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * The baseline weight is the metaslab's free space.
+	 */
+	space = sm->sm_size - smo->smo_alloc;
+
+	/*
+	 * Modern disks have uniform bit density and constant angular velocity.
+	 * Therefore, the outer recording zones are faster (higher bandwidth)
+	 * than the inner zones by the ratio of outer to inner track diameter,
+	 * which is typically around 2:1.  We account for this by assigning
+	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+	 * In effect, this means that we'll select the metaslab with the most
+	 * free bandwidth rather than simply the one with the most free space.
+	 */
+	weight = 2 * space -
+	    ((sm->sm_start >> vd->vdev_ms_shift) * space) / vd->vdev_ms_count;
+	ASSERT(weight >= space && weight <= 2 * space);
+
+	/*
+	 * For locality, assign higher weight to metaslabs we've used before.
+	 */
+	if (smo->smo_object != 0)
+		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+	ASSERT(weight >= space &&
+	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+
+	/*
+	 * If this metaslab is one we're actively using, adjust its weight to
+	 * make it preferable to any inactive metaslab so we'll polish it off.
+	 */
+	return (weight | (msp->ms_weight & METASLAB_ACTIVE_MASK));
+}
+
+/*
+ * Make a metaslab active.  If neither active bit is set in its weight,
+ * load its space map with the first-fit ops and re-sort it with
+ * 'activation_weight' OR-ed into the weight.  If the load fails, the
+ * metaslab's weight is set to 0 and the error is returned.  Returns 0
+ * on success.  The caller must hold ms_lock.
+ */
+static int
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+{
+	space_map_t *sm = &msp->ms_map;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+		metaslab_group_t *mg = msp->ms_group;
+		int err;
+
+		err = space_map_load(sm, &metaslab_ff_ops, SM_FREE,
+		    &msp->ms_smo, mg->mg_vd->vdev_spa->spa_meta_objset);
+		if (err) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (err);
+		}
+
+		metaslab_group_sort(msp->ms_group, msp,
+		    msp->ms_weight | activation_weight);
+	}
+
+	ASSERT(sm->sm_loaded);
+	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+	return (0);
+}
+
+/*
+ * Deactivate a metaslab by re-sorting it with weight
+ * MIN(current weight, size); afterwards neither active bit may be set.
+ */
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t size)
+{
+	uint64_t weight;
+
+	/*
+	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+	 * this metaslab again.  In that case, it had better be empty,
+	 * or we would be leaving space on the table.
+	 */
+	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+
+	weight = MIN(msp->ms_weight, size);
+	metaslab_group_sort(msp->ms_group, msp, weight);
+
+	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ *
+ * In order: allocate the space map object if the metaslab has none yet
+ * and record it in the vdev's metaslab array; fold this txg's frees into
+ * freed_map; condense the on-disk map when it has grown to at least twice
+ * the in-core size; sync this txg's allocmap and freemap; and copy the
+ * syncing space_map_obj_t into the object's bonus buffer.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
+ space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
+ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo_syncing;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ int t;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ /*
+ * The only state that can actually be changing concurrently with
+ * metaslab_sync() is the metaslab's ms_map. No other thread can
+ * be modifying this txg's allocmap, freemap, freed_map, or smo.
+ * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
+ * We drop it whenever we call into the DMU, because the DMU
+ * can call down to us (e.g. via zio_free()) at any time.
+ */
+ mutex_enter(&msp->ms_lock);
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ mutex_exit(&msp->ms_lock);
+ smo->smo_object = dmu_object_alloc(mos,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ (sm->sm_start >> vd->vdev_ms_shift),
+ sizeof (uint64_t), &smo->smo_object, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ space_map_walk(freemap, space_map_add, freed_map);
+
+ if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
+ 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
+ /*
+ * The in-core space map representation is twice as compact
+ * as the on-disk one, so it's time to condense the latter
+ * by generating a pure allocmap from first principles.
+ *
+ * This metaslab is 100% allocated,
+ * minus the content of the in-core map (sm),
+ * minus what's been freed this txg (freed_map),
+ * minus allocations from txgs in the future
+ * (because they haven't been committed yet).
+ */
+ space_map_vacate(allocmap, NULL, NULL);
+ space_map_vacate(freemap, NULL, NULL);
+
+ space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
+
+ space_map_walk(sm, space_map_remove, allocmap);
+ space_map_walk(freed_map, space_map_remove, allocmap);
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
+ space_map_remove, allocmap);
+
+ mutex_exit(&msp->ms_lock);
+ space_map_truncate(smo, mos, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
+ space_map_sync(freemap, SM_FREE, smo, mos, tx);
+
+ mutex_exit(&msp->ms_lock);
+
+ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ *
+ * Also creates the per-txg allocmaps and freemaps the first time it runs
+ * for a metaslab, updates the vdev's space accounting, copies the synced
+ * space_map_obj_t over the in-core one, unloads the space map when it is
+ * no longer needed, and re-sorts the metaslab by its new weight.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+ space_map_obj_t *smo = &msp->ms_smo;
+ space_map_obj_t *smosync = &msp->ms_smo_syncing;
+ space_map_t *sm = &msp->ms_map;
+ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ int t;
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If this metaslab is just becoming available, initialize its
+ * allocmaps and freemaps and add its capacity to the vdev.
+ * A freed_map with sm_size == 0 is taken to mean the maps have
+ * not been created yet.
+ */
+ if (freed_map->sm_size == 0) {
+ for (t = 0; t < TXG_SIZE; t++) {
+ space_map_create(&msp->ms_allocmap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+ space_map_create(&msp->ms_freemap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+ }
+ vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+ }
+
+ vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+
+ ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
+ ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+
+ /*
+ * If there's a space_map_load() in progress, wait for it to complete
+ * so that we have a consistent view of the in-core space map.
+ * Then, add everything we freed in this txg to the map.
+ */
+ space_map_load_wait(sm);
+ space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+
+ *smo = *smosync;
+
+ /*
+ * If the map is loaded but no longer active, evict it as soon as all
+ * future allocations have synced. (If we unloaded it now and then
+ * loaded a moment later, the map wouldn't reflect those allocations.)
+ */
+ if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int evictable = 1;
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
+ evictable = 0;
+
+ if (evictable)
+ space_map_unload(sm);
+ }
+
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+
+ mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Return the distance, in bytes, between the start of metaslab msp and
+ * the metaslab-sized slot that holds dva's offset.  Both positions are
+ * rounded down to a multiple of the vdev's metaslab size before being
+ * compared.  A DVA on a different vdev is reported as 1ULL << 63.
+ */
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	uint64_t shift = vd->vdev_ms_shift;
+	uint64_t dva_slot, ms_slot, delta;
+
+	if (vd->vdev_id != DVA_GET_VDEV(dva))
+		return (1ULL << 63);
+
+	dva_slot = DVA_GET_OFFSET(dva) >> shift;
+	ms_slot = msp->ms_map.sm_start >> shift;
+
+	if (dva_slot == ms_slot)
+		return (0);
+
+	delta = (dva_slot > ms_slot) ?
+	    (dva_slot - ms_slot) : (ms_slot - dva_slot);
+
+	return (delta << shift);
+}
+
+/*
+ * Try to allocate 'size' bytes from one of the metaslabs in group mg on
+ * behalf of transaction group txg.
+ *
+ * mg           - metaslab group to allocate from
+ * size         - number of bytes to allocate
+ * txg          - transaction group the allocation is charged to
+ * min_distance - minimum distance wanted from the DVAs already allocated
+ * dva, d       - dva[0 .. d-1] are the DVAs already allocated for this block
+ *
+ * Returns the allocated offset, or -1ULL if no metaslab in the group can
+ * satisfy the request.
+ */
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
+ uint64_t min_distance, dva_t *dva, int d)
+{
+ metaslab_t *msp = NULL;
+ uint64_t offset = -1ULL;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t activation_weight;
+ uint64_t target_distance;
+ int i;
+
+ /*
+ * Activate as primary unless an earlier DVA of this block already
+ * lives on this group's vdev, in which case activate as secondary.
+ */
+ activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (i = 0; i < d; i++)
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
+
+ for (;;) {
+ /*
+ * Pick a candidate under mg_lock.  The walk gives up on the
+ * whole group at the first metaslab whose weight is below
+ * 'size' (presumably the tree is ordered by descending
+ * weight -- confirm against the tree's comparator).
+ */
+ mutex_enter(&mg->mg_lock);
+ for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
+ if (msp->ms_weight < size) {
+ mutex_exit(&mg->mg_lock);
+ return (-1ULL);
+ }
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY)
+ break;
+
+ /*
+ * Metaslabs with nothing allocated (smo_alloc == 0) must
+ * be 1.5 times min_distance away; others min_distance.
+ */
+ target_distance = min_distance +
+ (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+
+ /* Accept only if far enough from every earlier DVA. */
+ for (i = 0; i < d; i++)
+ if (metaslab_distance(msp, &dva[i]) <
+ target_distance)
+ break;
+ if (i == d)
+ break;
+ }
+ mutex_exit(&mg->mg_lock);
+ if (msp == NULL)
+ return (-1ULL);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Ensure that the metaslab we have selected is still
+ * capable of handling our request. It's possible that
+ * another thread may have changed the weight while we
+ * were blocked on the metaslab lock.
+ */
+ if (msp->ms_weight < size) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * A metaslab currently marked secondary is not used for a
+ * primary allocation: strip its active bits and pick again.
+ */
+ if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ metaslab_passivate(msp,
+ msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (metaslab_activate(msp, activation_weight) != 0) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /* On success leave the loop with msp->ms_lock still held. */
+ if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+ break;
+
+ /*
+ * The allocation failed: passivate with weight size - 1 so the
+ * 'ms_weight < size' checks above reject this metaslab for
+ * requests of this size.
+ */
+ metaslab_passivate(msp, size - 1);
+
+ mutex_exit(&msp->ms_lock);
+ }
+
+ /* First allocation from this metaslab in this txg: dirty the vdev. */
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+
+ return (offset);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ *
+ * spa           - pool to allocate from
+ * mc            - metaslab class whose groups are searched
+ * psize         - physical size requested; converted per vdev to an
+ *                 allocated size with vdev_psize_to_asize()
+ * dva, d        - dva[d] is filled in; dva[0 .. d-1] are earlier copies
+ * hintdva       - optional hint DVAs (may be NULL)
+ * txg           - transaction group the allocation is charged to
+ * hintdva_avoid - if set, start at the group after the hinted vdev's group
+ *
+ * Returns 0 with dva[d] filled in, or ENOSPC.  When every group has been
+ * tried without success dva[d] is zeroed before ENOSPC is returned.
+ */
+static int
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
+{
+ metaslab_group_t *mg, *rotor;
+ vdev_t *vd;
+ int dshift = 3;
+ int all_zero;
+ uint64_t offset = -1ULL;
+ uint64_t asize;
+ uint64_t distance;
+
+ ASSERT(!DVA_IS_VALID(&dva[d]));
+
+ /*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+ if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
+ return (ENOSPC);
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mc_rotor or mc_allocated because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try and spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
+ */
+ /*
+ * NOTE(review): the results of vdev_lookup_top() below are
+ * dereferenced without a NULL check -- confirm that the hinted and
+ * previous DVAs always name an existing top-level vdev.
+ */
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+ if (hintdva_avoid)
+ mg = vd->vdev_mg->mg_next;
+ else
+ mg = vd->vdev_mg;
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
+
+ /*
+ * If the hint put us into the wrong class, just follow the rotor.
+ */
+ if (mg->mg_class != mc)
+ mg = mc->mc_rotor;
+
+ rotor = mg;
+top:
+ /*
+ * all_zero stays true only if, for every group tried in this pass,
+ * the distance requirement was already zero; in that case another
+ * pass with a larger dshift cannot relax anything further.
+ */
+ all_zero = B_TRUE;
+ do {
+ vd = mg->mg_vd;
+ /*
+ * Don't allocate from faulted devices
+ */
+ if (!vdev_writeable(vd))
+ goto next;
+ /*
+ * Avoid writing single-copy data to a failing vdev
+ * (first pass only: dshift is still 3).  Clearing all_zero
+ * guarantees a further pass in which this vdev is eligible.
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && dshift == 3) {
+ all_zero = B_FALSE;
+ goto next;
+ }
+
+ ASSERT(mg->mg_class == mc);
+
+ /*
+ * Required separation is vdev_asize >> dshift; anything not
+ * larger than one metaslab is treated as no requirement.
+ */
+ distance = vd->vdev_asize >> dshift;
+ if (distance <= (1ULL << vd->vdev_ms_shift))
+ distance = 0;
+ else
+ all_zero = B_FALSE;
+
+ asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ if (offset != -1ULL) {
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ */
+ if (mc->mc_allocated == 0) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ uint64_t alloc, space;
+ int64_t vu, su;
+
+ alloc = spa_get_alloc(spa);
+ space = spa_get_space(spa);
+
+ /*
+ * Determine percent used in units of 0..1024.
+ * (This is just to avoid floating point.)
+ * The "+ 1" keeps the divisor non-zero.
+ */
+ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
+ su = (alloc << 10) / (space + 1);
+
+ /*
+ * Bias by at most +/- 25% of the aliquot.
+ */
+ mg->mg_bias = ((su - vu) *
+ (int64_t)mg->mg_aliquot) / (1024 * 4);
+ }
+
+ /*
+ * Once this group has received its (biased) aliquot,
+ * advance the rotor to the next group.
+ */
+ if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ }
+
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d], 0);
+ DVA_SET_ASIZE(&dva[d], asize);
+
+ return (0);
+ }
+next:
+ /* This group could not be used: move the rotor past it. */
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ /* Halve the distance requirement and go around again. */
+ if (!all_zero) {
+ dshift++;
+ ASSERT(dshift < 64);
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
+
+ return (ENOSPC);
+}
+
+/*
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
+ *
+ * spa - pool the DVA belongs to
+ * dva - valid DVA to free
+ * txg - transaction group in which the free takes place
+ * now - B_TRUE to undo an allocation made in this same txg immediately
+ *       (the range is removed from this txg's allocmap and handed back
+ *       to ms_map); B_FALSE to queue the range on this txg's freemap
+ *
+ * Frees in txgs beyond spa_freeze_txg() are ignored.  A DVA that names an
+ * unknown top-level vdev or an out-of-range metaslab is logged and skipped.
+ */
+static void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
+{
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	vdev_t *vd;
+	metaslab_t *msp;
+
+	ASSERT(DVA_IS_VALID(dva));
+
+	if (txg > spa_freeze_txg(spa))
+		return;
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
+		    (u_longlong_t)vdev, (u_longlong_t)offset);
+		ASSERT(0);
+		return;
+	}
+
+	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+	/* A gang DVA occupies a gang block header, whatever its asize says. */
+	if (DVA_GET_GANG(dva))
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	mutex_enter(&msp->ms_lock);
+
+	if (now) {
+		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+		    offset, size);
+		space_map_free(&msp->ms_map, offset, size);
+	} else {
+		/* First free from this metaslab in this txg: dirty the vdev. */
+		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+			vdev_dirty(vd, VDD_METASLAB, msp, txg);
+		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+
+		/*
+		 * verify that this region is actually allocated in
+		 * either a ms_allocmap or the ms_map
+		 */
+		if (msp->ms_map.sm_loaded) {
+			boolean_t allocd = B_FALSE;
+			int i;
+
+			if (!space_map_contains(&msp->ms_map, offset, size)) {
+				/* Not in the free map, so it is allocated. */
+				allocd = B_TRUE;
+			} else {
+				for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
+					space_map_t *sm = &msp->ms_allocmap
+					    [(txg - i) & TXG_MASK];
+					if (space_map_contains(sm,
+					    offset, size)) {
+						allocd = B_TRUE;
+						break;
+					}
+				}
+			}
+
+			/*
+			 * The conversions are unsigned (%llu, %llx), so the
+			 * arguments are cast to u_longlong_t, matching the
+			 * cmn_err() call above.
+			 */
+			if (!allocd) {
+				zfs_panic_recover("freeing free segment "
+				    "(vdev=%llu offset=%llx size=%llx)",
+				    (u_longlong_t)vdev, (u_longlong_t)offset,
+				    (u_longlong_t)size);
+			}
+		}
+	}
+
+	mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ *
+ * Returns 0 on success, ENXIO if the DVA names an unknown top-level vdev
+ * or an out-of-range metaslab, or the error from metaslab_activate().
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+	uint64_t vdev_id = DVA_GET_VDEV(dva);
+	uint64_t off = DVA_GET_OFFSET(dva);
+	uint64_t asize = DVA_GET_ASIZE(dva);
+	space_map_t *allocmap;
+	metaslab_t *msp;
+	vdev_t *vd;
+	int error;
+
+	ASSERT(DVA_IS_VALID(dva));
+
+	vd = vdev_lookup_top(spa, vdev_id);
+	if (vd == NULL || (off >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+		return (ENXIO);
+
+	msp = vd->vdev_ms[off >> vd->vdev_ms_shift];
+
+	/* Gang DVAs are sized as a gang block header. */
+	if (DVA_GET_GANG(dva))
+		asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	mutex_enter(&msp->ms_lock);
+
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+	if (error == 0) {
+		allocmap = &msp->ms_allocmap[txg & TXG_MASK];
+
+		/* First allocation in this txg dirties the vdev. */
+		if (allocmap->sm_space == 0)
+			vdev_dirty(vd, VDD_METASLAB, msp, txg);
+
+		space_map_claim(&msp->ms_map, off, asize);
+		space_map_add(allocmap, off, asize);
+	}
+
+	mutex_exit(&msp->ms_lock);
+
+	return (error);
+}
+
+/*
+ * Allocate ndvas DVAs of psize bytes each for block pointer bp.
+ *
+ * spa          - pool to allocate from
+ * mc           - metaslab class to allocate from
+ * psize        - physical size of the block
+ * bp           - block pointer to fill in; must have no DVAs on entry
+ * ndvas        - number of copies wanted (1 .. spa_max_replication())
+ * txg          - transaction group the allocation is charged to
+ * hintbp       - optional placement hint, may be NULL
+ * hintbp_avoid - passed through to metaslab_alloc_dva() with the hint
+ *
+ * Returns 0 on success.  On failure every DVA allocated so far is freed
+ * and zeroed again and the error from metaslab_alloc_dva() is returned;
+ * ENOSPC is returned up front if the class has no vdevs.
+ */
+int
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+    int ndvas, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
+{
+	dva_t *dva = bp->blk_dva;
+	/* hintbp is optional: never form a member address from NULL. */
+	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
+	int d;
+	int error = 0;
+
+	if (mc->mc_rotor == NULL)	/* no vdevs in this class */
+		return (ENOSPC);
+
+	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+	ASSERT(BP_GET_NDVAS(bp) == 0);
+	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+
+	for (d = 0; d < ndvas; d++) {
+		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+		    txg, hintbp_avoid);
+		if (error) {
+			/* Roll back the copies that did succeed. */
+			for (d--; d >= 0; d--) {
+				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+				bzero(&dva[d], sizeof (dva_t));
+			}
+			return (error);
+		}
+	}
+	ASSERT(error == 0);
+	ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+	return (0);
+}
+
+/*
+ * Free every DVA of block pointer bp in transaction group txg.  'now' is
+ * handed unchanged to metaslab_free_dva() for each copy.
+ */
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+	const dva_t *cur = bp->blk_dva;
+	const dva_t *end = cur + BP_GET_NDVAS(bp);
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	while (cur < end) {
+		metaslab_free_dva(spa, cur, txg, now);
+		cur++;
+	}
+}
+
+/*
+ * Claim every DVA of block pointer bp in transaction group txg.  All
+ * copies are attempted even if one fails; the return value is the error
+ * of the last copy that failed, or 0 if none did.
+ */
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	int last_error = 0;
+	int d;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	for (d = 0; d < ndvas; d++) {
+		int error = metaslab_claim_dva(spa, &dva[d], txg);
+
+		if (error != 0)
+			last_error = error;
+	}
+
+	return (last_error);
+}
diff --git a/zfs/lib/libzpool/refcount.c b/zfs/lib/libzpool/refcount.c
new file mode 100644
index 000000000..d192394f8
--- /dev/null
+++ b/zfs/lib/libzpool/refcount.c
@@ -0,0 +1,195 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)refcount.c 1.2 07/08/02 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#if defined(DEBUG) || !defined(_KERNEL)
+
+/*
+ * When set, refcount_add_many() records a reference_t for every hold and
+ * refcount_remove_many() insists on finding the matching record.
+ * Off by default in the kernel, on otherwise.
+ */
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+#else
+int reference_tracking_enable = TRUE;
+#endif
+/*
+ * Bound on the per-refcount list of released holds kept by
+ * refcount_remove_many(); a value <= 0 disables that list.
+ */
+int reference_history = 4; /* tunable */
+
+/* Caches for reference_t records and for their ref_removed buffers. */
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+/*
+ * Create the two kmem caches used by reference tracking: one for
+ * reference_t records and one for the uint64_t-sized history buffers.
+ */
+void
+refcount_init(void)
+{
+	const size_t ref_size = sizeof (reference_t);
+	const size_t hist_size = sizeof (uint64_t);
+
+	reference_cache = kmem_cache_create("reference_cache", ref_size,
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	reference_history_cache = kmem_cache_create("reference_history_cache",
+	    hist_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Destroy the kmem caches created by refcount_init().
+ */
+void
+refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+/*
+ * Initialize rc: zero counters, an empty list of current holds, an empty
+ * list of released holds, and the mutex that protects them.
+ */
+void
+refcount_create(refcount_t *rc)
+{
+	const size_t node_size = sizeof (reference_t);
+	const size_t link_off = offsetof(reference_t, ref_link);
+
+	rc->rc_count = 0;
+	rc->rc_removed_count = 0;
+
+	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&rc->rc_list, node_size, link_off);
+	list_create(&rc->rc_removed, node_size, link_off);
+}
+
+/*
+ * Tear down rc, which is expected to hold exactly 'number' references.
+ * Any tracking records still on the current-hold list or on the
+ * released-hold list are returned to their caches.
+ */
+void
+refcount_destroy_many(refcount_t *rc, uint64_t number)
+{
+	reference_t *ref;
+
+	ASSERT(rc->rc_count == number);
+
+	/* Drain the records of holds that are still outstanding. */
+	while ((ref = list_head(&rc->rc_list)) != NULL) {
+		list_remove(&rc->rc_list, ref);
+		kmem_cache_free(reference_cache, ref);
+	}
+	list_destroy(&rc->rc_list);
+
+	/* Drain the history; each entry also owns a ref_removed buffer. */
+	while ((ref = list_head(&rc->rc_removed)) != NULL) {
+		list_remove(&rc->rc_removed, ref);
+		kmem_cache_free(reference_history_cache, ref->ref_removed);
+		kmem_cache_free(reference_cache, ref);
+	}
+	list_destroy(&rc->rc_removed);
+
+	mutex_destroy(&rc->rc_mtx);
+}
+
+/*
+ * Tear down a refcount that is expected to hold no references.
+ */
+void
+refcount_destroy(refcount_t *rc)
+{
+ refcount_destroy_many(rc, 0);
+}
+
+/*
+ * Return non-zero if rc currently holds no references.  rc_count is read
+ * without taking rc_mtx.
+ */
+int
+refcount_is_zero(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count == 0);
+}
+
+/*
+ * Return the current reference count of rc.  rc_count is read without
+ * taking rc_mtx.
+ */
+int64_t
+refcount_count(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count);
+}
+
+/*
+ * Add 'number' references to rc on behalf of 'holder'.
+ *
+ * When reference tracking is enabled a reference_t recording the holder
+ * and the amount is allocated (before rc_mtx is taken, because the
+ * allocation may sleep) and linked onto rc_list.
+ *
+ * Returns the reference count after the addition.
+ */
+int64_t
+refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
+{
+	/*
+	 * ref stays NULL when tracking is off.  The insert below keys off
+	 * ref itself rather than re-reading the tunable, so a change of
+	 * reference_tracking_enable between the two points can neither
+	 * link an uninitialized pointer nor leak the record.
+	 */
+	reference_t *ref = NULL;
+	int64_t count;
+
+	if (reference_tracking_enable) {
+		ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+		ref->ref_holder = holder;
+		ref->ref_number = number;
+	}
+	mutex_enter(&rc->rc_mtx);
+	ASSERT(rc->rc_count >= 0);
+	if (ref != NULL)
+		list_insert_head(&rc->rc_list, ref);
+	rc->rc_count += number;
+	count = rc->rc_count;
+	mutex_exit(&rc->rc_mtx);
+
+	return (count);
+}
+
+/*
+ * Add a single reference to rc on behalf of 'holder'; returns the new count.
+ */
+int64_t
+refcount_add(refcount_t *rc, void *holder)
+{
+ return (refcount_add_many(rc, 1, holder));
+}
+
+/*
+ * Drop 'number' references from rc that were taken by 'holder'.
+ *
+ * With tracking disabled only the counter is adjusted.  With tracking
+ * enabled the hold must match a record on rc_list with the same holder
+ * AND the same number; if none is found the function panics.  A matched
+ * record is moved to the rc_removed history list (when reference_history
+ * is positive) or freed.
+ *
+ * Returns the reference count after the removal.
+ */
+int64_t
+refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= number);
+
+ if (!reference_tracking_enable) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ /*
+ * Trim the oldest history entry.  From here on
+ * 'ref' names the evicted tail, not the hold
+ * that was just released.
+ * NOTE(review): with ">=" at most
+ * reference_history - 1 entries are retained;
+ * confirm whether ">" was intended.
+ */
+ if (rc->rc_removed_count >= reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ /* No matching hold was found; rc_mtx is still held at this point. */
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+/*
+ * Drop a single reference held by 'holder'; returns the new count.
+ */
+int64_t
+refcount_remove(refcount_t *rc, void *holder)
+{
+ return (refcount_remove_many(rc, 1, holder));
+}
+
+#endif
diff --git a/zfs/lib/libzpool/sha256.c b/zfs/lib/libzpool/sha256.c
new file mode 100644
index 000000000..eb3a49bf3
--- /dev/null
+++ b/zfs/lib/libzpool/sha256.c
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)sha256.c 1.2 07/12/12 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-3, available at:
+ * http://csrc.nist.gov/publications/PubsFIPS.html
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions of Ch() and Maj() according to FIPS 180-3 are:
+ *
+ *	Ch(x, y, z)	(x & y) ^ (~x & z)
+ *	Maj(x, y, z)	(x & y) ^ (x & z) ^ (y & z)
+ *
+ * We use equivalent logical reductions here that require one less op.
+ *
+ * All of these macros evaluate their arguments more than once, so they
+ * must only be given side-effect-free expressions.  Rot32() rotates a
+ * 32-bit value right by s bits and is only meaningful for 0 < s < 32;
+ * both parameters are fully parenthesized so that compound expressions
+ * may be passed for either.
+ */
+#define	Ch(x, y, z)	((z) ^ ((x) & ((y) ^ (z))))
+#define	Maj(x, y, z)	(((x) & (y)) ^ ((z) & ((x) ^ (y))))
+#define	Rot32(x, s)	(((x) >> (s)) | ((x) << (32 - (s))))
+#define	SIGMA0(x)	(Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define	SIGMA1(x)	(Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define	sigma0(x)	(Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define	sigma1(x)	(Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+/*
+ * The 64 per-round constants of SHA-256; SHA256Transform() adds
+ * SHA256_K[t] into T1 in round t.
+ */
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/*
+ * Fold one 64-byte message block into the running SHA-256 state.
+ *
+ * H  - the eight 32-bit state words, updated in place
+ * cp - the 64 bytes of the block, read as sixteen big-endian words
+ */
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+	uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+	/*
+	 * Widen each byte to uint32_t before shifting: a plain uint8_t
+	 * promotes to (signed) int, and shifting a value >= 0x80 left by
+	 * 24 would overflow into the sign bit.
+	 */
+	for (t = 0; t < 16; t++, cp += 4)
+		W[t] = ((uint32_t)cp[0] << 24) | ((uint32_t)cp[1] << 16) |
+		    ((uint32_t)cp[2] << 8) | (uint32_t)cp[3];
+
+	/* Expand the sixteen message words into the 64-entry schedule. */
+	for (t = 16; t < 64; t++)
+		W[t] = sigma1(W[t - 2]) + W[t - 7] +
+		    sigma0(W[t - 15]) + W[t - 16];
+
+	a = H[0]; b = H[1]; c = H[2]; d = H[3];
+	e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+	for (t = 0; t < 64; t++) {
+		T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+		T2 = SIGMA0(a) + Maj(a, b, c);
+		h = g; g = f; f = e; e = d + T1;
+		d = c; c = b; b = a; a = T1 + T2;
+	}
+
+	H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+	H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+/*
+ * Compute the SHA-256 digest of buf[0 .. size-1] and store it in *zcp as
+ * four 64-bit words, each built from two consecutive 32-bit state words
+ * (high word first).
+ *
+ * buf  - data to checksum
+ * size - length of the data in bytes
+ * zcp  - receives the checksum
+ */
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+	    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+	const uint8_t *data = buf;
+	uint8_t pad[128];
+	uint64_t off;	/* same width as size: no signed overflow */
+	int i, padsize;
+
+	/* All complete 64-byte blocks are hashed straight from buf. */
+	for (off = 0; off < (size & ~63ULL); off += 64)
+		SHA256Transform(H, data + off);
+
+	/* The remaining 0..63 bytes start the padded tail. */
+	for (padsize = 0; off < size; off++)
+		pad[padsize++] = data[off];
+
+	/* 0x80 marker, then zeros up to 8 bytes short of a block end. */
+	for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+		pad[padsize] = 0;
+
+	/* Message length in bits, big-endian, in the last 8 bytes. */
+	for (i = 56; i >= 0; i -= 8)
+		pad[padsize++] = (uint8_t)((size << 3) >> i);
+
+	/* The tail is one or two blocks (padsize is 64 or 128). */
+	for (i = 0; i < padsize; i += 64)
+		SHA256Transform(H, pad + i);
+
+	ZIO_SET_CHECKSUM(zcp,
+	    (uint64_t)H[0] << 32 | H[1],
+	    (uint64_t)H[2] << 32 | H[3],
+	    (uint64_t)H[4] << 32 | H[5],
+	    (uint64_t)H[6] << 32 | H[7]);
+}
diff --git a/zfs/lib/libzpool/spa.c b/zfs/lib/libzpool/spa.c
new file mode 100644
index 000000000..f59db06db
--- /dev/null
+++ b/zfs/lib/libzpool/spa.c
@@ -0,0 +1,4501 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)spa.c 1.51 08/04/09 SMI"
+
+/*
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/systeminfo.h>
+#include <sys/sunddi.h>
+#include <sys/spa_boot.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * NOTE(review): presumably the number of threads per zio taskq; it is not
+ * referenced in this part of the file -- confirm at its users.
+ */
+int zio_taskq_threads = 8;
+
+/* Forward declaration: spa_prop_set() hands this to dsl_sync_task_do(). */
+static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+
+/*
+ * ==========================================================================
+ * SPA properties routines
+ * ==========================================================================
+ */
+
+/*
+ * Add one property to nvl as a nested nvlist named after the property.
+ * The nested list carries ZPROP_SOURCE (src) and ZPROP_VALUE, which is
+ * the string strval when that is non-NULL and the integer intval
+ * otherwise.  Every nvlist operation is VERIFY'd to succeed.
+ */
+static void
+spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
+    uint64_t intval, zprop_source_t src)
+{
+	nvlist_t *entry;
+
+	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_uint64(entry, ZPROP_SOURCE, src) == 0);
+
+	if (strval == NULL) {
+		VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, intval) == 0);
+	} else {
+		VERIFY(nvlist_add_string(entry, ZPROP_VALUE, strval) == 0);
+	}
+
+	VERIFY(nvlist_add_nvlist(nvl, zpool_prop_to_name(prop), entry) == 0);
+	nvlist_free(entry);
+}
+
+/*
+ * Add to *nvp the pool properties that come from in-core spa state
+ * rather than from the pool property object: the read-only ones (name,
+ * size, used, available, capacity, guid, health) and the settable ones
+ * kept outside the property object (version, altroot, cachefile).
+ */
+static void
+spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
+{
+	nvlist_t *nvl = *nvp;
+	uint64_t size = spa_get_space(spa);
+	uint64_t used = spa_get_alloc(spa);
+	uint64_t cap = 0;
+	uint64_t version;
+	zprop_source_t src = ZPROP_SRC_NONE;
+	char *cachefile;
+	size_t len;
+
+	/*
+	 * readonly properties
+	 */
+	spa_prop_add_list(nvl, ZPOOL_PROP_NAME, spa->spa_name, 0, src);
+	spa_prop_add_list(nvl, ZPOOL_PROP_SIZE, NULL, size, src);
+	spa_prop_add_list(nvl, ZPOOL_PROP_USED, NULL, used, src);
+	spa_prop_add_list(nvl, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);
+
+	/* Percent used; an empty pool reports 0. */
+	if (size != 0)
+		cap = used * 100 / size;
+	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+
+	spa_prop_add_list(nvl, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
+	spa_prop_add_list(nvl, ZPOOL_PROP_HEALTH, NULL,
+	    spa->spa_root_vdev->vdev_state, src);
+
+	/*
+	 * settable properties that are not stored in the pool property object.
+	 */
+	version = spa_version(spa);
+	src = (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) ?
+	    ZPROP_SRC_DEFAULT : ZPROP_SRC_LOCAL;
+	spa_prop_add_list(nvl, ZPOOL_PROP_VERSION, NULL, version, src);
+
+	if (spa->spa_root != NULL) {
+		spa_prop_add_list(nvl, ZPOOL_PROP_ALTROOT, spa->spa_root,
+		    0, ZPROP_SRC_LOCAL);
+	}
+
+	if (spa->spa_config_dir == NULL)
+		return;
+
+	/* "none" is reported verbatim; otherwise report "<dir>/<file>". */
+	if (strcmp(spa->spa_config_dir, "none") == 0) {
+		spa_prop_add_list(nvl, ZPOOL_PROP_CACHEFILE,
+		    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
+		return;
+	}
+
+	/* Room for both parts, the '/' separator and the terminating NUL. */
+	len = strlen(spa->spa_config_dir) + strlen(spa->spa_config_file) + 2;
+	cachefile = kmem_alloc(len, KM_SLEEP);
+	(void) snprintf(cachefile, len, "%s/%s",
+	    spa->spa_config_dir, spa->spa_config_file);
+	spa_prop_add_list(nvl, ZPOOL_PROP_CACHEFILE, cachefile, 0,
+	    ZPROP_SRC_LOCAL);
+	kmem_free(cachefile, len);
+}
+
+/*
+ * Get zpool property values.
+ *
+ * Allocates *nvp and fills it with the properties derived from the spa
+ * configuration followed by those stored in the MOS pool property object
+ * (if the pool has one).
+ *
+ * Returns 0 on success.  If walking the property object ends with an
+ * error other than ENOENT, *nvp is freed and set to NULL and that error
+ * is returned.
+ */
+int
+spa_prop_get(spa_t *spa, nvlist_t **nvp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ /*
+ * Get properties from the spa config.
+ */
+ spa_prop_get_config(spa, nvp);
+
+ mutex_enter(&spa->spa_props_lock);
+ /* If no pool property object, no more prop to get. */
+ if (spa->spa_pool_props_object == 0) {
+ mutex_exit(&spa->spa_props_lock);
+ return (0);
+ }
+
+ /*
+ * Get properties from the MOS pool property object.
+ *
+ * NOTE(review): the 'break' statements on the error paths inside the
+ * switch below leave only the switch, not this loop; the cursor then
+ * advances and 'err' is overwritten by the next
+ * zap_cursor_retrieve().  Failures of dsl_dataset_open_obj() and
+ * zap_lookup() therefore just omit that one property -- confirm this
+ * is intended.
+ */
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t intval = 0;
+ char *strval = NULL;
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ zpool_prop_t prop;
+
+ /* Skip entries that are not known pool properties. */
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
+ continue;
+
+ switch (za.za_integer_length) {
+ case 8:
+ /* integer property */
+ if (za.za_first_integer !=
+ zpool_prop_default_numeric(prop))
+ src = ZPROP_SRC_LOCAL;
+
+ /*
+ * bootfs is stored as a dataset object number but is
+ * reported by name, so look the dataset up.
+ */
+ if (prop == ZPOOL_PROP_BOOTFS) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if (err = dsl_dataset_open_obj(dp,
+ za.za_first_integer, NULL, DS_MODE_NONE,
+ FTAG, &ds)) {
+ rw_exit(&dp->dp_config_rwlock);
+ break;
+ }
+
+ strval = kmem_alloc(
+ MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
+ KM_SLEEP);
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+ } else {
+ strval = NULL;
+ intval = za.za_first_integer;
+ }
+
+ spa_prop_add_list(*nvp, prop, strval, intval, src);
+
+ /* Must match the size passed to kmem_alloc() above. */
+ if (strval != NULL)
+ kmem_free(strval,
+ MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
+
+ break;
+
+ case 1:
+ /* string property */
+ strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
+ err = zap_lookup(mos, spa->spa_pool_props_object,
+ za.za_name, 1, za.za_num_integers, strval);
+ if (err) {
+ kmem_free(strval, za.za_num_integers);
+ break;
+ }
+ spa_prop_add_list(*nvp, prop, strval, 0, src);
+ kmem_free(strval, za.za_num_integers);
+ break;
+
+ default:
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ mutex_exit(&spa->spa_props_lock);
+ /* NOTE(review): nothing in this function jumps to 'out'. */
+out:
+ /* ENOENT from zap_cursor_retrieve() is the normal end of the walk. */
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Validate the given pool properties nvlist and modify the list
+ * for the property values to be set.
+ *
+ * Checking stops at the first property that fails.  If a bootfs property
+ * was supplied and everything validated, its string entry in 'props' is
+ * replaced by a uint64 entry holding the dataset's object number.
+ *
+ * Returns 0 if all properties are acceptable, otherwise an errno value
+ * (EINVAL for an unknown property name or a bad value, ENOTSUP for an
+ * unsupported bootfs request, or an error from the nvpair/objset calls).
+ * EIO is returned deliberately in the failmode special case described
+ * below.
+ */
+static int
+spa_prop_validate(spa_t *spa, nvlist_t *props)
+{
+ nvpair_t *elem;
+ int error = 0, reset_bootfs = 0;
+ uint64_t objnum;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ zpool_prop_t prop;
+ char *propname, *strval;
+ uint64_t intval;
+ vdev_t *rvdev;
+ char *vdev_type;
+ objset_t *os;
+ char *slash;
+
+ propname = nvpair_name(elem);
+
+ if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
+ return (EINVAL);
+
+ switch (prop) {
+ case ZPOOL_PROP_VERSION:
+ /* Must lie between the current version and SPA_VERSION. */
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error &&
+ (intval < spa_version(spa) || intval > SPA_VERSION))
+ error = EINVAL;
+ break;
+
+ case ZPOOL_PROP_DELEGATION:
+ case ZPOOL_PROP_AUTOREPLACE:
+ /* Only 0 and 1 are accepted. */
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = EINVAL;
+ break;
+
+ case ZPOOL_PROP_BOOTFS:
+ if (spa_version(spa) < SPA_VERSION_BOOTFS) {
+ error = ENOTSUP;
+ break;
+ }
+
+ /*
+ * A bootable filesystem can not be on a RAIDZ pool
+ * nor a striped pool with more than 1 device.
+ */
+ rvdev = spa->spa_root_vdev;
+ vdev_type =
+ rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
+ if (rvdev->vdev_children > 1 ||
+ strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
+ error = ENOTSUP;
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ error = nvpair_value_string(elem, &strval);
+
+ if (!error) {
+ /* An empty name selects the property's default. */
+ if (strval == NULL || strval[0] == '\0') {
+ objnum = zpool_prop_default_numeric(
+ ZPOOL_PROP_BOOTFS);
+ break;
+ }
+
+ /* Open the named dataset just to learn its id. */
+ if (error = dmu_objset_open(strval, DMU_OST_ZFS,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os))
+ break;
+ objnum = dmu_objset_id(os);
+ dmu_objset_close(os);
+ }
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
+ intval > ZIO_FAILURE_MODE_PANIC))
+ error = EINVAL;
+
+ /*
+ * This is a special case which only occurs when
+ * the pool has completely failed. This allows
+ * the user to change the in-core failmode property
+ * without syncing it out to disk (I/Os might
+ * currently be blocked). We do this by returning
+ * EIO to the caller (spa_prop_set) to trick it
+ * into thinking we encountered a property validation
+ * error.
+ */
+ if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
+ spa->spa_failmode = intval;
+ error = EIO;
+ }
+ break;
+
+ case ZPOOL_PROP_CACHEFILE:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+
+ /* The empty string and "none" are accepted as-is. */
+ if (strval[0] == '\0')
+ break;
+
+ if (strcmp(strval, "none") == 0)
+ break;
+
+ /* Anything else must be an absolute path ... */
+ if (strval[0] != '/') {
+ error = EINVAL;
+ break;
+ }
+
+ slash = strrchr(strval, '/');
+ ASSERT(slash != NULL);
+
+ /* ... whose last component is not empty, "." or "..". */
+ if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
+ strcmp(slash, "/..") == 0)
+ error = EINVAL;
+ break;
+ }
+
+ if (error)
+ break;
+ }
+
+ /* Swap the bootfs dataset name for its object number. */
+ if (!error && reset_bootfs) {
+ error = nvlist_remove(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
+
+ if (!error) {
+ error = nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Set pool properties.  The requested values in 'nvp' are validated first;
+ * any non-zero result from spa_prop_validate() is returned without scheduling
+ * a sync task.  Otherwise the properties are handed to spa_sync_props() via
+ * dsl_sync_task_do() and that call's result is returned.
+ */
+int
+spa_prop_set(spa_t *spa, nvlist_t *nvp)
+{
+	int err = spa_prop_validate(spa, nvp);
+
+	if (err != 0)
+		return (err);
+
+	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
+	    spa, nvp, 3));
+}
+
+/*
+ * If the pool's bootfs property currently refers to 'dsobj', remove the
+ * property from the pool properties ZAP object (as part of 'tx') and clear
+ * the in-core copy.  Nothing is done when the value differs or when the pool
+ * has no properties object.
+ */
+void
+spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
+		return;
+
+	VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object,
+	    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
+	spa->spa_bootfs = 0;
+}
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+/*
+ * AVL comparison function for error list entries.  Entries are ordered by the
+ * raw bytes of their bookmark.
+ *
+ * Returns -1, 0 or 1 when 'a' sorts before, equal to, or after 'b'.
+ */
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+	const spa_error_entry_t *sa = a;
+	const spa_error_entry_t *sb = b;
+	int ret;
+
+	/*
+	 * memcmp() rather than bcmp(): bcmp() only promises zero/non-zero, so
+	 * its result cannot provide the consistent total order an AVL tree
+	 * needs.  The sign of memcmp()'s result is meaningful.
+	 */
+	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
+	    sizeof (zbookmark_t));
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Hand the caller byte copies of the current 'last' and 'scrub' error trees
+ * and then re-create both trees in the spa_t so they start out empty again.
+ * The caller must hold spa_errlist_lock.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+	avl_tree_t *cur_last = &spa->spa_errlist_last;
+	avl_tree_t *cur_scrub = &spa->spa_errlist_scrub;
+
+	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+	/* Snapshot both trees before either one is re-initialized. */
+	bcopy(cur_last, last, sizeof (avl_tree_t));
+	bcopy(cur_scrub, scrub, sizeof (avl_tree_t));
+
+	avl_create(cur_scrub, spa_error_entry_compare,
+	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
+	avl_create(cur_last, spa_error_entry_compare,
+	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Activate an uninitialized pool: mark it active and create the in-core
+ * structures (metaslab classes, per-zio-type taskqs, lists and error trees)
+ * that spa_deactivate() later tears down.
+ */
+static void
+spa_activate(spa_t *spa)
+{
+	int type;
+
+	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+	spa->spa_state = POOL_STATE_ACTIVE;
+
+	spa->spa_normal_class = metaslab_class_create();
+	spa->spa_log_class = metaslab_class_create();
+
+	/* One issue taskq and one interrupt taskq per zio type. */
+	for (type = 0; type < ZIO_TYPES; type++) {
+		spa->spa_zio_issue_taskq[type] = taskq_create("spa_zio_issue",
+		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
+		    TASKQ_PREPOPULATE);
+		spa->spa_zio_intr_taskq[type] = taskq_create("spa_zio_intr",
+		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
+		    TASKQ_PREPOPULATE);
+	}
+
+	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
+	    offsetof(vdev_t, vdev_dirty_node));
+	list_create(&spa->spa_zio_list, sizeof (zio_t),
+	    offsetof(zio_t, zio_link_node));
+
+	txg_list_create(&spa->spa_vdev_txg_list,
+	    offsetof(struct vdev, vdev_txg_node));
+
+	/* Both error trees share the same comparator and node layout. */
+	avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare,
+	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
+	avl_create(&spa->spa_errlist_last, spa_error_entry_compare,
+	    sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Opposite of spa_activate(): destroy the lists, taskqs, metaslab classes and
+ * error trees and return the pool to POOL_STATE_UNINITIALIZED.  Syncing must
+ * already be stopped and the DSL pool and root vdev already gone.
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+	int type;
+
+	ASSERT(spa->spa_sync_on == B_FALSE);
+	ASSERT(spa->spa_dsl_pool == NULL);
+	ASSERT(spa->spa_root_vdev == NULL);
+	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+	txg_list_destroy(&spa->spa_vdev_txg_list);
+	list_destroy(&spa->spa_dirty_list);
+	list_destroy(&spa->spa_zio_list);
+
+	for (type = 0; type < ZIO_TYPES; type++) {
+		taskq_destroy(spa->spa_zio_issue_taskq[type]);
+		taskq_destroy(spa->spa_zio_intr_taskq[type]);
+		spa->spa_zio_issue_taskq[type] = NULL;
+		spa->spa_zio_intr_taskq[type] = NULL;
+	}
+
+	metaslab_class_destroy(spa->spa_normal_class);
+	spa->spa_normal_class = NULL;
+	metaslab_class_destroy(spa->spa_log_class);
+	spa->spa_log_class = NULL;
+
+	/*
+	 * A failed import or open may have left entries in the error queues;
+	 * drain them before the trees themselves are destroyed.
+	 */
+	spa_errlog_drain(spa);
+	avl_destroy(&spa->spa_errlist_scrub);
+	avl_destroy(&spa->spa_errlist_last);
+
+	spa->spa_state = POOL_STATE_UNINITIALIZED;
+}
+
+/*
+ * Verify a pool configuration and build the matching vdev tree.  A vdev is
+ * allocated for 'nv' (all validation is done by vdev_alloc()), and for a
+ * non-leaf vdev the ZPOOL_CONFIG_CHILDREN array is parsed recursively with
+ * the new vdev as parent.  This preps the pool before open/creation/import.
+ *
+ * Returns 0 with the new vdev in *vdp.  On failure after allocation the vdev
+ * is freed, *vdp is set to NULL and an errno is returned (EINVAL when a
+ * non-leaf vdev has no child array).
+ */
+static int
+spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
+    uint_t id, int atype)
+{
+	nvlist_t **kids;
+	uint_t idx, nkids;
+	int err;
+
+	err = vdev_alloc(spa, vdp, nv, parent, id, atype);
+	if (err != 0)
+		return (err);
+
+	/* Leaves have no children to descend into. */
+	if ((*vdp)->vdev_ops->vdev_op_leaf)
+		return (0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0) {
+		vdev_free(*vdp);
+		*vdp = NULL;
+		return (EINVAL);
+	}
+
+	for (idx = 0; idx < nkids; idx++) {
+		vdev_t *cvd;
+
+		err = spa_config_parse(spa, &cvd, kids[idx], *vdp, idx,
+		    atype);
+		if (err == 0)
+			continue;
+
+		vdev_free(*vdp);
+		*vdp = NULL;
+		return (err);
+	}
+
+	ASSERT(*vdp != NULL);
+
+	return (0);
+}
+
+/*
+ * Opposite of spa_load().
+ *
+ * Suspends async tasks, stops the sync thread if it is running, drops the
+ * level 2 cache, closes the DSL pool and frees the root vdev tree along with
+ * the spare and l2cache auxiliary vdevs and their stashed configs.  Async
+ * suspension is cleared again on the way out.
+ */
+static void
+spa_unload(spa_t *spa)
+{
+	int i;
+
+	/*
+	 * Stop async tasks.
+	 */
+	spa_async_suspend(spa);
+
+	/*
+	 * Stop syncing.
+	 */
+	if (spa->spa_sync_on) {
+		txg_sync_stop(spa->spa_dsl_pool);
+		spa->spa_sync_on = B_FALSE;
+	}
+
+	/*
+	 * Wait for any outstanding prefetch I/O to complete.
+	 */
+	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa_config_exit(spa, FTAG);
+
+	/*
+	 * Drop and purge level 2 cache
+	 */
+	spa_l2cache_drop(spa);
+
+	/*
+	 * Close the dsl pool.
+	 */
+	if (spa->spa_dsl_pool) {
+		dsl_pool_close(spa->spa_dsl_pool);
+		spa->spa_dsl_pool = NULL;
+	}
+
+	/*
+	 * Close all vdevs.
+	 */
+	if (spa->spa_root_vdev)
+		vdev_free(spa->spa_root_vdev);
+	ASSERT(spa->spa_root_vdev == NULL);
+
+	for (i = 0; i < spa->spa_spares.sav_count; i++)
+		vdev_free(spa->spa_spares.sav_vdevs[i]);
+	if (spa->spa_spares.sav_vdevs) {
+		kmem_free(spa->spa_spares.sav_vdevs,
+		    spa->spa_spares.sav_count * sizeof (void *));
+		spa->spa_spares.sav_vdevs = NULL;
+	}
+	if (spa->spa_spares.sav_config) {
+		nvlist_free(spa->spa_spares.sav_config);
+		spa->spa_spares.sav_config = NULL;
+	}
+	/*
+	 * The array is gone, so the count must not outlive it: code such as
+	 * spa_load_spares() walks sav_vdevs[0 .. sav_count) on the next load.
+	 */
+	spa->spa_spares.sav_count = 0;
+
+	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
+		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+	if (spa->spa_l2cache.sav_vdevs) {
+		kmem_free(spa->spa_l2cache.sav_vdevs,
+		    spa->spa_l2cache.sav_count * sizeof (void *));
+		spa->spa_l2cache.sav_vdevs = NULL;
+	}
+	if (spa->spa_l2cache.sav_config) {
+		nvlist_free(spa->spa_l2cache.sav_config);
+		spa->spa_l2cache.sav_config = NULL;
+	}
+	/* Same reasoning as for the spares above. */
+	spa->spa_l2cache.sav_count = 0;
+
+	spa->spa_async_suspended = 0;
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active spares for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ *
+ * Any spare vdevs from a previous load are closed and freed first. If no
+ * spare config is stashed, or it lists no spares, the function returns with
+ * an empty spare list (sav_count == 0, sav_vdevs == NULL).
+ */
+static void
+spa_load_spares(spa_t *spa)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int i;
+ vdev_t *vd, *tvd;
+
+ /*
+ * First, close and free any existing spare vdevs.
+ */
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+
+ /*
+ * Undo the spare registration made for the in-pool vdev below.
+ * NOTE(review): this comment used to name spa_activate();
+ * presumably spa_spare_add()/spa_spare_activate() was meant.
+ */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
+ tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
+
+ if (spa->spa_spares.sav_vdevs)
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+
+ if (spa->spa_spares.sav_config == NULL)
+ nspares = 0;
+ else
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ spa->spa_spares.sav_count = (int)nspares;
+ spa->spa_spares.sav_vdevs = NULL;
+
+ if (nspares == 0)
+ return;
+
+ /*
+ * Construct the array of vdevs, opening them to get status in the
+ * process. For each spare, there are potentially two different vdev_t
+ * structures associated with it: one in the list of spares (used only
+ * for basic validation purposes) and one in the active vdev
+ * configuration (if it's spared in). During this phase we open and
+ * validate each vdev on the spare list. If the vdev also exists in the
+ * active configuration, then we also mark this vdev as an active spare.
+ */
+ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ VDEV_ALLOC_SPARE) == 0);
+ ASSERT(vd != NULL);
+
+ spa->spa_spares.sav_vdevs[i] = vd;
+
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
+ if (!tvd->vdev_isspare)
+ spa_spare_add(tvd);
+
+ /*
+ * We only mark the spare active if we were successfully
+ * able to load the vdev. Otherwise, importing a pool
+ * with a bad active spare would result in strange
+ * behavior, because multiple pools would think the spare
+ * is actively in use.
+ *
+ * There is a vulnerability here to an equally bizarre
+ * circumstance, where a dead active spare is later
+ * brought back to life (onlined or otherwise). Given
+ * the rarity of this scenario, and the extra complexity
+ * it adds, we ignore the possibility.
+ */
+ if (!vdev_is_dead(tvd))
+ spa_spare_activate(tvd);
+ }
+
+ if (vdev_open(vd) != 0)
+ continue; /* unopenable spares stay in sav_vdevs but are not added */
+
+ vd->vdev_top = vd;
+ if (vdev_validate_aux(vd) == 0)
+ spa_spare_add(vd);
+ }
+
+ /*
+ * Recompute the stashed list of spares, with status information
+ * this time. The generated nvlists are freed again once they have
+ * been added to sav_config, as is the temporary array holding them.
+ */
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active l2cache for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ * Devices which are already active have their details maintained, and are
+ * not re-opened.
+ *
+ * Old and new devices are matched by vdev guid. Old devices that no longer
+ * appear in the config are closed and removed. When sav_config is NULL the
+ * result is an empty l2cache list.
+ */
+static void
+spa_load_l2cache(spa_t *spa)
+{
+ nvlist_t **l2cache;
+ uint_t nl2cache;
+ int i, j, oldnvdevs;
+ uint64_t guid;
+ vdev_t *vd, **oldvdevs, **newvdevs;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ if (sav->sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
+ } else {
+ nl2cache = 0; /* l2cache and newvdevs stay unset and are never read */
+ }
+
+ oldvdevs = sav->sav_vdevs;
+ oldnvdevs = sav->sav_count;
+ sav->sav_vdevs = NULL;
+ sav->sav_count = 0;
+
+ /*
+ * Process new nvlist of vdevs.
+ */
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
+ &guid) == 0);
+
+ newvdevs[i] = NULL;
+ for (j = 0; j < oldnvdevs; j++) {
+ vd = oldvdevs[j];
+ if (vd != NULL && guid == vd->vdev_guid) {
+ /*
+ * Retain previous vdev for add/remove ops.
+ * Clearing the old slot keeps it out of the
+ * purge loop further down.
+ */
+ newvdevs[i] = vd;
+ oldvdevs[j] = NULL;
+ break;
+ }
+ }
+
+ if (newvdevs[i] == NULL) {
+ /*
+ * Create new vdev
+ */
+ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
+ VDEV_ALLOC_L2CACHE) == 0);
+ ASSERT(vd != NULL);
+ newvdevs[i] = vd;
+
+ /*
+ * Commit this vdev as an l2cache device,
+ * even if it fails to open.
+ */
+ spa_l2cache_add(vd);
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ vd->vdev_top = vd;
+ (void) vdev_validate_aux(vd);
+
+ if (!vdev_is_dead(vd)) {
+ uint64_t size;
+ size = vdev_get_rsize(vd);
+ ASSERT3U(size, >, 0);
+ if (spa_mode & FWRITE) { /* only hand the device to the l2arc when writable */
+ l2arc_add_vdev(spa, vd,
+ VDEV_LABEL_START_SIZE,
+ size - VDEV_LABEL_START_SIZE);
+ }
+ spa_l2cache_activate(vd);
+ }
+ }
+ }
+
+ /*
+ * Purge vdevs that were dropped
+ */
+ for (i = 0; i < oldnvdevs; i++) {
+ uint64_t pool;
+
+ vd = oldvdevs[i];
+ if (vd != NULL) {
+ if (spa_mode & FWRITE &&
+ spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL) {
+ l2arc_remove_vdev(vd);
+ }
+ (void) vdev_close(vd);
+ spa_l2cache_remove(vd);
+ }
+ }
+
+ if (oldvdevs)
+ kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
+
+ if (sav->sav_config == NULL)
+ goto out; /* sav_count is still 0, so the cleanup at 'out' does nothing */
+
+ sav->sav_vdevs = newvdevs;
+ sav->sav_count = (int)nl2cache;
+
+ /*
+ * Recompute the stashed list of l2cache devices, with status
+ * information this time.
+ */
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ l2cache[i] = vdev_config_generate(spa,
+ sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
+out:
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(l2cache[i]);
+ if (sav->sav_count)
+ kmem_free(l2cache, sav->sav_count * sizeof (void *));
+}
+
+/*
+ * Read the packed nvlist stored in object 'obj' of the pool's meta objset and
+ * unpack it into *value.  The packed size is taken from the first 64-bit word
+ * of the object's bonus buffer.
+ *
+ * Returns 0 on success; otherwise the error from dmu_read() or
+ * nvlist_unpack().  *value is set to NULL before anything else is done.
+ */
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+	dmu_buf_t *bonus;
+	char *buf = NULL;
+	size_t len = 0;
+	int err;
+
+	*value = NULL;
+
+	/* The bonus buffer is only needed long enough to fetch the size. */
+	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus));
+	len = *(uint64_t *)bonus->db_data;
+	dmu_buf_rele(bonus, FTAG);
+
+	buf = kmem_alloc(len, KM_SLEEP);
+	err = dmu_read(spa->spa_meta_objset, obj, 0, len, buf);
+	if (err == 0)
+		err = nvlist_unpack(buf, len, value, 0);
+	kmem_free(buf, len);
+
+	return (err);
+}
+
+/*
+ * Walk the vdev tree rooted at 'vd' (children first) and, for every leaf vdev
+ * that vdev_is_dead() reports as dead, post an autoreplace notification and
+ * an ESC_ZFS_VDEV_CHECK sysevent so the autoreplace code learns the device
+ * has been removed.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+	int child;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		spa_check_removed(vd->vdev_child[child]);
+
+	/* Only dead leaves are reported. */
+	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_dead(vd))
+		return;
+
+	zfs_post_autoreplace(vd->vdev_spa, vd);
+	spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ *
+ * 'config' supplies the vdev tree and pool guid to start from and 'state'
+ * records why the pool is being loaded.  When 'mosconfig' is zero, the config
+ * object stored in the pool is read once the meta objset is available, the
+ * pool is unloaded and re-activated, and spa_load() is re-run on that config
+ * with mosconfig set.
+ *
+ * Returns 0 on success or an errno.  For every failure other than EBADF an
+ * FM_EREPORT_ZFS_POOL ereport is posted.  spa_load_state is reset to
+ * SPA_LOAD_NONE before returning via 'out'.
+ */
+static int
+spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+{
+	int error = 0;
+	nvlist_t *nvroot = NULL;
+	vdev_t *rvd;
+	uberblock_t *ub = &spa->spa_uberblock;
+	uint64_t config_cache_txg = spa->spa_config_txg;
+	uint64_t pool_guid;
+	uint64_t version;
+	zio_t *zio;
+	uint64_t autoreplace = 0;
+
+	spa->spa_load_state = state;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Versioning wasn't explicitly added to the label until later, so if
+	 * it's not present treat it as the initial version.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
+		version = SPA_VERSION_INITIAL;
+
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &spa->spa_config_txg);
+
+	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+	    spa_guid_exists(pool_guid, 0)) {
+		error = EEXIST;
+		goto out;
+	}
+
+	spa->spa_load_guid = pool_guid;
+
+	/*
+	 * Parse the configuration into a vdev tree.  We explicitly set the
+	 * value that will be returned by spa_version() since parsing the
+	 * configuration requires knowing the version number.
+	 */
+	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa->spa_ubsync.ub_version = version;
+	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+	spa_config_exit(spa, FTAG);
+
+	if (error != 0)
+		goto out;
+
+	ASSERT(spa->spa_root_vdev == rvd);
+	ASSERT(spa_guid(spa) == pool_guid);
+
+	/*
+	 * Try to open all vdevs, loading each label in the process.
+	 */
+	error = vdev_open(rvd);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Validate the labels for all leaf vdevs.  We need to grab the config
+	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
+	 * flag.
+	 */
+	spa_config_enter(spa, RW_READER, FTAG);
+	error = vdev_validate(rvd);
+	spa_config_exit(spa, FTAG);
+
+	if (error != 0)
+		goto out;
+
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+		error = ENXIO;
+		goto out;
+	}
+
+	/*
+	 * Find the best uberblock.
+	 */
+	bzero(ub, sizeof (uberblock_t));
+
+	zio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	vdev_uberblock_load(zio, rvd, ub);
+	error = zio_wait(zio);
+
+	/*
+	 * If we weren't able to find a single valid uberblock, return failure.
+	 */
+	if (ub->ub_txg == 0) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = ENXIO;
+		goto out;
+	}
+
+	/*
+	 * If the pool is newer than the code, we can't open it.
+	 */
+	if (ub->ub_version > SPA_VERSION) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_VERSION_NEWER);
+		error = ENOTSUP;
+		goto out;
+	}
+
+	/*
+	 * If the vdev guid sum doesn't match the uberblock, we have an
+	 * incomplete configuration.
+	 */
+	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_BAD_GUID_SUM);
+		error = ENXIO;
+		goto out;
+	}
+
+	/*
+	 * Initialize internal SPA structures.
+	 */
+	spa->spa_state = POOL_STATE_ACTIVE;
+	spa->spa_ubsync = spa->spa_uberblock;
+	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+	if (error) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		goto out;
+	}
+	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+	if (zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	if (!mosconfig) {
+		nvlist_t *newconfig;
+		uint64_t hostid;
+
+		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
+			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			error = EIO;
+			goto out;
+		}
+
+		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
+		    &hostid) == 0) {
+			char *hostname;
+			unsigned long myhostid = 0;
+
+			VERIFY(nvlist_lookup_string(newconfig,
+			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
+
+			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
+			if (hostid != 0 && myhostid != 0 &&
+			    (unsigned long)hostid != myhostid) {
+				cmn_err(CE_WARN, "pool '%s' could not be "
+				    "loaded as it was last accessed by "
+				    "another system (host: %s hostid: 0x%lx). "
+				    "See: http://www.sun.com/msg/ZFS-8000-EY",
+				    spa->spa_name, hostname,
+				    (unsigned long)hostid);
+				/*
+				 * newconfig is only handed off through
+				 * spa_config_set() below, so it must be freed
+				 * on this early exit.  'hostname' was looked
+				 * up in newconfig, hence the free comes after
+				 * the warning has been issued.
+				 */
+				nvlist_free(newconfig);
+				error = EBADF;
+				goto out;
+			}
+		}
+
+		/*
+		 * Start over with the config stored in the pool: tear down
+		 * everything set up so far and re-run the load on it.
+		 */
+		spa_config_set(spa, newconfig);
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_activate(spa);
+
+		return (spa_load(spa, newconfig, state, B_TRUE));
+	}
+
+	if (zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	/*
+	 * Load the bit that tells us to use the new accounting function
+	 * (raid-z deflation).  If we have an older pool, this will not
+	 * be present.
+	 */
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+	    sizeof (uint64_t), 1, &spa->spa_deflate);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	/*
+	 * Load the persistent error log.  If we have an older pool, this will
+	 * not be present.
+	 */
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	/*
+	 * Load the history object.  If we have an older pool, this
+	 * will not be present.
+	 */
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
+	    sizeof (uint64_t), 1, &spa->spa_history);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	/*
+	 * Load any hot spares for this pool.
+	 */
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+	if (error == 0) {
+		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
+		if (load_nvlist(spa, spa->spa_spares.sav_object,
+		    &spa->spa_spares.sav_config) != 0) {
+			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			error = EIO;
+			goto out;
+		}
+
+		spa_config_enter(spa, RW_WRITER, FTAG);
+		spa_load_spares(spa);
+		spa_config_exit(spa, FTAG);
+	}
+
+	/*
+	 * Load any level 2 ARC devices for this pool.
+	 */
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
+	    &spa->spa_l2cache.sav_object);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+	if (error == 0) {
+		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
+		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
+		    &spa->spa_l2cache.sav_config) != 0) {
+			vdev_set_state(rvd, B_TRUE,
+			    VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			error = EIO;
+			goto out;
+		}
+
+		spa_config_enter(spa, RW_WRITER, FTAG);
+		spa_load_l2cache(spa);
+		spa_config_exit(spa, FTAG);
+	}
+
+	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
+
+	if (error && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	if (error == 0) {
+		/*
+		 * Individual properties are optional; a failed lookup simply
+		 * leaves the destination at its current value.
+		 */
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
+		    sizeof (uint64_t), 1, &spa->spa_bootfs);
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
+		    sizeof (uint64_t), 1, &autoreplace);
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
+		    sizeof (uint64_t), 1, &spa->spa_delegation);
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
+		    sizeof (uint64_t), 1, &spa->spa_failmode);
+	}
+
+	/*
+	 * If the 'autoreplace' property is set, then post a resource notifying
+	 * the ZFS DE that it should not issue any faults for unopenable
+	 * devices.  We also iterate over the vdevs, and post a sysevent for any
+	 * unopenable vdevs so that the normal autoreplace handler can take
+	 * over.
+	 */
+	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
+		spa_check_removed(spa->spa_root_vdev);
+
+	/*
+	 * Load the vdev state for all toplevel vdevs.
+	 */
+	vdev_load(rvd);
+
+	/*
+	 * Propagate the leaf DTLs we just loaded all the way up the tree.
+	 */
+	spa_config_enter(spa, RW_WRITER, FTAG);
+	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+	spa_config_exit(spa, FTAG);
+
+	/*
+	 * Check the state of the root vdev.  If it can't be opened, it
+	 * indicates one or more toplevel vdevs are faulted.
+	 */
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+		error = ENXIO;
+		goto out;
+	}
+
+	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+		dmu_tx_t *tx;
+		int need_update = B_FALSE;
+		int c;
+
+		/*
+		 * Claim log blocks that haven't been committed yet.
+		 * This must all happen in a single txg.
+		 */
+		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
+		    spa_first_txg(spa));
+		(void) dmu_objset_find(spa->spa_name,
+		    zil_claim, tx, DS_FIND_CHILDREN);
+		dmu_tx_commit(tx);
+
+		spa->spa_sync_on = B_TRUE;
+		txg_sync_start(spa->spa_dsl_pool);
+
+		/*
+		 * Wait for all claims to sync.
+		 */
+		txg_wait_synced(spa->spa_dsl_pool, 0);
+
+		/*
+		 * If the config cache is stale, or we have uninitialized
+		 * metaslabs (see spa_vdev_add()), then update the config.
+		 */
+		if (config_cache_txg != spa->spa_config_txg ||
+		    state == SPA_LOAD_IMPORT)
+			need_update = B_TRUE;
+
+		for (c = 0; c < rvd->vdev_children; c++)
+			if (rvd->vdev_child[c]->vdev_ms_array == 0)
+				need_update = B_TRUE;
+
+		/*
+		 * Update the config cache asynchronously in case we're the
+		 * root pool, in which case the config cache isn't writable yet.
+		 */
+		if (need_update)
+			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+	}
+
+	error = 0;
+out:
+	if (error && error != EBADF)
+		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+	spa->spa_load_state = SPA_LOAD_NONE;
+	spa->spa_ena = 0;
+
+	return (error);
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time open the pool, without having to keep around the spa_t in some
+ * ambiguous state.
+ *
+ * On success 0 is returned, *spapp holds a reference taken with 'tag', and,
+ * if 'config' is non-NULL, *config receives a freshly generated config.
+ * ENOENT is returned when the pool name is unknown or the load reported
+ * EBADF (in which case the pool is removed from the namespace). On any other
+ * load error *spapp is NULL and *config may still be filled in with the vdev
+ * state seen during the failed open.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+{
+ spa_t *spa;
+ int error;
+ int loaded = B_FALSE;
+ int locked = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ *
+ * 'locked' records whether this invocation took spa_namespace_lock and
+ * is therefore the one that has to drop it again.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+
+ spa_activate(spa);
+
+ error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_validate() returns failure (indicated by
+ * EBADF), it indicates that one of the vdevs indicates
+ * that the pool has been exported or destroyed. If
+ * this is the case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ *
+ * NOTE(review): spa_load() also returns EBADF when the
+ * pool was last accessed by a different hostid, so that
+ * case takes this path too - confirm this is intended.
+ */
+ zfs_post_ok(spa, NULL);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ spa_config_sync();
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ *config = spa_config_generate(spa, NULL, -1ULL,
+ B_TRUE);
+ spa_config_exit(spa, FTAG);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = B_TRUE;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ } else {
+ zfs_post_ok(spa, NULL);
+ spa->spa_last_open_failed = B_FALSE;
+ }
+
+ loaded = B_TRUE;
+ }
+
+ spa_open_ref(spa, tag);
+
+ /*
+ * If we just loaded the pool, resilver anything that's out of date.
+ */
+ if (loaded && (spa_mode & FWRITE))
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ *spapp = spa;
+
+ if (config != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa, FTAG);
+ }
+
+ return (0);
+}
+
+/*
+ * Open the named pool and return a reference to it in *spapp, held with 'tag'.
+ * This is spa_open_common() without a request for the pool config.
+ */
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+	nvlist_t **no_config = NULL;
+
+	return (spa_open_common(name, spapp, tag, no_config));
+}
+
+/*
+ * Look up the named spa_t and bump its inject reference count, which keeps
+ * the pool from being exported or destroyed.  The lookup and the increment
+ * both happen under spa_namespace_lock.  Returns the spa_t, or NULL when no
+ * pool of that name exists.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+	spa_t *found;
+
+	mutex_enter(&spa_namespace_lock);
+	found = spa_lookup(name);
+	if (found != NULL)
+		found->spa_inject_ref++;
+	mutex_exit(&spa_namespace_lock);
+
+	return (found);
+}
+
+/*
+ * Drop one inject reference on 'spa' (the counterpart of the increment done
+ * by spa_inject_addref()), under spa_namespace_lock.
+ */
+void
+spa_inject_delref(spa_t *spa)
+{
+	mutex_enter(&spa_namespace_lock);
+	--spa->spa_inject_ref;
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Add spares device information to the nvlist.
+ *
+ * The stashed spare array is copied under the vdev tree of 'config'.  Every
+ * copied spare for which spa_spare_exists() reports a non-zero pool then has
+ * its stats rewritten to CANT_OPEN / VDEV_AUX_SPARED.  Nothing is done when
+ * the pool has no spares.
+ */
+static void
+spa_add_spares(spa_t *spa, nvlist_t *config)
+{
+	nvlist_t **list;
+	nvlist_t *tree;
+	uint_t idx, count;
+	uint_t nstats;
+	uint64_t devguid;
+	uint64_t owner;
+	vdev_stat_t *stats;
+
+	if (spa->spa_spares.sav_count == 0)
+		return;
+
+	VERIFY(nvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE, &tree) == 0);
+	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+	    ZPOOL_CONFIG_SPARES, &list, &count) == 0);
+	if (count == 0)
+		return;
+
+	/*
+	 * Add the array, then look it up again so that 'list' refers to the
+	 * entries now stored under the vdev tree.
+	 */
+	VERIFY(nvlist_add_nvlist_array(tree,
+	    ZPOOL_CONFIG_SPARES, list, count) == 0);
+	VERIFY(nvlist_lookup_nvlist_array(tree,
+	    ZPOOL_CONFIG_SPARES, &list, &count) == 0);
+
+	/*
+	 * Go through and find any spares which have since been repurposed as
+	 * an active spare, and update their status appropriately.
+	 */
+	for (idx = 0; idx < count; idx++) {
+		VERIFY(nvlist_lookup_uint64(list[idx],
+		    ZPOOL_CONFIG_GUID, &devguid) == 0);
+
+		if (!spa_spare_exists(devguid, &owner) || owner == 0ULL)
+			continue;
+
+		VERIFY(nvlist_lookup_uint64_array(list[idx],
+		    ZPOOL_CONFIG_STATS, (uint64_t **)&stats, &nstats) == 0);
+		stats->vs_state = VDEV_STATE_CANT_OPEN;
+		stats->vs_aux = VDEV_AUX_SPARED;
+	}
+}
+
+/*
+ * Add l2cache device information to the nvlist, including vdev stats.
+ *
+ * The stashed l2cache array is copied under the vdev tree of 'config' and the
+ * stats of each copied entry are refreshed from the matching in-core vdev
+ * (matched by guid).  The config lock is held as reader for the duration.
+ * Nothing is done when the pool has no l2cache devices.
+ */
+static void
+spa_add_l2cache(spa_t *spa, nvlist_t *config)
+{
+	nvlist_t **list;
+	nvlist_t *tree;
+	uint_t idx, slot, count;
+	uint_t nstats;
+	uint64_t devguid;
+	vdev_t *match;
+	vdev_stat_t *stats;
+
+	if (spa->spa_l2cache.sav_count == 0)
+		return;
+
+	spa_config_enter(spa, RW_READER, FTAG);
+
+	VERIFY(nvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE, &tree) == 0);
+	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+	    ZPOOL_CONFIG_L2CACHE, &list, &count) == 0);
+	if (count != 0) {
+		/*
+		 * Add the array, then look it up again so that 'list' refers
+		 * to the entries now stored under the vdev tree.
+		 */
+		VERIFY(nvlist_add_nvlist_array(tree,
+		    ZPOOL_CONFIG_L2CACHE, list, count) == 0);
+		VERIFY(nvlist_lookup_nvlist_array(tree,
+		    ZPOOL_CONFIG_L2CACHE, &list, &count) == 0);
+
+		/*
+		 * Update level 2 cache device stats.
+		 */
+		for (idx = 0; idx < count; idx++) {
+			VERIFY(nvlist_lookup_uint64(list[idx],
+			    ZPOOL_CONFIG_GUID, &devguid) == 0);
+
+			/* Find the in-core vdev carrying this guid. */
+			match = NULL;
+			for (slot = 0; slot < spa->spa_l2cache.sav_count;
+			    slot++) {
+				if (devguid ==
+				    spa->spa_l2cache.sav_vdevs[slot]->vdev_guid) {
+					match = spa->spa_l2cache.sav_vdevs[slot];
+					break;
+				}
+			}
+			ASSERT(match != NULL);
+
+			VERIFY(nvlist_lookup_uint64_array(list[idx],
+			    ZPOOL_CONFIG_STATS, (uint64_t **)&stats,
+			    &nstats) == 0);
+			vdev_get_stats(match, stats);
+		}
+	}
+
+	spa_config_exit(spa, FTAG);
+}
+
+/*
+ * Retrieve the configuration and, optionally, the alternate root of the
+ * pool called 'name'.
+ *
+ * '*config' is cleared and then filled in by spa_open_common().  When the
+ * pool was opened and a config was produced, the error log size and the
+ * spare and l2cache lists are added to it.  If 'altroot' is non-NULL it
+ * receives the pool's alternate root (at most 'buflen' bytes); a pool that
+ * could not be opened is looked up directly under the namespace lock, and
+ * 'altroot' is set to the empty string when no such pool exists.
+ *
+ * Returns the result of spa_open_common().
+ */
+int
+spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
+{
+	spa_t *spa;
+	int error;
+
+	*config = NULL;
+	error = spa_open_common(name, &spa, FTAG, config);
+
+	if (spa != NULL && *config != NULL) {
+		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
+		    spa_get_errlog_size(spa)) == 0);
+
+		spa_add_spares(spa, *config);
+		spa_add_l2cache(spa, *config);
+	}
+
+	if (altroot != NULL) {
+		if (spa != NULL) {
+			spa_altroot(spa, altroot, buflen);
+		} else {
+			spa_t *unopened;
+
+			/*
+			 * We want the alternate root even for faulted pools,
+			 * so cheat and call spa_lookup() directly.  No
+			 * reference is taken on the result, hence it is not
+			 * closed below.
+			 */
+			mutex_enter(&spa_namespace_lock);
+			unopened = spa_lookup(name);
+			if (unopened != NULL)
+				spa_altroot(unopened, altroot, buflen);
+			else
+				altroot[0] = '\0';
+			mutex_exit(&spa_namespace_lock);
+		}
+	}
+
+	if (spa != NULL)
+		spa_close(spa, FTAG);
+
+	return (error);
+}
+
+/*
+ * Validate that the auxiliary device array is well formed.  We must have an
+ * array of nvlists, each which describes a valid leaf vdev.  If 'mode' is
+ * VDEV_ALLOC_SPARE or VDEV_ALLOC_L2CACHE (as used on import), then we allow
+ * devices that fail to open or label to be specified, as long as they are
+ * well-formed.
+ *
+ * spa	pool the devices are being validated for
+ * nvroot	nvlist that may hold the array named 'config'
+ * crtxg	txg passed to vdev_label_init() for each device
+ * mode	allocation mode passed to spa_config_parse()
+ * sav	aux vdev state; its pending list is set while validating
+ * config	name of the array (ZPOOL_CONFIG_SPARES or ZPOOL_CONFIG_L2CACHE)
+ * version	minimum pool version that supports this device type
+ * label	label type passed to vdev_label_init()
+ *
+ * Returns 0 if the array is absent or valid, EINVAL for an empty array or a
+ * non-leaf entry, ENOTSUP if the pool version is too old, ENOTBLK for a
+ * non-disk l2cache device, or the error from parsing, opening or labeling
+ * a device.  On success each entry has its ZPOOL_CONFIG_GUID set.
+ */
+static int
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+    spa_aux_vdev_t *sav, const char *config, uint64_t version,
+    vdev_labeltype_t label)
+{
+	nvlist_t **dev;
+	uint_t i, ndev;
+	vdev_t *vd;
+	int error;
+
+	/*
+	 * It's acceptable to have no devs specified.
+	 */
+	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
+		return (0);
+
+	if (ndev == 0)
+		return (EINVAL);
+
+	/*
+	 * Make sure the pool is formatted with a version that supports this
+	 * device type.
+	 */
+	if (spa_version(spa) < version)
+		return (ENOTSUP);
+
+	/*
+	 * Set the pending device list so we correctly handle device in-use
+	 * checking.
+	 */
+	sav->sav_pending = dev;
+	sav->sav_npending = ndev;
+
+	for (i = 0; i < ndev; i++) {
+		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
+		    mode)) != 0)
+			goto out;
+
+		if (!vd->vdev_ops->vdev_op_leaf) {
+			vdev_free(vd);
+			error = EINVAL;
+			goto out;
+		}
+
+		/*
+		 * The L2ARC currently only supports disk devices.  The vdev
+		 * must be freed here like on every other exit from the loop
+		 * body; otherwise it is leaked.
+		 */
+		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
+		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
+			vdev_free(vd);
+			error = ENOTBLK;
+			goto out;
+		}
+
+		vd->vdev_top = vd;
+
+		if ((error = vdev_open(vd)) == 0 &&
+		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
+			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
+			    vd->vdev_guid) == 0);
+		}
+
+		vdev_free(vd);
+
+		/*
+		 * Open/label failures are fatal only when adding devices;
+		 * in the SPARE and L2CACHE modes they are ignored.
+		 */
+		if (error &&
+		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
+			goto out;
+		else
+			error = 0;
+	}
+
+out:
+	sav->sav_pending = NULL;
+	sav->sav_npending = 0;
+	return (error);
+}
+
+/*
+ * Validate both kinds of auxiliary devices found in 'nvroot': first the hot
+ * spares, then the l2cache devices.  The l2cache list is only checked when
+ * the spares validated cleanly.  Returns 0 or the first error encountered.
+ */
+static int
+spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+	int error;
+
+	error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
+	    VDEV_LABEL_SPARE);
+
+	if (error == 0) {
+		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
+		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
+	}
+
+	return (error);
+}
+
+/*
+ * Store 'ndevs' aux device nvlists under the array named 'config' in
+ * sav->sav_config.  If sav has no config yet, a new nvlist is allocated that
+ * holds just 'devs'.  Otherwise the existing array is replaced by one made
+ * of duplicates of the current entries followed by duplicates of 'devs'.
+ */
+static void
+spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
+    const char *config)
+{
+	nvlist_t **current, **merged;
+	uint_t ncurrent, total;
+	int i;
+
+	if (sav->sav_config == NULL) {
+		/*
+		 * Generate a new dev list.
+		 */
+		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
+		    KM_SLEEP) == 0);
+		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
+		    devs, ndevs) == 0);
+		return;
+	}
+
+	/*
+	 * Generate the new dev list by concatenating the given devices onto
+	 * the current dev list.
+	 */
+	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
+	    &current, &ncurrent) == 0);
+
+	total = ndevs + ncurrent;
+	merged = kmem_alloc(sizeof (void *) * total, KM_SLEEP);
+
+	for (i = 0; i < total; i++) {
+		nvlist_t *src;
+
+		src = (i < ncurrent) ? current[i] : devs[i - ncurrent];
+		VERIFY(nvlist_dup(src, &merged[i], KM_SLEEP) == 0);
+	}
+
+	VERIFY(nvlist_remove(sav->sav_config, config,
+	    DATA_TYPE_NVLIST_ARRAY) == 0);
+	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+	    config, merged, total) == 0);
+
+	/* The temporary duplicates are released once the array is added. */
+	for (i = 0; i < total; i++)
+		nvlist_free(merged[i]);
+	kmem_free(merged, total * sizeof (void *));
+}
+
+/*
+ * Stop and drop level 2 ARC devices.
+ *
+ * Every vdev on the pool's l2cache list is removed from the L2ARC (only when
+ * the module is writable and spa_l2cache_exists() reports a non-zero pool
+ * for its guid), taken off the l2cache registry if flagged as an l2cache
+ * device, has its stats cleared, and is closed.
+ */
+void
+spa_l2cache_drop(spa_t *spa)
+{
+	spa_aux_vdev_t *sav = &spa->spa_l2cache;
+	int n;
+
+	for (n = 0; n < sav->sav_count; n++) {
+		vdev_t *cache = sav->sav_vdevs[n];
+		uint64_t pool;
+
+		ASSERT(cache != NULL);
+
+		if ((spa_mode & FWRITE) != 0 &&
+		    spa_l2cache_exists(cache->vdev_guid, &pool) &&
+		    pool != 0ULL)
+			l2arc_remove_vdev(cache);
+
+		if (cache->vdev_isl2cache)
+			spa_l2cache_remove(cache);
+
+		vdev_clear_stats(cache);
+		(void) vdev_close(cache);
+	}
+}
+
+/*
+ * Pool Creation
+ *
+ * pool		name of the new pool; EEXIST is returned if it is taken
+ * nvroot		vdev tree describing the pool; may also carry
+ *			ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays
+ * props		optional pool properties (altroot and version are read
+ *			here, the rest are applied with spa_sync_props())
+ * history_str	optional command string logged to the pool history
+ *
+ * The namespace lock is taken on entry and released on every return path.
+ * On failure after the spa_t has been added, the spa_t is unloaded,
+ * deactivated and removed again.  Returns 0 on success or an errno value.
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
+    const char *history_str)
+{
+	spa_t *spa;
+	char *altroot = NULL;
+	vdev_t *rvd;
+	dsl_pool_t *dp;
+	dmu_tx_t *tx;
+	int c, error = 0;
+	uint64_t txg = TXG_INITIAL;
+	nvlist_t **spares, **l2cache;
+	uint_t nspares, nl2cache;
+	uint64_t version;
+
+	/*
+	 * If this pool already exists, return failure.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	if (spa_lookup(pool) != NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (EEXIST);
+	}
+
+	/*
+	 * Allocate a new spa_t structure.
+	 */
+	(void) nvlist_lookup_string(props,
+	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+	spa = spa_add(pool, altroot);
+	spa_activate(spa);
+
+	spa->spa_uberblock.ub_txg = txg - 1;
+
+	/*
+	 * Reject invalid properties before touching any device.  The
+	 * namespace lock must be dropped here like on every other exit;
+	 * returning with it held would block all later pool operations.
+	 */
+	if (props && (error = spa_prop_validate(spa, props))) {
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_remove(spa);
+		mutex_exit(&spa_namespace_lock);
+		return (error);
+	}
+
+	/*
+	 * Use the requested on-disk version, defaulting to the newest one.
+	 */
+	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
+	    &version) != 0)
+		version = SPA_VERSION;
+	ASSERT(version <= SPA_VERSION);
+	spa->spa_uberblock.ub_version = version;
+	spa->spa_ubsync = spa->spa_uberblock;
+
+	/*
+	 * Create the root vdev.
+	 */
+	spa_config_enter(spa, RW_WRITER, FTAG);
+
+	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+	ASSERT(error != 0 || rvd != NULL);
+	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
+
+	if (error == 0 && !zfs_allocatable_devs(nvroot))
+		error = EINVAL;
+
+	if (error == 0 &&
+	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
+	    (error = spa_validate_aux(spa, nvroot, txg,
+	    VDEV_ALLOC_ADD)) == 0) {
+		for (c = 0; c < rvd->vdev_children; c++)
+			vdev_init(rvd->vdev_child[c], txg);
+		vdev_config_dirty(rvd);
+	}
+
+	spa_config_exit(spa, FTAG);
+
+	if (error != 0) {
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_remove(spa);
+		mutex_exit(&spa_namespace_lock);
+		return (error);
+	}
+
+	/*
+	 * Get the list of spares, if specified.
+	 */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) == 0) {
+		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
+		    KM_SLEEP) == 0);
+		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+		spa_config_enter(spa, RW_WRITER, FTAG);
+		spa_load_spares(spa);
+		spa_config_exit(spa, FTAG);
+		spa->spa_spares.sav_sync = B_TRUE;
+	}
+
+	/*
+	 * Get the list of level 2 cache devices, if specified.
+	 */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+	    &l2cache, &nl2cache) == 0) {
+		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+		spa_config_enter(spa, RW_WRITER, FTAG);
+		spa_load_l2cache(spa);
+		spa_config_exit(spa, FTAG);
+		spa->spa_l2cache.sav_sync = B_TRUE;
+	}
+
+	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
+	spa->spa_meta_objset = dp->dp_meta_objset;
+
+	tx = dmu_tx_create_assigned(dp, txg);
+
+	/*
+	 * Create the pool config object.
+	 */
+	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+	    DMU_OT_PACKED_NVLIST, 1 << 14,
+	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+	if (zap_add(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add pool config");
+	}
+
+	/* Newly created pools with the right version are always deflated. */
+	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
+		spa->spa_deflate = TRUE;
+		if (zap_add(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
+			cmn_err(CE_PANIC, "failed to add deflate");
+		}
+	}
+
+	/*
+	 * Create the deferred-free bplist object.  Turn off compression
+	 * because sync-to-convergence takes longer if the blocksize
+	 * keeps changing.
+	 */
+	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+	    1 << 14, tx);
+	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+	    ZIO_COMPRESS_OFF, tx);
+
+	if (zap_add(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add bplist");
+	}
+
+	/*
+	 * Create the pool's history object.
+	 */
+	if (version >= SPA_VERSION_ZPOOL_HISTORY)
+		spa_history_create_obj(spa, tx);
+
+	/*
+	 * Set pool properties.
+	 */
+	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
+	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+	if (props)
+		spa_sync_props(spa, props, CRED(), tx);
+
+	dmu_tx_commit(tx);
+
+	spa->spa_sync_on = B_TRUE;
+	txg_sync_start(spa->spa_dsl_pool);
+
+	/*
+	 * We explicitly wait for the first transaction to complete so that our
+	 * bean counters are appropriately updated.
+	 */
+	txg_wait_synced(spa->spa_dsl_pool, txg);
+
+	spa_config_sync();
+
+	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
+		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
+
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
+
+/*
+ * Import the given pool into the system. We set up the necessary spa_t and
+ * then call spa_load() to do the dirty work.
+ *
+ * 'pool' is the name to import under; EEXIST is returned if spa_lookup()
+ * already finds a pool by that name. 'config' is handed to spa_load() and
+ * must contain a ZPOOL_CONFIG_VDEV_TREE entry. 'props' may be NULL (the root
+ * pool import passes NULL); the altroot is looked up in it and, when it is
+ * non-NULL, it is applied with spa_prop_set(). 'isroot' selects root pool
+ * behavior: mosconfig is B_FALSE, the loaded spare and l2cache lists are
+ * kept, and no resilver is started.
+ *
+ * The aux devices in the vdev tree are validated twice, once with
+ * VDEV_ALLOC_SPARE and once with VDEV_ALLOC_L2CACHE as the mode.
+ *
+ * The namespace lock is held for the whole operation. If loading, validation
+ * or property setting fails, the spa_t is unloaded, deactivated and removed
+ * before the error is returned. Returns 0 on success.
+ */
+static int
+spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
+ boolean_t isroot)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ int error;
+ nvlist_t *nvroot;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ int mosconfig = isroot? B_FALSE : B_TRUE;
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ spa = spa_add(pool, altroot);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load().
+ * For a regular import mosconfig is TRUE because the user-supplied
+ * config is actually the one to trust when doing an import; for a
+ * root pool import (isroot) mosconfig is FALSE.
+ */
+ error = spa_load(spa, config, SPA_LOAD_IMPORT, mosconfig);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ /*
+ * Toss any existing sparelist, as it doesn't have any validity anymore,
+ * and conflicts with spa_has_spare(). The l2cache list is tossed in the
+ * same way. Neither is done for a root pool import.
+ */
+ if (!isroot && spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ spa_load_spares(spa);
+ }
+ if (!isroot && spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ spa_load_l2cache(spa);
+ }
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if (error == 0)
+ error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
+ if (error == 0)
+ error = spa_validate_aux(spa, nvroot, -1ULL,
+ VDEV_ALLOC_L2CACHE);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Override any spares and level 2 cache devices as specified by
+ * the user, as these may have correct device names/devids, etc.
+ * An existing sav_config has only its device array replaced; if
+ * there is none, a new one is allocated. Either way the list is
+ * reloaded and flagged via sav_sync.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ if (spa->spa_spares.sav_config)
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ if (spa->spa_l2cache.sav_config)
+ VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Update the config cache to include the newly-imported pool.
+ * This is only done when the module was opened for writing.
+ */
+ if (spa_mode & FWRITE)
+ spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot);
+
+ /*
+ * Resilver anything that's out of date.
+ * Skipped for the root pool and when not opened for writing.
+ */
+ if (!isroot && (spa_mode & FWRITE))
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Wrap the top-level vdev found in a root pool device label in a "root"
+ * vdev.
+ *
+ * On entry the ZPOOL_CONFIG_VDEV_TREE of 'config' is the top-level vdev read
+ * from the label.  On return it is a vdev of type VDEV_TYPE_ROOT with id 0,
+ * the pool guid as its guid, and that top-level vdev as its only child.
+ */
+static void
+spa_build_rootpool_config(nvlist_t *config)
+{
+	nvlist_t *top;
+	nvlist_t *root;
+	uint64_t pool_guid;
+
+	/*
+	 * Fetch the pieces of the label that the root vdev is built from.
+	 */
+	VERIFY(nvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE, &top) == 0);
+	VERIFY(nvlist_lookup_uint64(config,
+	    ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
+
+	/*
+	 * Assemble the root vdev with the top-level vdev as its child array.
+	 */
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_string(root,
+	    ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_uint64(root, ZPOOL_CONFIG_ID, 0ULL) == 0);
+	VERIFY(nvlist_add_uint64(root, ZPOOL_CONFIG_GUID, pool_guid) == 0);
+	VERIFY(nvlist_add_nvlist_array(root,
+	    ZPOOL_CONFIG_CHILDREN, &top, 1) == 0);
+
+	/*
+	 * Replace the existing vdev_tree with the new root vdev in
+	 * this pool's configuration, then drop our copy of the root vdev.
+	 */
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, root) == 0);
+	nvlist_free(root);
+}
+
+/*
+ * Get the root pool information from the root disk, then import the root pool
+ * during the system boot up time.
+ *
+ * spa_check_rootconf() reads the label of the device at 'devpath' and keeps
+ * it if its ZPOOL_CONFIG_POOL_TXG is higher than '*besttxg'.  In that case
+ * '*besttxg', '*bestconf' and '*bestdev' are updated and the previously best
+ * config, if any, is freed.  Otherwise the label just read is freed, so that
+ * '*bestconf' is the only label left allocated.  A device whose label
+ * cannot be read is ignored.
+ */
+extern nvlist_t *vdev_disk_read_rootlabel(char *);
+
+void
+spa_check_rootconf(char *devpath, char **bestdev, nvlist_t **bestconf,
+    uint64_t *besttxg)
+{
+	nvlist_t *config;
+	uint64_t txg;
+
+	if ((config = vdev_disk_read_rootlabel(devpath)) == NULL)
+		return;
+
+	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+	if (txg > *besttxg) {
+		*besttxg = txg;
+		if (*bestconf != NULL)
+			nvlist_free(*bestconf);
+		*bestconf = config;
+		*bestdev = devpath;
+	} else {
+		/* Not newer than the current best: release this label. */
+		nvlist_free(config);
+	}
+}
+
+/*
+ * Decide whether the vdev described by 'nv' can be used as a root device.
+ * Returns B_FALSE if the nvlist carries any of the offline, faulted,
+ * degraded or removed entries as a uint64 (whatever its value), and B_TRUE
+ * otherwise.
+ */
+boolean_t
+spa_rootdev_validate(nvlist_t *nv)
+{
+	static const char *const unusable[] = {
+		ZPOOL_CONFIG_OFFLINE,
+		ZPOOL_CONFIG_FAULTED,
+		ZPOOL_CONFIG_DEGRADED,
+		ZPOOL_CONFIG_REMOVED
+	};
+	uint64_t ival;
+	int k;
+
+	for (k = 0; k < sizeof (unusable) / sizeof (unusable[0]); k++) {
+		if (nvlist_lookup_uint64(nv, unusable[k], &ival) == 0)
+			return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Import a root pool.
+ *
+ * For x86, devpath_list consists of the physpath name of the vdev in a single
+ * disk root pool, or a list of physnames for the vdevs in a mirrored rootpool.
+ * e.g.
+ *	"/pci@1f,0/ide@d/disk@0,0:a /pci@1f,o/ide@d/disk@2,0:a"
+ *
+ * For Sparc, devpath_list consists of the physpath name of the booting device,
+ * no matter whether the rootpool is a single device pool or a mirrored pool.
+ * e.g.
+ *	"/pci@1f,0/ide@d/disk@0,0:a"
+ *
+ * Returns 0 on success, including when the pool is already imported (EEXIST
+ * from the import is treated as success).  If no root configuration can be
+ * obtained, a notice is logged and the error is returned.
+ */
+int
+spa_import_rootpool(char *devpath_list)
+{
+	nvlist_t *rootconf = NULL;
+	char *bootdev = NULL;
+	char *poolname;
+	int error;
+
+	/*
+	 * Get the vdev pathname and configuration from the most
+	 * recently updated vdev (highest txg).
+	 */
+	error = spa_get_rootconf(devpath_list, &bootdev, &rootconf);
+	if (error != 0) {
+		cmn_err(CE_NOTE, "\n\n"
+		    " *************************************************** \n"
+		    " * This device is not bootable! * \n"
+		    " * It is either offlined or detached or faulted. * \n"
+		    " * Please try to boot from a different device. * \n"
+		    " *************************************************** \n\n");
+		return (error);
+	}
+
+	/*
+	 * Add type "root" vdev to the config.
+	 */
+	spa_build_rootpool_config(rootconf);
+
+	VERIFY(nvlist_lookup_string(rootconf,
+	    ZPOOL_CONFIG_POOL_NAME, &poolname) == 0);
+
+	error = spa_import_common(poolname, rootconf, NULL, TRUE);
+
+	nvlist_free(rootconf);
+
+	/* An already-imported root pool is not a failure. */
+	return (error == EEXIST ? 0 : error);
+}
+#endif
+
+/*
+ * Import a non-root pool into the system.  This is spa_import_common() with
+ * the root pool handling switched off; its result is returned unchanged.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+{
+	int error;
+
+	error = spa_import_common(pool, config, props, FALSE);
+
+	return (error);
+}
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ *
+ * spa_tryimport() below adds a spa_t under this name, loads 'tryconfig' into
+ * it, generates the resulting config, and then unloads and removes the spa_t
+ * again, all under the namespace lock. It returns NULL if 'tryconfig' lacks
+ * a pool name or pool state, or if the load left no root vdev; otherwise it
+ * returns the generated config with the pool name, state, timestamp, bootfs
+ * (if set), spares and l2cache devices added.
+ */
+#define TRYIMPORT_NAME "$import"
+
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname;
+ spa_t *spa;
+ uint64_t state;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME, NULL);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load().
+ * Pass TRUE for mosconfig because the user-supplied config
+ * is actually the one to trust when doing an import.
+ * The result of spa_load() is deliberately ignored; whether the
+ * config was usable is judged by spa_root_vdev below.
+ */
+ (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ * The pool name and state are taken from 'tryconfig', not from the
+ * temporary spa_t.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa, FTAG);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ spa->spa_uberblock.ub_timestamp) == 0);
+
+ /*
+ * If the bootfs property exists on this pool then we
+ * copy it out so that external consumers can tell which
+ * pools are bootable.
+ */
+ if (spa->spa_bootfs) {
+ char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * We have to play games with the name since the
+ * pool was opened as TRYIMPORT_NAME: everything up to
+ * the first '/' is replaced by the real pool name. A
+ * name without a '/' is copied unchanged.
+ */
+ if (dsl_dsobj_to_dsname(spa->spa_name,
+ spa->spa_bootfs, tmpname) == 0) {
+ char *cp;
+ char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ cp = strchr(tmpname, '/');
+ if (cp == NULL) {
+ (void) strlcpy(dsname, tmpname,
+ MAXPATHLEN);
+ } else {
+ (void) snprintf(dsname, MAXPATHLEN,
+ "%s/%s", poolname, ++cp);
+ }
+ VERIFY(nvlist_add_string(config,
+ ZPOOL_CONFIG_BOOTFS, dsname) == 0);
+ kmem_free(dsname, MAXPATHLEN);
+ }
+ kmem_free(tmpname, MAXPATHLEN);
+ }
+
+ /*
+ * Add the list of hot spares and level 2 cache devices.
+ */
+ spa_add_spares(spa, config);
+ spa_add_l2cache(spa, config);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards.
+ *
+ * 'pool' is the name of the pool. 'new_state' is POOL_STATE_DESTROYED,
+ * POOL_STATE_EXPORTED, or POOL_STATE_UNINITIALIZED; the latter is a reset,
+ * for which the labels are not dirtied, the spa_t is not removed from the
+ * namespace, and fault injection references do not make the pool busy.
+ * 'oldconfig' is optional: it is set to NULL on entry and, when the pool has
+ * a spa_config at the end, receives a duplicate of it.
+ *
+ * Returns EROFS if the module was not opened for writing, ENOENT if the pool
+ * is not found, EBUSY if it still has references, and 0 on success.
+ */
+static int
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
+{
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ /*
+ * The pool will be in core if it's openable,
+ * in which case we can modify its state.
+ */
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+ /*
+ * Objsets may be open only because they're dirty, so we
+ * have to force it to sync before checking spa_refcnt.
+ */
+ spa_scrub_suspend(spa);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ * On this failure path the scrub and async suspensions taken
+ * above are undone before returning.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
+ spa_scrub_resume(spa);
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (EBUSY);
+ }
+
+ spa_scrub_resume(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, FTAG);
+ }
+ }
+
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_config_check(spa->spa_config_dir,
+ spa->spa_config_file);
+ spa_remove(spa);
+ spa_config_sync();
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool: run the common export path with the pool's new
+ * state set to POOL_STATE_DESTROYED.  The old config is not returned.
+ */
+int
+spa_destroy(char *pool)
+{
+	int error;
+
+	error = spa_export_common(pool, POOL_STATE_DESTROYED, NULL);
+
+	return (error);
+}
+
+/*
+ * Export a storage pool: run the common export path with the pool's new
+ * state set to POOL_STATE_EXPORTED.  'oldconfig' is passed through as given.
+ */
+int
+spa_export(char *pool, nvlist_t **oldconfig)
+{
+	int error;
+
+	error = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig);
+
+	return (error);
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.  It does so by running the common export
+ * path with POOL_STATE_UNINITIALIZED as the new state.
+ */
+int
+spa_reset(char *pool)
+{
+	int error;
+
+	error = spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL);
+
+	return (error);
+}
+
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add a device to a storage pool.
+ *
+ * 'nvroot' is parsed with spa_config_parse() and may describe new top-level
+ * vdevs (its children), hot spares (ZPOOL_CONFIG_SPARES) and l2cache devices
+ * (ZPOOL_CONFIG_L2CACHE). EINVAL is returned if it contains none of these.
+ * Errors from parsing, vdev_create() or spa_validate_aux() are returned
+ * through spa_vdev_exit(). On success the new top-level vdevs are moved
+ * under the root vdev, the spare and l2cache lists are extended and
+ * reloaded, and the pool config is updated. Returns 0 on success.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg;
+ int c, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
+ &nspares) != 0)
+ nspares = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
+ &nl2cache) != 0)
+ nl2cache = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+
+ if (vd->vdev_children != 0) {
+ if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+ }
+
+ /*
+ * We must validate the spares and l2cache devices after checking the
+ * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
+ */
+ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ spa->spa_pending_vdev = NULL;
+
+ /*
+ * Transfer each new top-level vdev from vd to rvd.
+ * Each one takes the next free child id of rvd and is marked
+ * config-dirty.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
+ ZPOOL_CONFIG_SPARES);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ if (nl2cache != 0) {
+ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
+ ZPOOL_CONFIG_L2CACHE);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the guid of any device
+ * in the mirror, and the nvroot for the new device. If the guid specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * Errors returned (through spa_vdev_exit()):
+ * ENODEV - no vdev with the given guid
+ * ENOTSUP - the existing vdev is not a leaf, a spare would replace a
+ * log device, or the parent vdev type does not allow the
+ * requested attach/replace
+ * EINVAL - nvroot could not be parsed, or does not describe exactly
+ * one leaf vdev
+ * EOVERFLOW - the new device is smaller than vdev_get_rsize() of the
+ * existing device
+ * EDOM - the new device has a larger ashift than the top-level vdev
+ * Errors from vdev_create() are returned unchanged. On success a resilver
+ * is started and 0 is returned.
+ */
+int
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+{
+ uint64_t txg, open_txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops;
+ int is_log;
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = vdev_lookup_by_guid(rvd, guid);
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!oldvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = oldvd->vdev_parent;
+
+ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ if (newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ /*
+ * Spares can't replace logs
+ */
+ is_log = oldvd->vdev_islog;
+ if (is_log && newvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ if (!replacing) {
+ /*
+ * For attach, the only allowable parent is a mirror or the root
+ * vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ pvops = &vdev_mirror_ops;
+ } else {
+ /*
+ * Active hot spares can only be replaced by inactive hot
+ * spares.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ pvd->vdev_child[1] == oldvd &&
+ !spa_has_spare(spa, newvd->vdev_guid))
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If the source is a hot spare, and the parent isn't already a
+ * spare, then we want to create a new hot spare. Otherwise, we
+ * want to create a replacing vdev. The user is not allowed to
+ * attach to a spared vdev child unless the 'isspare' state is
+ * the same (spare replaces spare, non-spare replaces
+ * non-spare).
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ else if (pvd->vdev_ops != &vdev_spare_ops &&
+ newvd->vdev_isspare)
+ pvops = &vdev_spare_ops;
+ else
+ pvops = &vdev_replacing_ops;
+ }
+
+ /*
+ * Compare the new device size with the replaceable/attachable
+ * device size.
+ */
+ if (newvd->vdev_psize < vdev_get_rsize(oldvd))
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ /*
+ * The new device cannot have a higher alignment requirement
+ * than the top-level vdev.
+ */
+ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
+ return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ * The new path is newvd's path with "/old" appended; the buffer holds
+ * the path, the four characters of "/old", and the terminating NUL.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) sprintf(oldvd->vdev_path, "%s/%s",
+ newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing, insert the new
+ * mirror/replacing/spare vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ vdev_add_child(pvd, newvd);
+
+ /*
+ * If newvd is smaller than oldvd, but larger than its rsize,
+ * the addition of newvd may have decreased our parent's asize.
+ */
+ pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
+ * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ open_txg = txg + TXG_CONCURRENT_STATES - 1;
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
+ open_txg - TXG_INITIAL + 1);
+ mutex_exit(&newvd->vdev_dtl_lock);
+
+ if (newvd->vdev_isspare)
+ spa_spare_activate(newvd);
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+
+ /*
+ * Kick off a resilver to update newvd. We need to grab the namespace
+ * lock because spa_scrub() needs to post a sysevent with the pool name.
+ */
+ mutex_enter(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror, replacing, or spare vdev.
+ *
+ * 'guid' identifies the leaf vdev to detach.  If 'replace_done' is
+ * specified, only detach when the parent is a replacing vdev and the
+ * device is its first child, or when the parent is a spare vdev.
+ *
+ * Returns ENODEV if no vdev under the root has this guid, ENOTSUP if the
+ * vdev is not a leaf or its parent type does not allow the detach, EBUSY
+ * if it is the only child or no other live child has empty DTLs (except
+ * when detaching child 1 of a replacing/spare vdev), and otherwise the
+ * value returned by spa_vdev_exit().
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+{
+ uint64_t txg;
+ int c, t, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid; /* assigned and read only when unspare is set */
+
+ txg = spa_vdev_enter(spa);
+
+ vd = vdev_lookup_by_guid(rvd, guid);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If replace_done is specified, only remove this device if it's
+ * the first child of a replacing vdev. For the 'spare' vdev, either
+ * disk can be removed.
+ */
+ if (replace_done) {
+ if (pvd->vdev_ops == &vdev_replacing_ops) {
+ if (vd->vdev_id != 0)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ } else if (pvd->vdev_ops != &vdev_spare_ops) {
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
+ }
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= SPA_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If there's only one replica, you can't detach it.
+ */
+ if (pvd->vdev_children <= 1)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If all siblings have non-empty DTLs, this device may have the only
+ * valid copy of the data, which means we cannot safely detach it.
+ *
+ * XXX -- as in the vdev_offline() case, we really want a more
+ * precise DTL check.
+ *
+ * The loop breaks at the first live sibling whose DTLs are both empty;
+ * if it runs to completion, c == pvd->vdev_children and no such
+ * sibling exists.
+ */
+ for (c = 0; c < pvd->vdev_children; c++) {
+ uint64_t dirty;
+
+ cvd = pvd->vdev_child[c];
+ if (cvd == vd)
+ continue;
+ if (vdev_is_dead(cvd))
+ continue;
+ mutex_enter(&cvd->vdev_dtl_lock);
+ dirty = cvd->vdev_dtl_map.sm_space |
+ cvd->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd->vdev_dtl_lock);
+ if (!dirty)
+ break;
+ }
+
+ /*
+ * If we are a replacing or spare vdev, then we can always detach the
+ * latter child, as that is how one cancels the operation.
+ */
+ if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
+ c == pvd->vdev_children)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If we are detaching the original disk from a spare, then it implies
+ * that the spare should become a real disk, and be removed from the
+ * active spare list for the pool.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_id == 0)
+ unspare = B_TRUE;
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ *
+ * The value stored in 'error' here is never examined; it is
+ * overwritten by the spa_vdev_exit() result further down.
+ */
+ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[0];
+
+ /*
+ * If we need to remove the remaining child from the list of hot spares,
+ * do it now, marking the vdev as no longer a spare in the process. We
+ * must do this before vdev_remove_parent(), because that can change the
+ * GUID if it creates a new toplevel GUID.
+ */
+ if (unspare) {
+ ASSERT(cvd->vdev_isspare);
+ spa_spare_remove(cvd);
+ unspare_guid = cvd->vdev_guid;
+ }
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1)
+ vdev_remove_parent(cvd);
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd);
+
+ /*
+ * If the device we just detached was smaller than the others, it may be
+ * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
+ * can't fail because the existing metaslabs are already in core, so
+ * there's nothing to read from disk.
+ */
+ VERIFY(vdev_metaslab_init(tvd, txg) == 0);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ for (t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+
+ /* vd is handed to spa_vdev_exit(); its return value is our result. */
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ *
+ * Note that 'spa' is reused as the iteration cursor from here on, and
+ * that failures from spa_vdev_remove() are deliberately ignored.
+ */
+ if (unspare) {
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa->spa_state != POOL_STATE_ACTIVE)
+ continue;
+
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ }
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * Remove a spares vdev from the nvlist config.
+ *
+ * 'spares'/'nspares' is the current ZPOOL_CONFIG_SPARES array from
+ * sav->sav_config; 'vd' is the caller's lookup result for 'guid' (may be
+ * NULL).  On success the array in sav->sav_config is replaced by a copy
+ * that omits the entry whose guid matches.
+ *
+ * Returns ENOENT if the guid is not in the array and vd is NULL, ENOTSUP
+ * if the guid is not in the array but vd is non-NULL, EBUSY if the guid is
+ * in the array, vd is non-NULL and 'unspare' is not set, and 0 otherwise.
+ */
+static int
+spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
+ nvlist_t **spares, int nspares, vdev_t *vd)
+{
+ nvlist_t *nv, **newspares;
+ int i, j;
+
+ /* Locate the nvlist entry carrying the requested guid, if any. */
+ nv = NULL;
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
+
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ nv = spares[i];
+ break;
+ }
+ }
+
+ /*
+ * Only remove the hot spare if it's not currently in use in this pool.
+ */
+ if (nv == NULL && vd == NULL)
+ return (ENOENT);
+
+ if (nv == NULL && vd != NULL)
+ return (ENOTSUP);
+
+ if (!unspare && nv != NULL && vd != NULL)
+ return (EBUSY);
+
+ /*
+ * Build the replacement array: duplicates of every entry except nv.
+ * With a single spare the replacement is an empty (NULL, 0) array.
+ */
+ if (nspares == 1) {
+ newspares = NULL;
+ } else {
+ newspares = kmem_alloc((nspares - 1) * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0, j = 0; i < nspares; i++) {
+ if (spares[i] != nv)
+ VERIFY(nvlist_dup(spares[i],
+ &newspares[j++], KM_SLEEP) == 0);
+ }
+ }
+
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
+ /* The duplicates and the temporary array are released here. */
+ for (i = 0; i < nspares - 1; i++)
+ nvlist_free(newspares[i]);
+ kmem_free(newspares, (nspares - 1) * sizeof (void *));
+
+ return (0);
+}
+
+/*
+ * Remove an l2cache vdev from the nvlist config.
+ *
+ * 'l2cache'/'nl2cache' is the current ZPOOL_CONFIG_L2CACHE array from
+ * sav->sav_config; 'vd' is the caller's lookup result for 'guid' (may be
+ * NULL, in which case sav->sav_vdevs is searched as well).  On success the
+ * array in sav->sav_config is replaced by a copy without the matching entry.
+ *
+ * Returns ENOENT if the guid is found in neither the array nor as a vdev,
+ * ENOTSUP if a vdev was found but the guid is not in the array, and 0
+ * otherwise.
+ */
+static int
+spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
+ int nl2cache, vdev_t *vd)
+{
+ nvlist_t *nv, **newl2cache;
+ int i, j;
+
+ /* Locate the nvlist entry carrying the requested guid, if any. */
+ nv = NULL;
+ for (i = 0; i < nl2cache; i++) {
+ uint64_t theguid;
+
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ nv = l2cache[i];
+ break;
+ }
+ }
+
+ /*
+ * NOTE(review): sav_vdevs is indexed up to nl2cache here, which
+ * assumes it holds at least as many entries as the nvlist array --
+ * confirm against the code that maintains sav_count.
+ */
+ if (vd == NULL) {
+ for (i = 0; i < nl2cache; i++) {
+ if (sav->sav_vdevs[i]->vdev_guid == guid) {
+ vd = sav->sav_vdevs[i];
+ break;
+ }
+ }
+ }
+
+ if (nv == NULL && vd == NULL)
+ return (ENOENT);
+
+ if (nv == NULL && vd != NULL)
+ return (ENOTSUP);
+
+ /*
+ * Build the replacement array: duplicates of every entry except nv.
+ * With a single device the replacement is an empty (NULL, 0) array.
+ */
+ if (nl2cache == 1) {
+ newl2cache = NULL;
+ } else {
+ newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0, j = 0; i < nl2cache; i++) {
+ if (l2cache[i] != nv)
+ VERIFY(nvlist_dup(l2cache[i],
+ &newl2cache[j++], KM_SLEEP) == 0);
+ }
+ }
+
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
+ /* The duplicates and the temporary array are released here. */
+ for (i = 0; i < nl2cache - 1; i++)
+ nvlist_free(newl2cache[i]);
+ kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
+
+ return (0);
+}
+
+/*
+ * Remove a device from the pool. Currently, this supports removing only hot
+ * spares and level 2 ARC devices.
+ *
+ * The whole operation runs with the spa config lock held as writer.  The
+ * spares list is tried first; a guid handled there never reaches the
+ * l2cache branch.  'unspare' is passed through to spa_remove_spares().
+ *
+ * Returns the error from spa_remove_spares()/spa_remove_l2cache(), or 0,
+ * including when the guid matches neither list.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ int error = 0;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ vd = spa_lookup_by_guid(spa, guid);
+
+ if (spa->spa_spares.sav_vdevs != NULL &&
+ spa_spare_exists(guid, NULL) &&
+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
+ if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
+ spares, nspares, vd)) != 0)
+ goto out;
+ /* Reload the in-core spares and flag the list for syncing. */
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ goto out;
+ }
+
+ if (spa->spa_l2cache.sav_vdevs != NULL &&
+ spa_l2cache_exists(guid, NULL) &&
+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
+ if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
+ l2cache, nl2cache, vd)) != 0)
+ goto out;
+ /* Reload the in-core l2cache devices and flag the list for syncing. */
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+out:
+ spa_config_exit(spa, FTAG);
+ return (error);
+}
+
+/*
+ * Find any device that's done replacing, or a vdev marked 'unspare' that's
+ * currently spared, so we can detach it.
+ *
+ * The subtree rooted at 'vd' is searched children-first, and the first
+ * match is returned.  Returns the vdev that should be detached, or NULL
+ * if nothing under 'vd' qualifies.
+ */
+static vdev_t *
+spa_vdev_resilver_done_hunt(vdev_t *vd)
+{
+ vdev_t *newvd, *oldvd;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed replacement.
+ * The old device (child 0) is returned once both DTLs of the new
+ * device (child 1) are empty.
+ */
+ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ oldvd = vd->vdev_child[0];
+ newvd = vd->vdev_child[1];
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ if (newvd->vdev_dtl_map.sm_space == 0 &&
+ newvd->vdev_dtl_scrub.sm_space == 0) {
+ mutex_exit(&newvd->vdev_dtl_lock);
+ return (oldvd);
+ }
+ mutex_exit(&newvd->vdev_dtl_lock);
+ }
+
+ /*
+ * Check for a completed resilver with the 'unspare' flag set.
+ * Here the roles are swapped: child 0 is the device being checked and
+ * child 1 is the one returned.  The flag is cleared on a match.
+ */
+ if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
+ newvd = vd->vdev_child[0];
+ oldvd = vd->vdev_child[1];
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ if (newvd->vdev_unspare &&
+ newvd->vdev_dtl_map.sm_space == 0 &&
+ newvd->vdev_dtl_scrub.sm_space == 0) {
+ newvd->vdev_unspare = 0;
+ mutex_exit(&newvd->vdev_dtl_lock);
+ return (oldvd);
+ }
+ mutex_exit(&newvd->vdev_dtl_lock);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Detach every device whose replacement (or 'unspare' resilver) has
+ * completed.
+ *
+ * spa_vdev_resilver_done_hunt() is called repeatedly; each device it
+ * returns is detached with spa_vdev_detach(..., B_TRUE).  The config lock
+ * is held as reader only while hunting and reading guids, and is dropped
+ * around the detach calls.  Processing stops at the first detach that
+ * fails.
+ */
+static void
+spa_vdev_resilver_done(spa_t *spa)
+{
+	vdev_t *vd;
+	vdev_t *pvd;
+	uint64_t guid;
+	uint64_t pguid;
+
+	spa_config_enter(spa, RW_READER, FTAG);
+
+	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+		guid = vd->vdev_guid;
+
+		/*
+		 * Start every pass with no spare to detach.  A guid left over
+		 * from an earlier pass names a device that has already been
+		 * detached; trying it again would fail and end this loop
+		 * before the remaining completed replacements are handled.
+		 */
+		pguid = 0;
+
+		/*
+		 * If we have just finished replacing a hot spared device, then
+		 * we need to detach the parent's first child (the original hot
+		 * spare) as well.
+		 */
+		pvd = vd->vdev_parent;
+		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+		    pvd->vdev_id == 0) {
+			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
+			ASSERT(pvd->vdev_parent->vdev_children == 2);
+			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+		}
+
+		/* The config lock is not held across spa_vdev_detach(). */
+		spa_config_exit(spa, FTAG);
+		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+			return;
+		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+			return;
+		spa_config_enter(spa, RW_READER, FTAG);
+	}
+
+	spa_config_exit(spa, FTAG);
+}
+
+/*
+ * Update the stored path for this vdev. Dirty the vdev configuration, relying
+ * on spa_vdev_enter/exit() to synchronize the labels and cache.
+ *
+ * 'guid' is looked up first in the root vdev tree, then in the spares
+ * nvlist, then in the l2cache nvlist.
+ *
+ * Returns ENOENT if the guid is found in none of them, ENOTSUP if it names
+ * a non-leaf vdev, and otherwise the value returned by spa_vdev_exit().
+ */
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ rvd = spa->spa_root_vdev;
+
+ txg = spa_vdev_enter(spa);
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ /*
+ * Determine if this is a reference to a hot spare or l2cache
+ * device. If it is, update the path as stored in their
+ * device list.
+ */
+ nvlist_t **spares, **l2cache;
+ uint_t i, nspares, nl2cache;
+
+ if (spa->spa_spares.sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(
+ spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0);
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ /* Store the new path, reload, and flag for sync. */
+ VERIFY(nvlist_add_string(spares[i],
+ ZPOOL_CONFIG_PATH, newpath) == 0);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ return (spa_vdev_exit(spa, NULL, txg,
+ 0));
+ }
+ }
+ }
+
+ if (spa->spa_l2cache.sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(
+ spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0);
+ for (i = 0; i < nl2cache; i++) {
+ uint64_t theguid;
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ /* Store the new path, reload, and flag for sync. */
+ VERIFY(nvlist_add_string(l2cache[i],
+ ZPOOL_CONFIG_PATH, newpath) == 0);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ return (spa_vdev_exit(spa, NULL, txg,
+ 0));
+ }
+ }
+ }
+
+ return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /* Replace the stored path with a private copy of newpath. */
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(newpath);
+
+ vdev_config_dirty(vd->vdev_top);
+
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+}
+
+/*
+ * ==========================================================================
+ * SPA Scrubbing
+ * ==========================================================================
+ */
+
+/*
+ * Completion callback for the reads issued by spa_scrub_io_start().
+ *
+ * Frees the data buffer, counts the error (pool-wide and per-vdev) unless
+ * the I/O was speculative, and decrements the in-flight count, waking
+ * waiters on spa_scrub_io_cv once the count is below the maximum.
+ */
+static void
+spa_scrub_io_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ arc_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ /* Charge the error to the root vdev when the zio names no vdev. */
+ vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
+ spa->spa_scrub_errors++;
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ ASSERT(spa->spa_scrub_inflight >= 0);
+
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Issue an asynchronous scrub/resilver read of the block described by 'bp'.
+ *
+ * Blocks until the number of in-flight scrub I/Os is below
+ * spa_scrub_maxinflight, then allocates a buffer of the block's logical
+ * size and starts the read without waiting for it.  spa_scrub_io_done()
+ * runs on completion.  'priority' and 'flags' are passed to zio_read()
+ * with the scrub-thread and can-fail flags added.
+ */
+static void
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
+ zbookmark_t *zb)
+{
+ size_t size = BP_GET_LSIZE(bp);
+ void *data;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ /*
+ * Do not give too much work to vdev(s).
+ */
+ while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ data = arc_data_buf_alloc(size);
+
+ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
+
+ flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ spa_scrub_io_done, NULL, priority, flags, zb));
+}
+
+/*
+ * Traverse callback used for scrubbing and resilvering (registered through
+ * traverse_init() in spa_scrub()).
+ *
+ * For a block that could not be traversed (bc_errno set) the error is
+ * counted against the pool and the root vdev and ERESTART is returned.
+ * Otherwise per-vdev progress is recorded and, depending on the scrub
+ * type, a read is issued through spa_scrub_io_start(); 0 is returned.
+ * The 'a' argument is unused.
+ */
+/* ARGSUSED */
+static int
+spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+ blkptr_t *bp = &bc->bc_blkptr;
+ vdev_t *vd = spa->spa_root_vdev;
+ dva_t *dva = bp->blk_dva;
+ int needs_resilver = B_FALSE;
+ int d;
+
+ if (bc->bc_errno) {
+ /*
+ * We can't scrub this block, but we can continue to scrub
+ * the rest of the pool. Note the error and move along.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ return (ERESTART);
+ }
+
+ ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
+
+ ASSERT(vd != NULL);
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+ if (DVA_GET_GANG(&dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best we can do is look at the
+ * pool-wide DTL.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that this can't
+ * happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ /* Resilver if any DVA's vdev has the birth txg in its DTL. */
+ if (vdev_dtl_contains(&vd->vdev_dtl_map,
+ bp->blk_birth, 1))
+ needs_resilver = B_TRUE;
+ }
+ }
+
+ /* A full scrub reads every block; a resilver only the flagged ones. */
+ if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SCRUB, &bc->bc_bookmark);
+ else if (needs_resilver)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+ ZIO_FLAG_RESILVER, &bc->bc_bookmark);
+
+ return (0);
+}
+
+/*
+ * Body of the scrub/resilver thread created by spa_scrub().
+ *
+ * Drives the traversal in spa->spa_scrub_th with traverse_more() until it
+ * stops returning EAGAIN, a stop is requested, or a restart txg is posted;
+ * it parks while spa_scrub_suspended is non-zero.  It then waits for
+ * in-flight scrub I/O to drain, reassesses the DTLs, updates scrub
+ * statistics, rotates the error log, posts follow-up async requests and
+ * clears the scrub thread state before exiting.
+ */
+static void
+spa_scrub_thread(spa_t *spa)
+{
+ callb_cpr_t cprinfo;
+ traverse_handle_t *th = spa->spa_scrub_th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ pool_scrub_type_t scrub_type = spa->spa_scrub_type;
+ int error = 0;
+ boolean_t complete;
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
+
+ /*
+ * If we're restarting due to a snapshot create/delete,
+ * wait for that to complete.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ dprintf("start %s mintxg=%llu maxtxg=%llu\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_reopen(rvd); /* purge all vdev caches */
+ vdev_config_dirty(rvd); /* rewrite all disk labels */
+ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
+ spa_config_exit(spa, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors = 0;
+ spa->spa_scrub_active = 1;
+ ASSERT(spa->spa_scrub_inflight == 0);
+
+ /*
+ * Main loop.  spa_scrub_lock is held at the top of every iteration and
+ * dropped only around traverse_more().
+ */
+ while (!spa->spa_scrub_stop) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_scrub_suspended) {
+ /* Report inactivity so spa_scrub_suspend() can return. */
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_active = 1;
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ break;
+
+ mutex_exit(&spa->spa_scrub_lock);
+ error = traverse_more(th);
+ mutex_enter(&spa->spa_scrub_lock);
+ if (error != EAGAIN)
+ break;
+ }
+
+ /* Let every outstanding scrub I/O complete before finishing up. */
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
+ * AND the spa config lock to synchronize with any config changes
+ * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
+ */
+ if (spa->spa_scrub_restart_txg != 0)
+ error = ERESTART;
+
+ /* A stop request takes precedence over a restart request. */
+ if (spa->spa_scrub_stop)
+ error = EINTR;
+
+ /*
+ * Even if there were uncorrectable errors, we consider the scrub
+ * completed. The downside is that if there is a transient error during
+ * a resilver, we won't resilver the data properly to the target. But
+ * if the damage is permanent (more likely) we will resilver forever,
+ * which isn't really acceptable. Since there is enough information for
+ * the user to know what has failed and why, this seems like a more
+ * tractable approach.
+ */
+ complete = (error == 0);
+
+ dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
+ error, spa->spa_scrub_errors, spa->spa_scrub_stop);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to reflect this.
+ * Whether it succeeded or not, vacate all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
+ complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
+ vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
+ spa_errlog_rotate(spa);
+
+ if (scrub_type == POOL_SCRUB_RESILVER && complete)
+ spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
+
+ spa_config_exit(spa, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ /*
+ * If we were told to restart, our final act is to start a new scrub.
+ */
+ if (error == ERESTART)
+ spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
+ SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
+
+ /* Clearing spa_scrub_thread lets a waiter in spa_scrub() proceed. */
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+ cv_broadcast(&spa->spa_scrub_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
+ thread_exit();
+}
+
+/*
+ * Suspend scrubbing.  Increments the suspend count, then waits until the
+ * scrub thread reports itself inactive and all in-flight scrub I/O has
+ * completed.  Each call is undone by one spa_scrub_resume().
+ */
+void
+spa_scrub_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_suspended++;
+ while (spa->spa_scrub_active) {
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Undo one spa_scrub_suspend().  When the suspend count drops to zero,
+ * waiters on spa_scrub_cv (such as a parked scrub thread) are woken.
+ */
+void
+spa_scrub_resume(spa_t *spa)
+{
+	mutex_enter(&spa->spa_scrub_lock);
+
+	ASSERT(spa->spa_scrub_suspended != 0);
+	spa->spa_scrub_suspended--;
+
+	/* Only the last resume broadcasts. */
+	if (spa->spa_scrub_suspended == 0) {
+		cv_broadcast(&spa->spa_scrub_cv);
+	}
+
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Record 'txg' in spa_scrub_restart_txg under spa_scrub_lock.  A non-zero
+ * value makes the scrub thread leave its main loop and finish with
+ * ERESTART.
+ */
+void
+spa_scrub_restart(spa_t *spa, uint64_t txg)
+{
+ /*
+ * Something happened (e.g. snapshot create/delete) that means
+ * we must restart any in-progress scrubs. The itinerary will
+ * fix this properly.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_restart_txg = txg;
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Stop any scrub or resilver in progress and, unless the resulting type is
+ * POOL_SCRUB_NONE, start a new one of the requested 'type'.
+ *
+ * The caller must hold spa_namespace_lock and must not hold the spa config
+ * lock as writer (both are asserted).  A running resilver is only stopped
+ * when 'force' is set.
+ *
+ * Returns ENOTSUP if 'type' is out of range, EBUSY if a resilver is running
+ * and 'force' is not set, and 0 otherwise.
+ */
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ space_seg_t *ss;
+ uint64_t mintxg, maxtxg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(spa, RW_WRITER));
+
+ if ((uint_t)type >= POOL_SCRUB_TYPES)
+ return (ENOTSUP);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * If there's a scrub or resilver already in progress, stop it.
+ */
+ while (spa->spa_scrub_thread != NULL) {
+ /*
+ * Don't stop a resilver unless forced.
+ */
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
+ mutex_exit(&spa->spa_scrub_lock);
+ return (EBUSY);
+ }
+ /* Ask the thread to stop and wait for it to signal back. */
+ spa->spa_scrub_stop = 1;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+
+ /*
+ * Terminate the previous traverse.
+ */
+ if (spa->spa_scrub_th != NULL) {
+ traverse_fini(spa->spa_scrub_th);
+ spa->spa_scrub_th = NULL;
+ }
+
+ /* Without a root vdev there is nothing to scrub. */
+ if (rvd == NULL) {
+ ASSERT(spa->spa_scrub_stop == 0);
+ ASSERT(spa->spa_scrub_type == type);
+ ASSERT(spa->spa_scrub_restart_txg == 0);
+ mutex_exit(&spa->spa_scrub_lock);
+ return (0);
+ }
+
+ /* Default range; narrowed below for a resilver. */
+ mintxg = TXG_INITIAL - 1;
+ maxtxg = spa_last_synced_txg(spa) + 1;
+
+ mutex_enter(&rvd->vdev_dtl_lock);
+
+ if (rvd->vdev_dtl_map.sm_space == 0) {
+ /*
+ * The pool-wide DTL is empty.
+ * If this is a resilver, there's nothing to do except
+ * check whether any in-progress replacements have completed.
+ */
+ if (type == POOL_SCRUB_RESILVER) {
+ type = POOL_SCRUB_NONE;
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ }
+ } else {
+ /*
+ * The pool-wide DTL is non-empty.
+ * If this is a normal scrub, upgrade to a resilver instead.
+ */
+ if (type == POOL_SCRUB_EVERYTHING)
+ type = POOL_SCRUB_RESILVER;
+ }
+
+ if (type == POOL_SCRUB_RESILVER) {
+ /*
+ * Determine the resilvering boundaries.
+ *
+ * Note: (mintxg, maxtxg) is an open interval,
+ * i.e. mintxg and maxtxg themselves are not included.
+ *
+ * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
+ * so we don't claim to resilver a txg that's still changing.
+ *
+ * This branch is only reached with a non-empty pool-wide DTL
+ * (see the sm_space check above).
+ */
+ ss = avl_first(&rvd->vdev_dtl_map.sm_root);
+ mintxg = ss->ss_start - 1;
+ ss = avl_last(&rvd->vdev_dtl_map.sm_root);
+ maxtxg = MIN(ss->ss_end, maxtxg);
+
+ spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+ }
+
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ /* Set up the traversal and start the scrub thread. */
+ if (type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_mintxg = mintxg;
+ spa->spa_scrub_maxtxg = maxtxg;
+ spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
+ ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
+ ZIO_FLAG_CANFAIL);
+ traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
+ spa->spa_scrub_thread = thread_create(NULL, 0,
+ spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ return (0);
+}
+
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+/*
+ * Walk the vdev tree below 'vd' and, for every descendant that has
+ * vdev_remove_wanted set, clear the flag, move the vdev to
+ * VDEV_STATE_REMOVED, clear it with vdev_clear() and dirty the config of
+ * its top-level vdev.  'vd' itself is not examined, only its descendants.
+ */
+static void
+spa_async_remove(spa_t *spa, vdev_t *vd)
+{
+	int i;
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *child = vd->vdev_child[i];
+
+		if (child->vdev_remove_wanted) {
+			child->vdev_remove_wanted = 0;
+			vdev_set_state(child, B_FALSE, VDEV_STATE_REMOVED,
+			    VDEV_AUX_NONE);
+			vdev_clear(spa, child, B_TRUE);
+			vdev_config_dirty(child->vdev_top);
+		}
+
+		/* Descend whether or not this child was flagged. */
+		spa_async_remove(spa, child);
+	}
+}
+
+/*
+ * Body of the async task thread created by spa_async_dispatch().
+ *
+ * Takes a snapshot of the pending task bits and clears them, runs each
+ * requested task in a fixed order (config update, device removal,
+ * resilver-done detach, scrub, resilver), then clears spa_async_thread,
+ * wakes waiters on spa_async_cv and exits.
+ */
+static void
+spa_async_thread(spa_t *spa)
+{
+ int tasks;
+ uint64_t txg;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if the config needs to be updated.
+ */
+ if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * See if any devices need to be marked REMOVED.
+ *
+ * XXX - We avoid doing this when we are in
+ * I/O failure state since spa_vdev_enter() grabs
+ * the namespace lock and would not be able to obtain
+ * the writer config lock.
+ */
+ if (tasks & SPA_ASYNC_REMOVE &&
+ spa_state(spa) != POOL_STATE_IO_FAILURE) {
+ txg = spa_vdev_enter(spa);
+ spa_async_remove(spa, spa->spa_root_vdev);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+ }
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_RESILVER_DONE)
+ spa_vdev_resilver_done(spa);
+
+ /*
+ * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING
+ * scrub which can become a resilver), we need to hold
+ * spa_namespace_lock() because the sysevent we post via
+ * spa_event_notify() needs to get the name of the pool.
+ */
+ if (tasks & SPA_ASYNC_SCRUB) {
+ mutex_enter(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Kick off a resilver.
+ */
+ if (tasks & SPA_ASYNC_RESILVER) {
+ mutex_enter(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+/*
+ * Suspend async task processing.  Increments the suspend count, which
+ * keeps spa_async_dispatch() from starting a new thread, and waits for
+ * any running async thread to finish.
+ */
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+/*
+ * Undo one spa_async_suspend() by decrementing the suspend count.  No
+ * thread is started here; pending tasks wait for the next
+ * spa_async_dispatch().
+ */
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+/*
+ * Start the async task thread if there is work for it.
+ *
+ * A thread is created only when tasks are pending, async processing is not
+ * suspended, no async thread is already running, and rootdir is non-NULL
+ * and not read-only.
+ */
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL &&
+ rootdir != NULL && !vn_is_readonly(rootdir))
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+/*
+ * Post an async task request by OR-ing 'task' (one or more SPA_ASYNC_*
+ * bits) into the pending task mask.  The request is only recorded here;
+ * this function does not start the async thread.
+ */
+void
+spa_async_request(spa_t *spa, int task)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+/*
+ * Free every block recorded in spa->spa_sync_bplist.
+ *
+ * A zio_free() is issued for each entry under a common root zio, which is
+ * then waited on.  Afterwards the bplist is vacated in a transaction
+ * assigned to 'txg', and the first byte of the bplist object is written so
+ * its first block is already dirty.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+{
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ dmu_tx_t *tx;
+ blkptr_t blk;
+ uint64_t itor = 0;
+ zio_t *zio;
+ int error;
+ uint8_t c = 1;
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
+
+ while (bplist_iterate(bpl, &itor, &blk) == 0)
+ zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
+
+ /* The frees are expected to succeed; this is only asserted. */
+ error = zio_wait(zio);
+ ASSERT3U(error, ==, 0);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ bplist_vacate(bpl, tx);
+
+ /*
+ * Pre-dirty the first block so we sync to convergence faster.
+ * (Usually only the first block is needed.)
+ */
+ dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Write nvlist 'nv' into object 'obj' of the pool's meta objset.
+ *
+ * The nvlist is packed with XDR encoding into a temporary buffer and
+ * written at offset 0 of the object.  The packed size is then stored as a
+ * uint64_t at the start of the object's bonus buffer.  Both updates are
+ * made in transaction 'tx'.
+ */
+static void
+spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
+{
+ char *packed = NULL;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+
+ dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
+
+ kmem_free(packed, nvsize);
+
+ /* Record the packed length in the bonus buffer. */
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * Sync the list of auxiliary devices described by 'sav' to the MOS, if it
+ * has been flagged with sav_sync.
+ *
+ * 'config' is the nvlist array name used inside the stored nvlist and
+ * 'entry' is the name under which the object number is recorded in the
+ * pool directory object.  The backing object is allocated on first use.
+ * sav_sync is cleared once the list has been written.
+ */
+static void
+spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
+ const char *config, const char *entry)
+{
+ nvlist_t *nvroot;
+ nvlist_t **list;
+ int i;
+
+ if (!sav->sav_sync)
+ return;
+
+ /*
+ * Update the MOS nvlist describing the list of available devices.
+ * spa_validate_aux() will have already made sure this nvlist is
+ * valid and the vdevs are labeled appropriately.
+ */
+ if (sav->sav_object == 0) {
+ sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
+ &sav->sav_object, tx) == 0);
+ }
+
+ /* Build an nvlist holding one generated config per device. */
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (sav->sav_count == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
+ } else {
+ list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
+ B_FALSE, B_FALSE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
+ sav->sav_count) == 0);
+ /* The generated configs and the temporary array are released here. */
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(list[i]);
+ kmem_free(list, sav->sav_count * sizeof (void *));
+ }
+
+ spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
+ nvlist_free(nvroot);
+
+ sav->sav_sync = B_FALSE;
+}
+
+/*
+ * Write a freshly generated pool config to the config object in the MOS.
+ *
+ * Nothing is done when spa_dirty_list is empty.  Otherwise the new config
+ * replaces (and frees) any previous spa_config_syncing nvlist and is
+ * written with spa_sync_nvlist() in transaction 'tx'.
+ */
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+	if (!list_is_empty(&spa->spa_dirty_list)) {
+		nvlist_t *fresh;
+
+		fresh = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx),
+		    B_FALSE);
+
+		/* Drop the config kept from an earlier pass, if any. */
+		if (spa->spa_config_syncing != NULL)
+			nvlist_free(spa->spa_config_syncing);
+		spa->spa_config_syncing = fresh;
+
+		spa_sync_nvlist(spa, spa->spa_config_object, fresh, tx);
+	}
+}
+
+/*
+ * Set zpool properties.
+ *
+ * arg1 is the spa_t and arg2 is an nvlist of property name/value pairs.
+ * ZPOOL_PROP_VERSION, ZPOOL_PROP_ALTROOT and ZPOOL_PROP_CACHEFILE are
+ * handled specially; every other property is written to the pool
+ * properties ZAP object (created here on first use), and the delegation,
+ * bootfs and failuremode values are also copied into the in-core spa_t.
+ * Each pair is recorded in the pool history unless this is the initial
+ * txg or the pool version predates SPA_VERSION_ZPOOL_HISTORY.
+ */
+static void
+spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	spa_t *spa = arg1;
+	objset_t *mos = spa->spa_meta_objset;
+	nvlist_t *nvp = arg2;
+	nvpair_t *elem;
+	uint64_t intval;
+	char *strval, *slash;
+	zpool_prop_t prop;
+	const char *propname;
+	zprop_type_t proptype;
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(nvp, elem))) {
+		/*
+		 * intval is passed to the history log below for every
+		 * property, but only the uint64-valued paths assign it.
+		 * Reset it on each pass so a string-valued property logs 0
+		 * instead of an uninitialized or stale value.
+		 */
+		intval = 0;
+
+		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+		case ZPOOL_PROP_VERSION:
+			/*
+			 * Only set version for non-zpool-creation cases
+			 * (set/import). spa_create() needs special care
+			 * for version setting.
+			 */
+			if (tx->tx_txg != TXG_INITIAL) {
+				VERIFY(nvpair_value_uint64(elem,
+				    &intval) == 0);
+				ASSERT(intval <= SPA_VERSION);
+				ASSERT(intval >= spa_version(spa));
+				spa->spa_uberblock.ub_version = intval;
+				vdev_config_dirty(spa->spa_root_vdev);
+			}
+			break;
+
+		case ZPOOL_PROP_ALTROOT:
+			/*
+			 * 'altroot' is a non-persistent property. It should
+			 * have been set temporarily at creation or import time.
+			 */
+			ASSERT(spa->spa_root != NULL);
+			break;
+
+		case ZPOOL_PROP_CACHEFILE:
+			/*
+			 * 'cachefile' is a non-persistent property, but note
+			 * an async request that the config cache needs to be
+			 * updated.
+			 */
+			VERIFY(nvpair_value_string(elem, &strval) == 0);
+			if (spa->spa_config_dir)
+				spa_strfree(spa->spa_config_dir);
+			if (spa->spa_config_file)
+				spa_strfree(spa->spa_config_file);
+
+			if (strval[0] == '\0') {
+				spa->spa_config_dir = NULL;
+				spa->spa_config_file = NULL;
+			} else if (strcmp(strval, "none") == 0) {
+				spa->spa_config_dir = spa_strdup(strval);
+				spa->spa_config_file = NULL;
+			} else {
+				/*
+				 * If the cachefile is in the root directory,
+				 * we will end up with an empty string for
+				 * spa_config_dir.  This value is only ever
+				 * used when concatenated with '/', so an empty
+				 * string still behaves correctly and keeps the
+				 * rest of the code simple.
+				 *
+				 * Note that the nvpair's string is split in
+				 * place at the last '/'.
+				 */
+				slash = strrchr(strval, '/');
+				ASSERT(slash != NULL);
+				*slash = '\0';
+				if (strcmp(strval, spa_config_dir) == 0 &&
+				    strcmp(slash + 1, ZPOOL_CACHE_FILE) == 0) {
+					/* same as the default location */
+					spa->spa_config_dir = NULL;
+					spa->spa_config_file = NULL;
+				} else {
+					spa->spa_config_dir =
+					    spa_strdup(strval);
+					spa->spa_config_file =
+					    spa_strdup(slash + 1);
+				}
+			}
+			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+			break;
+		default:
+			/*
+			 * Set pool property values in the poolprops mos object.
+			 */
+			mutex_enter(&spa->spa_props_lock);
+			if (spa->spa_pool_props_object == 0) {
+				VERIFY((spa->spa_pool_props_object =
+				    zap_create(mos, DMU_OT_POOL_PROPS,
+				    DMU_OT_NONE, 0, tx)) > 0);
+
+				VERIFY(zap_update(mos,
+				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
+				    8, 1, &spa->spa_pool_props_object, tx)
+				    == 0);
+			}
+			mutex_exit(&spa->spa_props_lock);
+
+			/* normalize the property name */
+			propname = zpool_prop_to_name(prop);
+			proptype = zpool_prop_get_type(prop);
+
+			if (nvpair_type(elem) == DATA_TYPE_STRING) {
+				ASSERT(proptype == PROP_TYPE_STRING);
+				VERIFY(nvpair_value_string(elem, &strval) == 0);
+				VERIFY(zap_update(mos,
+				    spa->spa_pool_props_object, propname,
+				    1, strlen(strval) + 1, strval, tx) == 0);
+
+			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+
+				if (proptype == PROP_TYPE_INDEX) {
+					/* checks that intval is a valid index */
+					const char *unused;
+					VERIFY(zpool_prop_index_to_string(
+					    prop, intval, &unused) == 0);
+				}
+				VERIFY(zap_update(mos,
+				    spa->spa_pool_props_object, propname,
+				    8, 1, &intval, tx) == 0);
+			} else {
+				ASSERT(0); /* not allowed */
+			}
+
+			/* mirror selected values into the in-core spa_t */
+			switch (prop) {
+			case ZPOOL_PROP_DELEGATION:
+				spa->spa_delegation = intval;
+				break;
+			case ZPOOL_PROP_BOOTFS:
+				spa->spa_bootfs = intval;
+				break;
+			case ZPOOL_PROP_FAILUREMODE:
+				spa->spa_failmode = intval;
+				break;
+			default:
+				break;
+			}
+		}
+
+		/* log internal history if this is not a zpool create */
+		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
+		    tx->tx_txg != TXG_INITIAL) {
+			spa_history_internal_log(LOG_POOL_PROPSET,
+			    spa, tx, cr, "%s %lld %s",
+			    nvpair_name(elem), intval, spa->spa_name);
+		}
+	}
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ *
+ * The spa config lock is held as reader for the duration. Each pass of
+ * the convergence loop syncs the config object, the spare and l2cache
+ * device lists, the error log, the DSL pool, every vdev queued on
+ * spa_vdev_txg_list for this txg, and the sync bplist; the loop ends after
+ * the first pass that finds no dirty vdevs. The vdev configuration is then
+ * written with vdev_config_sync(), the tx is committed, the new config and
+ * uberblock are published, and pending async tasks are dispatched.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ objset_t *mos = spa->spa_meta_objset;
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ vdev_t *svd[SPA_DVAS_PER_BP];
+ int svdcount = 0;
+ dmu_tx_t *tx;
+ int dirty_vdevs;
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ int i;
+
+ /* Stop at the first child whose deflate ratio is not SPA_MINBLOCKSIZE. */
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ /* i reaches vdev_children only if no child broke out of the loop. */
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY(0 == zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ /*
+ * If anything has changed in this txg, push the deferred frees
+ * from the previous txg. If not, leave them alone so that we
+ * don't generate work on an otherwise idle system.
+ */
+ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
+ !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
+ !txg_list_empty(&dp->dp_sync_tasks, txg))
+ spa_sync_deferred_frees(spa, txg);
+
+ /*
+ * Iterate to convergence.
+ */
+ do {
+ spa->spa_sync_pass++;
+
+ spa_sync_config_object(spa, tx);
+ spa_sync_aux_dev(spa, &spa->spa_spares, tx,
+ ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
+ spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
+ ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
+ spa_errlog_sync(spa, txg);
+ dsl_pool_sync(dp, txg);
+
+ /* Count the vdevs synced on this pass; zero ends the loop. */
+ dirty_vdevs = 0;
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
+ vdev_sync(vd, txg);
+ dirty_vdevs++;
+ }
+
+ bplist_sync(bpl, tx);
+ } while (dirty_vdevs);
+
+ bplist_close(bpl);
+
+ dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+
+ /*
+ * Rewrite the vdev configuration (which includes the uberblock)
+ * to commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few
+ * random top-level vdevs that are known to be visible in the
+ * config cache (see spa_vdev_add() for details). If there *are*
+ * dirty vdevs -- or if the sync to our random subset fails --
+ * then sync the uberblock to all vdevs.
+ */
+ if (list_is_empty(&spa->spa_dirty_list)) {
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+ int c;
+
+ /*
+ * Starting at a random child and wrapping around, collect up to
+ * SPA_DVAS_PER_BP children, skipping log devices and any child
+ * whose vdev_ms_array is still 0.
+ */
+ for (c = 0; c < children; c++) {
+ vd = rvd->vdev_child[(c0 + c) % children];
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog)
+ continue;
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_DVAS_PER_BP)
+ break;
+ }
+ }
+ /*
+ * svdcount is still 0 if the dirty list was non-empty (the selection
+ * above was skipped) or no child qualified; in either case, or if the
+ * subset sync fails, sync every top-level vdev and insist it succeeds.
+ */
+ if (svdcount == 0 || vdev_config_sync(svd, svdcount, txg) != 0)
+ VERIFY3U(vdev_config_sync(rvd->vdev_child,
+ rvd->vdev_children, txg), ==, 0);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear the dirty config list.
+ */
+ while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
+ vdev_config_clean(vd);
+
+ /*
+ * Now that the new config has synced transactionally,
+ * let it become visible to the config cache.
+ */
+ if (spa->spa_config_syncing != NULL) {
+ spa_config_set(spa, spa->spa_config_syncing);
+ spa->spa_config_txg = txg;
+ spa->spa_config_syncing = NULL;
+ }
+
+ /*
+ * Make a stable copy of the fully synced uberblock.
+ * We use this as the root for pool traversals.
+ */
+ spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
+
+ spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
+
+ rw_enter(&spa->spa_traverse_lock, RW_WRITER);
+ spa->spa_traverse_wanted = 0;
+ spa->spa_ubsync = spa->spa_uberblock;
+ rw_exit(&spa->spa_traverse_lock);
+
+ spa_scrub_resume(spa); /* resume scrub with new ubsync */
+
+ /*
+ * Clean up the ZIL records for the synced txg.
+ */
+ dsl_pool_zil_clean(dp);
+
+ /*
+ * Update usable space statistics.
+ */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ vdev_sync_done(vd, txg);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since vdev_config_sync().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+ ASSERT(bpl->bpl_queue == NULL);
+
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
+}
+
+/*
+ * Wait for every active pool to sync.  The namespace lock is not held
+ * across the wait itself: a reference is taken on the spa_t, the lock is
+ * dropped for txg_wait_synced(), and both are undone afterwards.
+ */
+void
+spa_sync_allpools(void)
+{
+	spa_t *pool;
+
+	mutex_enter(&spa_namespace_lock);
+	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
+		if (spa_state(pool) == POOL_STATE_ACTIVE) {
+			spa_open_ref(pool, FTAG);
+			mutex_exit(&spa_namespace_lock);
+			txg_wait_synced(spa_get_dsl(pool), 0);
+			mutex_enter(&spa_namespace_lock);
+			spa_close(pool, FTAG);
+		}
+	}
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Remove all pools in the system.
+ *
+ * For each spa_t returned by spa_next(NULL): suspend its async tasks with
+ * spa_namespace_lock dropped, call spa_scrub() with POOL_SCRUB_NONE, unload
+ * and deactivate the pool unless its state is POOL_STATE_UNINITIALIZED,
+ * and finally spa_remove() it. The loop always asks for the first entry
+ * again because the previous one has just been removed.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
+ */
+ /* The reference is taken before the lock is dropped and released after it is re-taken. */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Find the vdev with the given guid, searching from the pool's root vdev.
+ * The result is whatever vdev_lookup_by_guid() returns.
+ */
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid)
+{
+	vdev_t *root = spa->spa_root_vdev;
+
+	return (vdev_lookup_by_guid(root, guid));
+}
+
+/*
+ * Set the pool's version to 'version'. With the config lock held as
+ * writer, the new version is stored in the in-core uberblock and the root
+ * vdev's config is marked dirty; after the lock is dropped the function
+ * blocks in txg_wait_synced() before returning.
+ *
+ * 'version' is expected to be no lower than the current version; this is
+ * only checked by ASSERT.
+ */
+void
+spa_upgrade(spa_t *spa, uint64_t version)
+{
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ /*
+ * This should only be called for a non-faulted pool, and since a
+ * future version would result in an unopenable pool, this shouldn't be
+ * possible.
+ */
+ ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
+ ASSERT(version >= spa->spa_uberblock.ub_version);
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, FTAG);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+/*
+ * Return B_TRUE if 'guid' identifies one of the pool's spares.  Both the
+ * active spare vdevs (sav_vdevs) and the pending spare configs
+ * (sav_pending) are checked; otherwise return B_FALSE.
+ */
+boolean_t
+spa_has_spare(spa_t *spa, uint64_t guid)
+{
+	spa_aux_vdev_t *spares = &spa->spa_spares;
+	uint64_t pending_guid;
+	int idx;
+
+	idx = 0;
+	while (idx < spares->sav_count) {
+		if (spares->sav_vdevs[idx]->vdev_guid == guid)
+			return (B_TRUE);
+		idx++;
+	}
+
+	for (idx = 0; idx < spares->sav_npending; idx++) {
+		/* a pending config without a guid cannot match */
+		if (nvlist_lookup_uint64(spares->sav_pending[idx],
+		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
+			continue;
+		if (pending_guid == guid)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Post a sysevent corresponding to the given event. The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h. The payload will be
+ * filled in from the spa and (optionally) the vdev. This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ *
+ * If any attribute cannot be added or attached, the event is dropped
+ * without being logged; no status is returned to the caller.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+{
+#ifdef _KERNEL
+ sysevent_t *ev;
+ sysevent_attr_list_t *attr = NULL;
+ sysevent_value_t value;
+ sysevent_id_t eid;
+
+ ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
+ SE_SLEEP);
+
+ /* The pool name and pool guid are always part of the payload. */
+ value.value_type = SE_DATA_TYPE_STRING;
+ value.value.sv_string = spa_name(spa);
+ if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
+ goto done;
+
+ value.value_type = SE_DATA_TYPE_UINT64;
+ value.value.sv_uint64 = spa_guid(spa);
+ if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
+ goto done;
+
+ /* The vdev guid, and its path when set, are added only if a vdev was given. */
+ if (vd) {
+ value.value_type = SE_DATA_TYPE_UINT64;
+ value.value.sv_uint64 = vd->vdev_guid;
+ if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
+ SE_SLEEP) != 0)
+ goto done;
+
+ if (vd->vdev_path) {
+ value.value_type = SE_DATA_TYPE_STRING;
+ value.value.sv_string = vd->vdev_path;
+ if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
+ &value, SE_SLEEP) != 0)
+ goto done;
+ }
+ }
+
+ if (sysevent_attach_attributes(ev, attr) != 0)
+ goto done;
+ /* Attached: clear attr so the cleanup below skips sysevent_free_attr(). */
+ attr = NULL;
+
+ (void) log_sysevent(ev, SE_SLEEP, &eid);
+
+done:
+ if (attr)
+ sysevent_free_attr(attr);
+ sysevent_free(ev);
+#endif
+}
diff --git a/zfs/lib/libzpool/spa_boot.c b/zfs/lib/libzpool/spa_boot.c
new file mode 100644
index 000000000..1107b0298
--- /dev/null
+++ b/zfs/lib/libzpool/spa_boot.c
@@ -0,0 +1,198 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)spa_boot.c 1.1 08/04/09 SMI"
+
+#include <sys/spa.h>
+#include <sys/sunddi.h>
+
+/*
+ * Look up the "zfs-bootfs" string property on the root node.  Returns the
+ * string produced by ddi_prop_lookup_string(), or NULL if the lookup does
+ * not report DDI_SUCCESS.
+ */
+char *
+spa_get_bootfs()
+{
+	char *bootfs = NULL;
+	int rc;
+
+	rc = ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
+	    DDI_PROP_DONTPASS, "zfs-bootfs", &bootfs);
+
+	return (rc == DDI_SUCCESS ? bootfs : NULL);
+}
+
+/*
+ * Release 'bootfs' with ddi_prop_free(). Presumably meant for a string
+ * returned by spa_get_bootfs(), which obtains it from
+ * ddi_prop_lookup_string().
+ */
+void
+spa_free_bootfs(char *bootfs)
+{
+ ddi_prop_free(bootfs);
+}
+
+/*
+ * Count the blank-separated device pathnames in devpath_list.  For
+ * example,
+ *
+ *	"/pci@1f,0/ide@d/disk@0,0:a /pci@1f,o/ide@d/disk@2,0:a"
+ *
+ * contains two.  Leading, trailing and repeated blanks are ignored, and
+ * an empty or all-blank list yields zero.
+ */
+static int
+spa_count_devpath(char *devpath_list)
+{
+	const char *cursor;
+	int in_path = 0;
+	int total = 0;
+
+	for (cursor = devpath_list; *cursor != '\0'; cursor++) {
+		if (*cursor == ' ') {
+			in_path = 0;
+		} else if (!in_path) {
+			/* first character of a new pathname */
+			in_path = 1;
+			total++;
+		}
+	}
+
+	return (total);
+}
+
+/*
+ * Only allow booting the device if it has the same vdev information as
+ * the most recently updated vdev (highest txg) and is in a valid state.
+ *
+ * GRUB passes online/active device path names, e.g.
+ * "/pci@1f,0/ide@d/disk@0,0:a /pci@1f,o/ide@d/disk@2,0:a"
+ * to the kernel.  The best vdev should have the same matching online/active
+ * list as what GRUB passes in.
+ *
+ * 'dev' points into devpath_list and may be followed by a blank and further
+ * paths.  It is NUL-terminated only while it is being compared; the blank
+ * is put back before any return, so devpath_list is unmodified on exit.
+ *
+ * Returns 0 if the device may be booted, EINVAL otherwise.
+ */
+static int
+spa_check_devstate(char *devpath_list, char *dev, nvlist_t *conf)
+{
+	nvlist_t *nvtop, **child;
+	uint_t label_path, grub_path, c, children;
+	char *type;
+
+	VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvtop) == 0);
+	VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	/* A single disk only needs to pass validation itself. */
+	if (strcmp(type, VDEV_TYPE_DISK) == 0)
+		return (spa_rootdev_validate(nvtop)? 0 : EINVAL);
+
+	ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
+
+	VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0);
+
+	/*
+	 * Check if the devpath_list is the same as the path list in conf.
+	 * If these two lists are different, then the booting device is not an
+	 * up-to-date device that can be booted.
+	 */
+	label_path = 0;
+	for (c = 0; c < children; c++) {
+		char *physpath;
+
+		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
+		    &physpath) != 0)
+			return (EINVAL);
+
+		if (spa_rootdev_validate(child[c])) {
+			/* every valid child must appear in GRUB's list */
+			if (strstr(devpath_list, physpath) == NULL)
+				return (EINVAL);
+			label_path++;
+		} else {
+			char *blank;
+			int is_bootdev;
+
+			/*
+			 * A child that fails validation must not be the
+			 * device we are booting from.  Restore the blank
+			 * before acting on the comparison so the early
+			 * return cannot leave devpath_list truncated.
+			 */
+			if ((blank = strchr(dev, ' ')) != NULL)
+				*blank = '\0';
+			is_bootdev = (strcmp(physpath, dev) == 0);
+			if (blank != NULL)
+				*blank = ' ';
+			if (is_bootdev)
+				return (EINVAL);
+		}
+	}
+
+	/* The number of valid children must equal the number GRUB listed. */
+	grub_path = spa_count_devpath(devpath_list);
+
+	if (label_path != grub_path)
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * Given a list of vdev physpath names, pick the vdev with the most recent txg,
+ * and return a pointer to the device's physpath within the list and the
+ * device's label configuration. The content of the label would be the most
+ * recent updated information.
+ *
+ * devpath_list is a blank-separated list. Each path is NUL-terminated in
+ * place while it is passed to spa_check_rootconf(), and the blank is
+ * restored afterwards.
+ *
+ * Returns 0 and sets *bestdev and *bestconf on success. Returns EINVAL,
+ * leaving both untouched, if no configuration was found or the chosen
+ * device fails spa_check_devstate().
+ */
+int
+spa_get_rootconf(char *devpath_list, char **bestdev, nvlist_t **bestconf)
+{
+ nvlist_t *conf = NULL;
+ char *dev = NULL;
+ uint64_t txg = 0;
+ char *devpath, *blank;
+
+ devpath = devpath_list;
+ dev = devpath;
+
+ /* skip leading blanks */
+ while (devpath[0] == ' ')
+ devpath++;
+
+ while ((blank = strchr(devpath, ' ')) != NULL) {
+ /* Terminate this path temporarily so it can be examined on its own. */
+ *blank = '\0';
+ spa_check_rootconf(devpath, &dev, &conf, &txg);
+ *blank = ' ';
+
+ /* skip contiguous blanks */
+ while (*blank == ' ')
+ blank++;
+ devpath = blank;
+ }
+
+ /* for the only or the last devpath in the devpath_list */
+ if (strlen(devpath) > 0)
+ spa_check_rootconf(devpath, &dev, &conf, &txg);
+
+ if (conf == NULL)
+ return (EINVAL);
+
+ /*
+ * dev/conf is the vdev with the most recent txg.
+ * Check if the device is in a bootable state.
+ * dev may have a trailing blank since it points to a string
+ * in the devpath_list.
+ */
+ /*
+ * NOTE(review): conf is not released on this failure path. Whether it
+ * needs freeing depends on spa_check_rootconf(), which is not visible
+ * here -- confirm.
+ */
+ if (spa_check_devstate(devpath_list, dev, conf) != 0)
+ return (EINVAL);
+
+ *bestdev = dev;
+ *bestconf = conf;
+ return (0);
+}
diff --git a/zfs/lib/libzpool/spa_config.c b/zfs/lib/libzpool/spa_config.c
new file mode 100644
index 000000000..c22e5e89d
--- /dev/null
+++ b/zfs/lib/libzpool/spa_config.c
@@ -0,0 +1,492 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)spa_config.c 1.15 08/04/01 SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/utsname.h>
+#include <sys/systeminfo.h>
+#include <sys/sunddi.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * Pool configuration is stored as a packed nvlist on the filesystem. By
+ * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
+ * (when the ZFS module is loaded). Pools can also have the 'cachefile'
+ * property set that allows them to be stored in an alternate location under
+ * the control of external software.
+ *
+ * For each cache file, we have a single nvlist which holds all the
+ * configuration information. When the module loads, we read this information
+ * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
+ * maintained independently in spa.c. Whenever the namespace is modified, or
+ * the configuration of a pool is changed, we call spa_config_sync(), which
+ * walks through all the active pools and writes the configuration to disk.
+ */
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_dir = ZPOOL_CACHE_DIR;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ *
+ * The file read is <spa_config_dir>/ZPOOL_CACHE_FILE. Any failure to open,
+ * size, read or unpack it is silent: the function simply returns with the
+ * namespace unchanged.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ spa_t *spa;
+ char pathname[128];
+ struct _buf *file;
+ uint64_t fsize;
+
+ /*
+ * Open the configuration file.
+ */
+ /* The path is prefixed with "./" when rootdir is set. */
+ (void) snprintf(pathname, sizeof (pathname), "%s%s/%s",
+ (rootdir != NULL) ? "./" : "", spa_config_dir, ZPOOL_CACHE_FILE);
+
+ /* Failure to open is tested against (struct _buf *)-1 rather than NULL. */
+ file = kobj_open_file(pathname);
+ if (file == (struct _buf *)-1)
+ return;
+
+ if (kobj_get_filesize(file, &fsize) != 0)
+ goto out;
+
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+
+ /* Entries that are not nvlists are ignored. */
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+
+ /* Skip names that are already present in the namespace. */
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ spa = spa_add(nvpair_name(nvpair), NULL);
+
+ /*
+ * We blindly duplicate the configuration here. If it's
+ * invalid, we will catch it when the pool is first opened.
+ */
+ VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ /* buf is non-NULL only after fsize has been filled in, so the size is valid. */
+ if (buf != NULL)
+ kmem_free(buf, fsize);
+
+ kobj_close_file(file);
+}
+
+/*
+ * Called when destroying or exporting a pool.  Counts the pools in the
+ * namespace that use the given cache file (a NULL dir means the default
+ * one) and, if exactly one does, removes that file right away, since the
+ * pool will no longer be seen when spa_config_sync() iterates.  A dir of
+ * "none" means there is no cache file and nothing is done.
+ */
+void
+spa_config_check(const char *dir, const char *file)
+{
+	char pathname[128];
+	size_t matches = 0;
+	spa_t *pool;
+
+	if (dir != NULL && strcmp(dir, "none") == 0)
+		return;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
+		if (dir == NULL) {
+			/* default cache file: pools with no override */
+			if (pool->spa_config_dir == NULL)
+				matches++;
+			continue;
+		}
+		if (!pool->spa_config_dir ||
+		    strcmp(pool->spa_config_dir, dir) != 0)
+			continue;
+		if (strcmp(pool->spa_config_file, file) == 0)
+			matches++;
+	}
+
+	if (matches != 1)
+		return;
+
+	if (dir == NULL) {
+		dir = spa_config_dir;
+		file = ZPOOL_CACHE_FILE;
+	}
+
+	(void) snprintf(pathname, sizeof (pathname), "%s/%s", dir, file);
+	(void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+}
+
+/*
+ * One cache file to be written by spa_config_sync(), together with the
+ * configs of all pools that belong in it.
+ */
+typedef struct spa_config_entry {
+	/*
+	 * Linkage for the list built in spa_config_sync().  This is the
+	 * member whose offset is handed to list_create(), so it must be a
+	 * list_node_t (the embedded node), not a list_t (the list head).
+	 */
+	list_node_t	sc_link;
+	const char	*sc_dir;	/* cache file directory */
+	const char	*sc_file;	/* cache file name within sc_dir */
+	nvlist_t	*sc_nvl;	/* pool name -> pool config */
+} spa_config_entry_t;
+
+/*
+ * Add spa's cached config to the list entry for its cache file, creating
+ * the entry if this is the first pool seen for that dir/file pair. The
+ * pool is skipped if it has no cached config or no name, or if its cache
+ * directory is the literal string "none".
+ *
+ * The entry's sc_dir and sc_file point at the strings used here; they are
+ * not copied. spa_config_cache_lock is held for the whole operation.
+ */
+static void
+spa_config_entry_add(list_t *listp, spa_t *spa)
+{
+ spa_config_entry_t *entry;
+ const char *dir, *file;
+
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (!spa->spa_config || !spa->spa_name) {
+ mutex_exit(&spa->spa_config_cache_lock);
+ return;
+ }
+
+ /* Use the pool's own cache file if it has one, else the default. */
+ if (spa->spa_config_dir) {
+ dir = spa->spa_config_dir;
+ file = spa->spa_config_file;
+ } else {
+ dir = spa_config_dir;
+ file = ZPOOL_CACHE_FILE;
+ }
+
+ if (strcmp(dir, "none") == 0) {
+ mutex_exit(&spa->spa_config_cache_lock);
+ return;
+ }
+
+ /* Look for an existing entry with the same dir and file. */
+ for (entry = list_head(listp); entry != NULL;
+ entry = list_next(listp, entry)) {
+ if (strcmp(entry->sc_dir, dir) == 0 &&
+ strcmp(entry->sc_file, file) == 0)
+ break;
+ }
+
+ /* entry is NULL here only if the search above ran off the end. */
+ if (entry == NULL) {
+ entry = kmem_alloc(sizeof (spa_config_entry_t), KM_SLEEP);
+ entry->sc_dir = dir;
+ entry->sc_file = file;
+ VERIFY(nvlist_alloc(&entry->sc_nvl, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ list_insert_tail(listp, entry);
+ }
+
+ VERIFY(nvlist_add_nvlist(entry->sc_nvl, spa->spa_name,
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Pack entry->sc_nvl with XDR encoding and write it to
+ * <sc_dir>/<sc_file>. The data goes to the temporary file
+ * <sc_dir>/.<sc_file> first, is fsynced, and is then renamed over the
+ * final name. Failures to open, write, sync or rename are not reported:
+ * the function returns no status.
+ */
+static void
+spa_config_entry_write(spa_config_entry_t *entry)
+{
+ nvlist_t *config = entry->sc_nvl;
+ size_t buflen;
+ char *buf;
+ vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char pathname[128];
+ char pathname2[128];
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+
+ /*
+ * Write the configuration to disk. We need to do the traditional
+ * 'write to temporary file, sync, move over original' to make sure we
+ * always have a consistent view of the data.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/.%s", entry->sc_dir,
+ entry->sc_file);
+
+ if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
+ goto out;
+
+ /* The rename happens only if both the write and the fsync succeed. */
+ if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
+ VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) {
+ (void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
+ entry->sc_dir, entry->sc_file);
+ (void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
+ }
+
+ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
+ VN_RELE(vp);
+
+out:
+ /*
+ * Reached on every path. The temporary name is removed with the result
+ * ignored; after a successful rename it presumably no longer exists.
+ */
+ (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+ kmem_free(buf, buflen);
+}
+
+/*
+ * Synchronize all pools to disk.  This must be called with the namespace
+ * lock held.  Pools are first grouped by cache file, then each cache file
+ * is written once, and finally the config generation is bumped.
+ */
+void
+spa_config_sync(void)
+{
+	list_t files = { 0 };
+	spa_config_entry_t *ent;
+	spa_t *pool;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	list_create(&files, sizeof (spa_config_entry_t),
+	    offsetof(spa_config_entry_t, sc_link));
+
+	/*
+	 * Offer every known pool to the list; spa_config_entry_add()
+	 * decides which ones actually get an entry.
+	 */
+	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool))
+		spa_config_entry_add(&files, pool);
+
+	/* Write out each cache file and tear its entry down. */
+	for (ent = list_head(&files); ent != NULL; ent = list_head(&files)) {
+		spa_config_entry_write(ent);
+		list_remove(&files, ent);
+		nvlist_free(ent->sc_nvl);
+		kmem_free(ent, sizeof (spa_config_entry_t));
+	}
+
+	spa_config_generation++;
+}
+
+/*
+ * Sigh.  Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ *
+ * Returns NULL if *generation already matches the current config
+ * generation.  Otherwise returns a new nvlist mapping pool name to cached
+ * config for every visible pool, and updates *generation.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+	nvlist_t *result;
+	spa_t *pool;
+
+	if (spa_config_generation == *generation)
+		return (NULL);
+
+	VERIFY(nvlist_alloc(&result, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	mutex_enter(&spa_namespace_lock);
+	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
+		/* skip pools this zone is not allowed to see */
+		if (!INGLOBALZONE(curproc) &&
+		    !zone_dataset_visible(spa_name(pool), NULL))
+			continue;
+
+		mutex_enter(&pool->spa_config_cache_lock);
+		VERIFY(nvlist_add_nvlist(result, spa_name(pool),
+		    pool->spa_config) == 0);
+		mutex_exit(&pool->spa_config_cache_lock);
+	}
+	mutex_exit(&spa_namespace_lock);
+
+	*generation = spa_config_generation;
+
+	return (result);
+}
+
+/*
+ * Install 'config' as the pool's cached configuration and free the nvlist
+ * it replaces, if any.  Both steps happen under spa_config_cache_lock.
+ */
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+	nvlist_t *previous;
+
+	mutex_enter(&spa->spa_config_cache_lock);
+	previous = spa->spa_config;
+	spa->spa_config = config;
+	if (previous != NULL)
+		nvlist_free(previous);
+	mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ *
+ * A NULL vd means the root vdev. A txg of -1ULL means "use
+ * spa->spa_config_txg". The spa config lock is expected to be held, as
+ * reader or writer (checked by ASSERT). Returns a newly allocated nvlist.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+ unsigned long hostid = 0;
+
+ ASSERT(spa_config_held(spa, RW_READER) ||
+ spa_config_held(spa, RW_WRITER));
+
+ if (vd == NULL)
+ vd = rvd;
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ /* Pool-wide fields: version, name, state, txg and guid. */
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ spa_name(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ spa_state(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_guid(spa)) == 0);
+ /*
+ * The conversion result is ignored; hostid was initialized to 0 and
+ * is only recorded if it ends up non-zero.
+ */
+ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
+ if (hostid != 0) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
+ hostid) == 0);
+ }
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
+ utsname.nodename) == 0);
+
+ /* Per-vdev fields, added only when vd is not the root vdev. */
+ if (vd != rvd) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ if (vd->vdev_isspare)
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
+ 1ULL) == 0);
+ if (vd->vdev_islog)
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
+ 1ULL) == 0);
+ vd = vd->vdev_top; /* label contains top config */
+ }
+
+ /* nvroot is freed right after being added to config. */
+ nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+
+ return (config);
+}
+
+/*
+ * Config update for a pool that is not a booting root pool: forwards to
+ * spa_config_update_common() with isroot set to FALSE, so the global
+ * config cache is synced as part of the update.
+ */
+void
+spa_config_update(spa_t *spa, int what)
+{
+	const boolean_t booting_root = FALSE;
+
+	spa_config_update_common(spa, what, booting_root);
+}
+
+/*
+ * Update all disk labels, generate a fresh config based on the current
+ * in-core state, and sync the global config cache (do not sync the config
+ * cache if this is a booting rootpool).
+ *
+ * spa_namespace_lock is expected to be held (checked by ASSERT). With
+ * what == SPA_CONFIG_UPDATE_POOL the root vdev is dirtied and, once that
+ * has synced, the function calls itself again with
+ * SPA_CONFIG_UPDATE_VDEVS. For any other value, each top-level vdev whose
+ * vdev_ms_array is still 0 is initialized and dirtied.
+ */
+void
+spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg;
+ int c;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ /* The txg used below is the one after the last synced txg. */
+ txg = spa_last_synced_txg(spa) + 1;
+ if (what == SPA_CONFIG_UPDATE_POOL) {
+ vdev_config_dirty(rvd);
+ } else {
+ /*
+ * If we have top-level vdevs that were added but have
+ * not yet been prepared for allocation, do that now.
+ * (It's safe now because the config cache is up to date,
+ * so it will be able to translate the new DVAs.)
+ * See comments in spa_vdev_add() for full details.
+ */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ if (tvd->vdev_ms_array == 0) {
+ vdev_init(tvd, txg);
+ vdev_config_dirty(tvd);
+ }
+ }
+ }
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Wait for the mosconfig to be regenerated and synced.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ /*
+ * Update the global config cache to reflect the new mosconfig.
+ */
+ if (!isroot)
+ spa_config_sync();
+
+ /* Second phase of a pool update; the recursion is one level deep. */
+ if (what == SPA_CONFIG_UPDATE_POOL)
+ spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot);
+}
diff --git a/zfs/lib/libzpool/spa_errlog.c b/zfs/lib/libzpool/spa_errlog.c
new file mode 100644
index 000000000..162845471
--- /dev/null
+++ b/zfs/lib/libzpool/spa_errlog.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)spa_errlog.c 1.2 06/10/02 SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that it no longer exists, and will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ *
+ * Parsing starts at 'str' and stops at the first character that is not in
+ * [0-9a-f] (the terminating NUL included); '*nptr' is set to point at that
+ * character. Uppercase digits are not accepted and a "0x" prefix is not
+ * skipped. If no digit is consumed the result is 0 and '*nptr' equals 'str'.
+ * There is no overflow detection: the accumulator simply wraps.
+ * Only built when _KERNEL is defined.
+ */
+#ifdef _KERNEL
+static uint64_t
+strtonum(char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ *nptr = str;
+
+ return (val);
+}
+#endif
+
+/*
+ * Convert a bookmark to a string.
+ *
+ * The result is the four fields zb_objset, zb_object, zb_level and zb_blkid,
+ * each cast to u_longlong_t, printed as unpadded lowercase hex and joined by
+ * ':'. At most 'len' bytes (terminating NUL included) are stored in 'buf';
+ * the snprintf() return value is discarded, so truncation is not reported
+ * to the caller.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ *
+ * Parses the "objset:object:level:blkid" lowercase-hex form produced by
+ * bookmark_to_name(), using strtonum() for each field, and fills in 'zb'.
+ * The ':' separators and the terminating NUL are only checked with ASSERT();
+ * no error is returned for a malformed string. The parsed level is cast to
+ * int before being stored. Only the local copy of 'buf' is advanced; the
+ * string itself is not modified. Only built when _KERNEL is defined.
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+ zb->zb_objset = strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+#endif
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ *
+ * The bookmark recorded is the one of zio->io_logical. Nothing is recorded
+ * while the pool's load state is SPA_LOAD_TRYIMPORT. Under
+ * spa_errlist_lock the target tree is chosen (spa_errlist_scrub when a scrub
+ * is active or has finished, spa_errlist_last otherwise); if the bookmark is
+ * already present in that tree the call is a no-op, otherwise a zeroed
+ * spa_error_entry_t is allocated with KM_SLEEP and inserted at the position
+ * returned by avl_find().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+ zbookmark_t *zb = &zio->io_logical->io_bookmark;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ *
+ * The on-disk counts come from zap_count() on spa_errlog_scrub and
+ * spa_errlog_last, read under spa_errlog_lock; an object number of 0 or a
+ * failing zap_count() contributes nothing, and spa_errlog_last is skipped
+ * while spa_scrub_finished is set. The node counts of the two pending
+ * in-core AVL trees are then added under spa_errlist_lock. The two locks
+ * are taken one after the other, never nested, so the total is not an
+ * atomic snapshot.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+/*
+ * Copy every bookmark recorded in the on-disk error log 'obj' out to the
+ * user buffer at 'addr'.
+ *
+ * Each ZAP entry name is decoded with name_to_bookmark() and written to slot
+ * (*count - 1) of the zbookmark_t array at 'addr', after which '*count' is
+ * decremented; the buffer is therefore filled from its end towards its
+ * start and '*count' is left holding the number of unused slots.
+ *
+ * Returns 0 on success (including when 'obj' is 0, i.e. there is no log),
+ * ENOMEM when an entry remains but '*count' has reached 0, and EFAULT when
+ * copyout() fails. The ZAP cursor is finalized on every exit path.
+ */
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	zbookmark_t zb;
+
+	if (obj == 0)
+		return (0);
+
+	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		if (*count == 0) {
+			zap_cursor_fini(&zc);
+			return (ENOMEM);
+		}
+
+		name_to_bookmark(za.za_name, &zb);
+
+		if (copyout(&zb, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_t),
+		    sizeof (zbookmark_t)) != 0) {
+			/* release the cursor here too, as on ENOMEM */
+			zap_cursor_fini(&zc);
+			return (EFAULT);
+		}
+
+		*count -= 1;
+	}
+
+	zap_cursor_fini(&zc);
+
+	return (0);
+}
+
+/*
+ * Copy the bookmark of every entry in the in-core error tree 'list' out to
+ * the user buffer at 'addr'.
+ *
+ * Entries are visited in tree order; each one lands in slot (*count - 1) of
+ * the zbookmark_t array at 'addr' and '*count' is then decremented.
+ * Returns ENOMEM when an entry remains but '*count' is 0, EFAULT when
+ * copyout() fails, and 0 once the whole tree has been copied.
+ */
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+	spa_error_entry_t *entry = avl_first(list);
+
+	while (entry != NULL) {
+		char *slot;
+
+		/* the user buffer is full */
+		if (*count == 0)
+			return (ENOMEM);
+
+		slot = (char *)addr + (*count - 1) * sizeof (zbookmark_t);
+		if (copyout(&entry->se_bookmark, slot,
+		    sizeof (zbookmark_t)) != 0)
+			return (EFAULT);
+
+		*count -= 1;
+		entry = AVL_NEXT(list, entry);
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ *
+ * 'uaddr' is the user buffer and '*count' the number of zbookmark_t slots
+ * still free in it; the helpers fill slots from the end and decrement
+ * '*count'. Processing stops at the first non-zero return from a helper
+ * and that value is returned. The on-disk last log is skipped while
+ * spa_scrub_finished is set. Without _KERNEL the body is compiled out and
+ * the function returns 0 without touching its arguments.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit which tells which AVL
+ * tree to add new errors. spa_errlog_sync() is responsible for actually
+ * syncing the changes to the underlying objects.
+ *
+ * The bit is spa_scrub_finished, set to B_TRUE under spa_errlist_lock; it is
+ * asserted to be clear on entry.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+
+ ASSERT(!spa->spa_scrub_finished);
+ spa->spa_scrub_finished = B_TRUE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ *
+ * Both spa_errlist_last and spa_errlist_scrub are emptied with
+ * avl_destroy_nodes() under spa_errlist_lock and every entry is freed. The
+ * on-disk log objects are not touched here.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ *
+ * An empty tree 't' is a no-op. Otherwise the ZAP object '*obj' is created
+ * first if it is still 0 (the new object number is stored back through
+ * 'obj'), each entry is written with zap_update() keyed by its bookmark
+ * string with se_name (or "" when se_name is NULL) as the value, and
+ * finally every node is destroyed and freed, leaving 't' empty.
+ * zap_update() failures are ignored.
+ *
+ * NOTE(review): four full-width 64-bit hex fields plus separators and the
+ * NUL need 68 bytes, while 'buf' holds 64; bookmark_to_name() would
+ * truncate silently in that case -- confirm the field ranges rule it out.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ *
+ * 'txg' is the transaction group the updates are assigned to. The function
+ * returns early, with nothing written, when both pending lists are empty
+ * and no scrub has finished. Otherwise the lists are handed over through
+ * spa_get_errlists() and spa_scrub_finished is cleared while
+ * spa_errlist_lock is held. The pending 'last' errors are written first;
+ * when a scrub has finished, the last-log object is then freed, the scrub
+ * log object becomes the last log, and the pending scrub errors are written
+ * into it. Both log object numbers are rewritten in the pool directory
+ * before the tx is committed; the zap_update() results are ignored.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
diff --git a/zfs/lib/libzpool/spa_history.c b/zfs/lib/libzpool/spa_history.c
new file mode 100644
index 000000000..0fa6411a1
--- /dev/null
+++ b/zfs/lib/libzpool/spa_history.c
@@ -0,0 +1,421 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)spa_history.c 1.5 07/07/09 SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/utsname.h>
+#include <sys/cmn_err.h>
+#include <sys/sunddi.h>
+#ifdef _KERNEL
+#include <sys/zone.h>
+#endif
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
+
+/*
+ * convert a logical offset to physical
+ *
+ * The region between sh_pool_create_len and sh_phys_max_off is treated as a
+ * ring: the part of 'log_off' past sh_pool_create_len is reduced modulo the
+ * ring length and then rebased onto sh_pool_create_len.
+ * NOTE(review): the modulus is zero if sh_phys_max_off equals
+ * sh_pool_create_len and nothing here guards against that -- confirm that
+ * callers cannot reach that state.
+ */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+ uint64_t phys_len;
+
+ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+ return ((log_off - shpp->sh_pool_create_len) % phys_len
+ + shpp->sh_pool_create_len);
+}
+
+/*
+ * Allocate the pool's history object in the MOS, register it in the pool
+ * directory under DMU_POOL_HISTORY, and record the physical size of the log
+ * in the object's bonus buffer.
+ *
+ * The pool must not have a history object yet (spa_history is asserted to be
+ * 0); on return spa->spa_history holds the new object number. All changes
+ * are made as part of 'tx'.
+ */
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+	objset_t *mos = spa->spa_meta_objset;
+	spa_history_phys_t *phys;
+	dmu_buf_t *bonus;
+
+	ASSERT(spa->spa_history == 0);
+	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+	    SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+	    sizeof (spa_history_phys_t), tx);
+
+	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+	    &spa->spa_history, tx) == 0);
+
+	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &bonus));
+	ASSERT(bonus->db_size >= sizeof (spa_history_phys_t));
+
+	phys = bonus->db_data;
+	dmu_buf_will_dirty(bonus, tx);
+
+	/*
+	 * The log gets 1% of the pool's space, clamped to the range
+	 * [128KB, 32MB].
+	 */
+	phys->sh_phys_max_off = spa_get_dspace(spa) / 100;
+	phys->sh_phys_max_off = MIN(phys->sh_phys_max_off, 32<<20);
+	phys->sh_phys_max_off = MAX(phys->sh_phys_max_off, 128<<10);
+
+	dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ *
+ * Reads the little-endian uint64_t length prefix of the record at the
+ * current logical BOF. The prefix may straddle the physical end of the
+ * log, in which case the remaining bytes are read from sh_pool_create_len.
+ * On success sh_bof is advanced past the prefix and the record body and
+ * sh_records_lost is incremented. Returns 0, or the dmu_read() error, in
+ * which case 'shpp' is left unmodified.
+ *
+ * The prefix bytes are read straight into 'reclen' through a char pointer
+ * and not into a separate char array that is later dereferenced as a
+ * uint64_t; a char array carries no 8-byte alignment guarantee.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t firstread, reclen, phys_bof;
+	char *buf = (char *)&reclen;
+	int err;
+
+	phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+	    buf)) != 0)
+		return (err);
+	if (firstread != sizeof (reclen)) {
+		/* the prefix wraps: fetch the rest from the ring start */
+		if ((err = dmu_read(mos, spa->spa_history,
+		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+		    buf + firstread)) != 0)
+			return (err);
+	}
+
+	reclen = LE_64(reclen);
+	shpp->sh_bof += reclen + sizeof (reclen);
+	shpp->sh_records_lost++;
+	return (0);
+}
+
+/*
+ * Append 'len' bytes from 'buf' to the history log at the logical EOF.
+ *
+ * The caller must hold spa_history_lock (asserted). Old records are dropped
+ * by advancing sh_bof, one record at a time, until the free space in the
+ * ring (physical size minus the 'zpool create' region minus the bytes
+ * between sh_bof and sh_eof) exceeds 'len'. sh_eof is then advanced by
+ * 'len' and the data written at the physical EOF; whatever does not fit
+ * before sh_phys_max_off wraps around to sh_pool_create_len.
+ *
+ * Returns 0, or the error from spa_history_advance_bof(), in which case no
+ * data has been written.
+ */
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+    dmu_tx_t *tx)
+{
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t phys_eof, head, tail;
+	int err;
+
+	ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+	/* see if we need to reset logical BOF */
+	for (;;) {
+		if (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+		    (shpp->sh_eof - shpp->sh_bof) > len)
+			break;
+
+		err = spa_history_advance_bof(spa, shpp);
+		if (err != 0)
+			return (err);
+	}
+
+	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+	head = MIN(len, shpp->sh_phys_max_off - phys_eof);
+	tail = len - head;
+	shpp->sh_eof += len;
+
+	dmu_write(mos, spa->spa_history, phys_eof, head, buf, tx);
+
+	/* write out the rest at the beginning of physical file */
+	if (tail > 0) {
+		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+		    tail, (char *)buf + head, tx);
+	}
+
+	return (0);
+}
+
+/*
+ * Return the zone name to record with a history entry.
+ *
+ * In the kernel this is the zone_name of the current process's zone; in
+ * userland builds it is always "global". The returned string is not
+ * allocated here, so callers must not free or modify it.
+ */
+static char *
+spa_history_zone(void)
+{
+#ifdef _KERNEL
+	return (curproc->p_zone->zone_name);
+#else
+	return ("global");
+#endif
+}
+
+/*
+ * Write out a history event.
+ *
+ * Sync-task style callback: 'arg1' is the spa_t, 'arg2' a history_arg_t
+ * describing the event, 'cr' supplies the uid recorded as ZPOOL_HIST_WHO,
+ * and 'tx' is the transaction the writes are made in.
+ *
+ * The history object is created first if the pool does not have one. The
+ * record is an nvlist holding the time, the uid, the zone (only when
+ * ha_zone is non-empty), the host name (kernel builds only), and then
+ * either the command string (LOG_CMD_POOL_CREATE / LOG_CMD_NORMAL) or, for
+ * any other log type, the internal event number, the txg and the event
+ * string. It is packed with XDR encoding and appended to the log preceded
+ * by its packed length as a little-endian uint64_t. For
+ * LOG_CMD_POOL_CREATE, sh_pool_create_len grows by the size of the new
+ * record and sh_bof is set to it.
+ *
+ * A failure from spa_history_write() is not reported to the caller; it only
+ * suppresses the rest of the record and the create-length update. For
+ * LOG_INTERNAL events this function frees both 'hap' and its history
+ * string, the latter with size HIS_MAX_RECORD_LEN.
+ */
+static void
+spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ spa_t *spa = arg1;
+ history_arg_t *hap = arg2;
+ const char *history_str = hap->ha_history_str;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ size_t reclen;
+ uint64_t le_len;
+ nvlist_t *nvrecord;
+ char *record_packed = NULL;
+ int ret;
+
+ /*
+ * If we have an older pool that doesn't have a command
+ * history object, create it now.
+ */
+ mutex_enter(&spa->spa_history_lock);
+ if (!spa->spa_history)
+ spa_history_create_obj(spa, tx);
+ mutex_exit(&spa->spa_history_lock);
+
+ /*
+ * Get the offset of where we need to write via the bonus buffer.
+ * Update the offset when the write completes.
+ */
+ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ shpp = dbp->db_data;
+
+ dmu_buf_will_dirty(dbp, tx);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
+ gethrestime_sec()) == 0);
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO,
+ (uint64_t)crgetuid(cr)) == 0);
+ if (hap->ha_zone[0] != '\0')
+ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
+ hap->ha_zone) == 0);
+#ifdef _KERNEL
+ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST,
+ utsname.nodename) == 0);
+#endif
+ if (hap->ha_log_type == LOG_CMD_POOL_CREATE ||
+ hap->ha_log_type == LOG_CMD_NORMAL) {
+ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
+ history_str) == 0);
+ } else {
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
+ hap->ha_event) == 0);
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG,
+ tx->tx_txg) == 0);
+ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
+ history_str) == 0);
+ }
+
+ VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
+ record_packed = kmem_alloc(reclen, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_history_lock);
+ if (hap->ha_log_type == LOG_CMD_POOL_CREATE)
+ VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
+
+ /* write out the packed length as little endian */
+ le_len = LE_64((uint64_t)reclen);
+ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
+ if (!ret)
+ ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
+
+ if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) {
+ shpp->sh_pool_create_len += sizeof (le_len) + reclen;
+ shpp->sh_bof = shpp->sh_pool_create_len;
+ }
+
+ mutex_exit(&spa->spa_history_lock);
+ nvlist_free(nvrecord);
+ kmem_free(record_packed, reclen);
+ dmu_buf_rele(dbp, FTAG);
+
+ if (hap->ha_log_type == LOG_INTERNAL) {
+ kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN);
+ kmem_free(hap, sizeof (history_arg_t));
+ }
+}
+
+/*
+ * Write out a history event.
+ *
+ * Entry point for command history: 'what' must not be LOG_INTERNAL
+ * (asserted). The history_arg_t lives on this function's stack and points
+ * at the caller's 'history_str'; it is handed to spa_history_log_sync()
+ * through dsl_sync_task_do(), whose return value is returned. ha_zone is
+ * filled from spa_history_zone(). ha_event is left unset;
+ * spa_history_log_sync() reads it only when the log type is neither
+ * LOG_CMD_POOL_CREATE nor LOG_CMD_NORMAL.
+ * NOTE(review): passing a stack-allocated argument presumably relies on
+ * dsl_sync_task_do() not returning before the callback has run -- confirm.
+ */
+int
+spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
+{
+ history_arg_t ha;
+
+ ASSERT(what != LOG_INTERNAL);
+
+ ha.ha_history_str = history_str;
+ ha.ha_log_type = what;
+ (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone));
+ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
+ spa, &ha, 0));
+}
+
+/*
+ * Read out the command history.
+ *
+ * offp: in: logical offset to start reading at; out: the offset the
+ * consumer should use next.
+ * len: in: number of bytes the caller is prepared to receive in 'buf';
+ * out: number of bytes the read was sized to (read_len + leftover).
+ * buf: destination for the raw log bytes.
+ *
+ * Offsets below sh_pool_create_len read only from the 'zpool create'
+ * region. Any other offset is first raised to sh_bof if it has fallen
+ * behind it and is then mapped to a physical offset; a read that runs into
+ * the physical end of the log continues from sh_pool_create_len (the
+ * 'leftover' part).
+ *
+ * Returns ENOENT if the pool has no history object, the error from
+ * dmu_bonus_hold() or dmu_read() if one fails, and 0 otherwise. '*offp'
+ * and '*len' are updated before the reads are issued, so they are modified
+ * even when dmu_read() fails. When read_len is 0 the function returns 0
+ * without calling dmu_read().
+ */
+int
+spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ uint64_t read_len, phys_read_off, phys_eof;
+ uint64_t leftover = 0;
+ spa_history_phys_t *shpp;
+ int err;
+
+ /*
+ * If the command history doesn't exist (older pool),
+ * that's ok, just return ENOENT.
+ */
+ if (!spa->spa_history)
+ return (ENOENT);
+
+ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
+ return (err);
+ shpp = dbp->db_data;
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ mutex_enter(&spa->spa_history_lock);
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+
+ if (*offp < shpp->sh_pool_create_len) {
+ /* read in just the zpool create history */
+ phys_read_off = *offp;
+ read_len = MIN(*len, shpp->sh_pool_create_len -
+ phys_read_off);
+ } else {
+ /*
+ * Need to reset passed in offset to BOF if the passed in
+ * offset has since been overwritten.
+ */
+ *offp = MAX(*offp, shpp->sh_bof);
+ phys_read_off = spa_history_log_to_phys(*offp, shpp);
+
+ /*
+ * Read up to the minimum of what the user passed down or
+ * the EOF (physical or logical). If we hit physical EOF,
+ * use 'leftover' to read from the physical BOF.
+ */
+ if (phys_read_off <= phys_eof) {
+ read_len = MIN(*len, phys_eof - phys_read_off);
+ } else {
+ read_len = MIN(*len,
+ shpp->sh_phys_max_off - phys_read_off);
+ if (phys_read_off + *len > shpp->sh_phys_max_off) {
+ leftover = MIN(*len - read_len,
+ phys_eof - shpp->sh_pool_create_len);
+ }
+ }
+ }
+
+ /* offset for consumer to use next */
+ *offp += read_len + leftover;
+
+ /* tell the consumer how much you actually read */
+ *len = read_len + leftover;
+
+ if (read_len == 0) {
+ mutex_exit(&spa->spa_history_lock);
+ dmu_buf_rele(dbp, FTAG);
+ return (0);
+ }
+
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
+ if (leftover && err == 0) {
+ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
+ leftover, buf + read_len);
+ }
+ mutex_exit(&spa->spa_history_lock);
+
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+}
+
+/*
+ * Record an internal (LOG_INTERNAL) history event.
+ *
+ * The printf-style 'fmt' and its arguments are formatted into a freshly
+ * allocated buffer of HIS_MAX_RECORD_LEN bytes (vsnprintf() bounds the
+ * output to that size). The event is described by a heap-allocated
+ * history_arg_t with an empty zone name. If 'tx' is already syncing the
+ * record is written immediately by calling spa_history_log_sync();
+ * otherwise it is queued with dsl_sync_task_do_nowait().
+ */
+void
+spa_history_internal_log(history_internal_events_t event, spa_t *spa,
+    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+{
+	history_arg_t *hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+	char *msg = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) vsnprintf(msg, HIS_MAX_RECORD_LEN, fmt, ap);
+	va_end(ap);
+
+	hap->ha_zone[0] = '\0';
+	hap->ha_event = event;
+	hap->ha_history_str = msg;
+	hap->ha_log_type = LOG_INTERNAL;
+
+	/* spa_history_log_sync() will free hap and msg */
+	if (!dmu_tx_is_syncing(tx)) {
+		dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
+		    spa_history_log_sync, spa, hap, 0, tx);
+		return;
+	}
+
+	spa_history_log_sync(spa, hap, cr, tx);
+}
diff --git a/zfs/lib/libzpool/spa_misc.c b/zfs/lib/libzpool/spa_misc.c
new file mode 100644
index 000000000..4ec538038
--- /dev/null
+++ b/zfs/lib/libzpool/spa_misc.c
@@ -0,0 +1,1280 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)spa_misc.c 1.31 08/04/01 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+#include <sys/metaslab_impl.h>
+#include "zfs_prop.h"
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - add/remove/attach/detach devices
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against SPA_MINREF, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock (per-spa read-priority rwlock)
+ *
+ * This protects the spa_t from config changes, and must be held in
+ * the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * spa_config_cache_lock (per-spa mutex)
+ *
+ * This mutex prevents the spa_config nvlist from being updated. No
+ * other locks are required to obtain this lock, although implicitly you
+ * must have the namespace lock or non-zero refcount to have any kind
+ * of spa_t pointer at all.
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_namespace_lock -> spa_config_lock
+ *
+ * The namespace lock must always be taken before the config lock.
+ *
+ *
+ * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
+ * are globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which require
+ * the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ * spa_guid_exists() Determine whether a pool/device guid exists.
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock is a form of rwlock. It must be held as RW_READER
+ * to perform I/O to the pool, and as RW_WRITER to change the vdev config.
+ * The spa_config_lock is manipulated with spa_config_{enter,exit,held}().
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
+ *
+ * The spa_name() function also requires either the spa_namespace_lock
+ * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
+ * also implemented within this file since it requires manipulation of the
+ * namespace.
+ */
+
+static avl_tree_t spa_namespace_avl; /* every spa_t, ordered by spa_name */
+kmutex_t spa_namespace_lock; /* guards the namespace; see "SPA locking" */
+static kcondvar_t spa_namespace_cv; /* broadcast by spa_remove() */
+static int spa_active_count; /* spa_t's added with an altroot; see spa_busy() */
+int spa_max_replication_override = SPA_DVAS_PER_BP; /* cap used by spa_max_replication() */
+
+static kmutex_t spa_spare_lock; /* protects spa_spare_avl */
+static avl_tree_t spa_spare_avl; /* spa_aux_t per spare, keyed by guid */
+static kmutex_t spa_l2cache_lock; /* protects spa_l2cache_avl */
+static avl_tree_t spa_l2cache_avl; /* spa_aux_t per cache device, keyed by guid */
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode; /* mode argument passed to spa_init() */
+
+#ifdef ZFS_DEBUG
+/* Everything except dprintf is on by default in debug builds */
+int zfs_flags = ~ZFS_DEBUG_DPRINTF;
+#else
+int zfs_flags = 0;
+#endif
+
+/*
+ * zfs_recover can be set to nonzero to attempt to recover from
+ * otherwise-fatal errors, typically caused by on-disk corruption. When
+ * set, calls to zfs_panic_recover() will turn into warning messages.
+ */
+int zfs_recover = 0;
+
+#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+/*
+ * Prepare a config lock for first use: no writer, an empty hold count, and
+ * a freshly initialized mutex / condition variable pair.
+ */
+static void
+spa_config_lock_init(spa_config_lock_t *scl)
+{
+	scl->scl_writer = NULL;
+	refcount_create(&scl->scl_count);
+
+	mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+}
+
+/*
+ * Tear down a config lock.  Nobody may own the write side at this point.
+ */
+static void
+spa_config_lock_destroy(spa_config_lock_t *scl)
+{
+	ASSERT(scl->scl_writer == NULL);
+
+	refcount_destroy(&scl->scl_count);
+	cv_destroy(&scl->scl_cv);
+	mutex_destroy(&scl->scl_lock);
+}
+
+/*
+ * Take the pool's config lock as a reader (RW_READER) or as a writer (any
+ * other value), recording 'tag' as the holder.
+ *
+ * A reader waits only while some other thread owns the write side.  A
+ * writer waits until the hold count reaches zero.  In either mode the
+ * thread that already owns the write side passes straight through.  Every
+ * successful call adds one hold to scl_count.
+ */
+void
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
+{
+	spa_config_lock_t *lock = &spa->spa_config_lock;
+
+	mutex_enter(&lock->scl_lock);
+
+	if (rw != RW_READER) {
+		while (lock->scl_writer != curthread &&
+		    !refcount_is_zero(&lock->scl_count))
+			cv_wait(&lock->scl_cv, &lock->scl_lock);
+		lock->scl_writer = curthread;
+	} else {
+		while (lock->scl_writer != curthread &&
+		    lock->scl_writer != NULL)
+			cv_wait(&lock->scl_cv, &lock->scl_lock);
+	}
+
+	(void) refcount_add(&lock->scl_count, tag);
+
+	mutex_exit(&lock->scl_lock);
+}
+
+/*
+ * Drop one hold (identified by 'tag') on the pool's config lock.  When the
+ * last hold goes away all waiters are woken and write ownership is cleared.
+ */
+void
+spa_config_exit(spa_t *spa, void *tag)
+{
+	spa_config_lock_t *lock = &spa->spa_config_lock;
+
+	mutex_enter(&lock->scl_lock);
+
+	ASSERT(!refcount_is_zero(&lock->scl_count));
+
+	if (refcount_remove(&lock->scl_count, tag) != 0) {
+		/* Other holds remain; nothing to hand over yet. */
+		mutex_exit(&lock->scl_lock);
+		return;
+	}
+
+	cv_broadcast(&lock->scl_cv);
+	ASSERT(lock->scl_writer == NULL || lock->scl_writer == curthread);
+	lock->scl_writer = NULL;	/* harmless if it was already NULL */
+
+	mutex_exit(&lock->scl_lock);
+}
+
+/*
+ * Report whether the config lock is held.  For RW_READER this is true when
+ * the lock has any holds at all (the check does not identify the holder);
+ * for any other mode it is true only when the calling thread is the writer.
+ * scl_lock is not taken for the check.
+ */
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+	spa_config_lock_t *lock = &spa->spa_config_lock;
+
+	if (rw != RW_READER)
+		return (lock->scl_writer == curthread);
+
+	return (!refcount_is_zero(&lock->scl_count));
+}
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ *
+ * 'name' may be a bare pool name or a full dataset/snapshot name; only the
+ * part before the first '/' or '@' is used for the search.  The caller's
+ * string is never modified: the pool component is copied into a temporary
+ * buffer, so 'name' may live in read-only memory or be read concurrently by
+ * other threads.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+	spa_t search, *spa;
+	avl_index_t where;
+	const char *cp;
+	char *pool;
+	size_t len;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	/*
+	 * If it's a full dataset name, figure out the pool name and
+	 * just use that.
+	 */
+	cp = strpbrk(name, "/@");
+	len = (cp != NULL) ? (size_t)(cp - name) : strlen(name);
+
+	pool = kmem_alloc(len + 1, KM_SLEEP);
+	bcopy(name, pool, len);
+	pool[len] = '\0';
+
+	search.spa_name = pool;
+	spa = avl_find(&spa_namespace_avl, &search, &where);
+
+	/* Free with the allocated size; spa_strfree() would recompute it. */
+	kmem_free(pool, len + 1);
+
+	return (spa);
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ *
+ * The new spa_t is zero-filled, its locks and condition variables are
+ * initialized, 'name' is duplicated into spa_name, and the structure is
+ * inserted into spa_namespace_avl in the POOL_STATE_UNINITIALIZED state.
+ * When 'altroot' is non-NULL it is duplicated into spa_root and
+ * spa_active_count is incremented. Returns the new spa_t.
+ */
+spa_t *
+spa_add(const char *name, const char *altroot)
+{
+ spa_t *spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
+
+ mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+
+ spa->spa_name = spa_strdup(name);
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ spa->spa_freeze_txg = UINT64_MAX; /* spa_freeze() treats this as "not frozen" */
+ spa->spa_final_txg = UINT64_MAX;
+
+ refcount_create(&spa->spa_refcount);
+ spa_config_lock_init(&spa->spa_config_lock);
+
+ avl_add(&spa_namespace_avl, spa); /* keyed by spa_name, set above */
+
+ mutex_init(&spa->spa_zio_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Set the alternate root, if there is one. Pools with an alternate
+ * root are the ones counted in spa_active_count.
+ */
+ if (altroot) {
+ spa->spa_root = spa_strdup(altroot);
+ spa_active_count++;
+ }
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ *
+ * The pool must be in POOL_STATE_UNINITIALIZED with no scrub thread. The
+ * spa_t is unlinked from spa_namespace_avl and spa_namespace_cv is
+ * broadcast; its strings (root, name, config dir/file) are freed, the cached
+ * config is cleared via spa_config_set(spa, NULL), every lock and condition
+ * variable set up by spa_add() is destroyed, and finally the spa_t itself
+ * is freed. 'spa' must not be used after this returns.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT(spa->spa_scrub_thread == NULL);
+
+ avl_remove(&spa_namespace_avl, spa);
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root) {
+ spa_strfree(spa->spa_root);
+ spa_active_count--; /* undoes the increment in spa_add() */
+ }
+
+ if (spa->spa_name)
+ spa_strfree(spa->spa_name);
+
+ if (spa->spa_config_dir)
+ spa_strfree(spa->spa_config_dir);
+ if (spa->spa_config_file)
+ spa_strfree(spa->spa_config_file);
+
+ spa_config_set(spa, NULL);
+
+ refcount_destroy(&spa->spa_refcount);
+
+ spa_config_lock_destroy(&spa->spa_config_lock);
+
+ rw_destroy(&spa->spa_traverse_lock);
+
+ cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_scrub_cv);
+ cv_destroy(&spa->spa_scrub_io_cv);
+
+ mutex_destroy(&spa->spa_uberblock_lock);
+ mutex_destroy(&spa->spa_async_lock);
+ mutex_destroy(&spa->spa_config_cache_lock);
+ mutex_destroy(&spa->spa_scrub_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_errlist_lock);
+ mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
+ mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_zio_lock);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Namespace iterator.  With prev == NULL the first pool in the AVL tree is
+ * returned; otherwise the pool that follows 'prev'.  NULL marks the end.
+ * The spa_namespace_lock must be held.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	if (prev == NULL)
+		return (avl_first(&spa_namespace_avl));
+
+	return (AVL_NEXT(&spa_namespace_avl, prev));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Take an additional reference on 'spa' on behalf of 'tag'.  The caller
+ * either holds spa_namespace_lock or the pool already has more than
+ * SPA_MINREF references.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    refcount_count(&spa->spa_refcount) > SPA_MINREF);
+
+	(void) refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Release the reference on 'spa' that was taken with 'tag'.  The caller
+ * either holds spa_namespace_lock or the pool has more than SPA_MINREF
+ * references.  The spa_t itself is neither freed nor removed here.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    refcount_count(&spa->spa_refcount) > SPA_MINREF);
+
+	(void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Tell whether the pool has no external users.  "Zero" here really means
+ * SPA_MINREF, the number of references acquired when opening a pool.
+ * Must be called with spa_namespace_lock held.
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+	boolean_t idle;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	idle = (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+	return (idle);
+}
+
+/*
+ * ==========================================================================
+ * SPA spare and l2cache tracking
+ * ==========================================================================
+ */
+
+/*
+ * Hot spares and cache devices are tracked using the same code below,
+ * for 'auxiliary' devices.
+ */
+
+typedef struct spa_aux {
+ uint64_t aux_guid; /* vdev guid; the AVL sort key */
+ uint64_t aux_pool; /* guid of the pool that activated it, 0 if none */
+ avl_node_t aux_avl; /* linkage in spa_spare_avl / spa_l2cache_avl */
+ int aux_count; /* number of spa_aux_add() calls outstanding */
+} spa_aux_t;
+
+/*
+ * AVL comparator for spa_aux_t nodes: orders by aux_guid and yields
+ * -1, 0 or 1.
+ */
+static int
+spa_aux_compare(const void *a, const void *b)
+{
+	const spa_aux_t *lhs = a;
+	const spa_aux_t *rhs = b;
+
+	if (lhs->aux_guid == rhs->aux_guid)
+		return (0);
+
+	return (lhs->aux_guid < rhs->aux_guid ? -1 : 1);
+}
+
+/*
+ * Register vdev 'vd' in the auxiliary-device tree 'avl'.  An existing entry
+ * for the same guid just gains a reference; otherwise a new zeroed entry
+ * with a count of one is inserted.
+ */
+void
+spa_aux_add(vdev_t *vd, avl_tree_t *avl)
+{
+	spa_aux_t key, *entry;
+	avl_index_t where;
+
+	key.aux_guid = vd->vdev_guid;
+	entry = avl_find(avl, &key, &where);
+
+	if (entry != NULL) {
+		entry->aux_count++;
+		return;
+	}
+
+	entry = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
+	entry->aux_guid = vd->vdev_guid;
+	entry->aux_count = 1;
+	avl_insert(avl, entry, where);
+}
+
+/*
+ * Drop one reference on the entry for 'vd' in 'avl'.  The entry is unlinked
+ * and freed when its count reaches zero.  If references remain and the entry
+ * was activated by vd's pool, the active-pool guid is cleared.
+ */
+void
+spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
+{
+	spa_aux_t key, *entry;
+	avl_index_t where;
+
+	key.aux_guid = vd->vdev_guid;
+	entry = avl_find(avl, &key, &where);
+
+	ASSERT(entry != NULL);
+
+	entry->aux_count--;
+	if (entry->aux_count == 0) {
+		avl_remove(avl, entry);
+		kmem_free(entry, sizeof (spa_aux_t));
+		return;
+	}
+
+	if (entry->aux_pool == spa_guid(vd->vdev_spa))
+		entry->aux_pool = 0ULL;
+}
+
+/*
+ * Check whether a device with the given guid is present in 'avl'.  When
+ * 'pool' is non-NULL it receives the guid of the pool that activated the
+ * device, or 0 if the device is not found.
+ */
+boolean_t
+spa_aux_exists(uint64_t guid, uint64_t *pool, avl_tree_t *avl)
+{
+	spa_aux_t key, *entry;
+	avl_index_t where;
+
+	key.aux_guid = guid;
+	entry = avl_find(avl, &key, &where);
+
+	if (pool != NULL)
+		*pool = (entry != NULL) ? entry->aux_pool : 0ULL;
+
+	return (entry != NULL);
+}
+
+/*
+ * Mark the entry for 'vd' in 'avl' as in use by vd's pool.  The entry must
+ * exist and must not already be claimed by a pool.
+ */
+void
+spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
+{
+	spa_aux_t key, *entry;
+	avl_index_t where;
+
+	key.aux_guid = vd->vdev_guid;
+	entry = avl_find(avl, &key, &where);
+
+	ASSERT(entry != NULL);
+	ASSERT(entry->aux_pool == 0ULL);
+
+	entry->aux_pool = spa_guid(vd->vdev_spa);
+}
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
+ * another pool.
+ * - A spare in use in any pool can only be the source of a replacement if
+ * the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, then we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
+
+/* AVL comparator for spa_spare_avl; defers to spa_aux_compare(). */
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+	int order = spa_aux_compare(a, b);
+
+	return (order);
+}
+
+/*
+ * Register 'vd' as a spare: add it to the global spare tree and set
+ * vdev_isspare, all under spa_spare_lock.  The vdev must not already be
+ * flagged as a spare.
+ */
+void
+spa_spare_add(vdev_t *vd)
+{
+	mutex_enter(&spa_spare_lock);
+
+	ASSERT(!vd->vdev_isspare);
+	spa_aux_add(vd, &spa_spare_avl);
+	vd->vdev_isspare = B_TRUE;
+
+	mutex_exit(&spa_spare_lock);
+}
+
+/*
+ * Undo spa_spare_add(): drop vd's reference in the global spare tree and
+ * clear vdev_isspare, all under spa_spare_lock.
+ */
+void
+spa_spare_remove(vdev_t *vd)
+{
+	mutex_enter(&spa_spare_lock);
+
+	ASSERT(vd->vdev_isspare);
+	spa_aux_remove(vd, &spa_spare_avl);
+	vd->vdev_isspare = B_FALSE;
+
+	mutex_exit(&spa_spare_lock);
+}
+
+/*
+ * Locked wrapper around spa_aux_exists() for the spare tree.  Returns
+ * whether 'guid' is a known spare and, if 'pool' is non-NULL, reports the
+ * guid of the pool actively using it (0 if none).
+ */
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool)
+{
+	boolean_t known;
+
+	mutex_enter(&spa_spare_lock);
+	known = spa_aux_exists(guid, pool, &spa_spare_avl);
+	mutex_exit(&spa_spare_lock);
+
+	return (known);
+}
+
+/*
+ * Record, under spa_spare_lock, that spare 'vd' is now in active use by
+ * its pool.  The vdev must already be flagged as a spare.
+ */
+void
+spa_spare_activate(vdev_t *vd)
+{
+	mutex_enter(&spa_spare_lock);
+
+	ASSERT(vd->vdev_isspare);
+	spa_aux_activate(vd, &spa_spare_avl);
+
+	mutex_exit(&spa_spare_lock);
+}
+
+/*
+ * Level 2 ARC devices are tracked globally for the same reasons as spares.
+ * Cache devices currently only support one pool per cache device, and so
+ * for these devices the aux reference count is currently unused beyond 1.
+ */
+
+/* AVL comparator for spa_l2cache_avl; defers to spa_aux_compare(). */
+static int
+spa_l2cache_compare(const void *a, const void *b)
+{
+	int order = spa_aux_compare(a, b);
+
+	return (order);
+}
+
+/*
+ * Register 'vd' as a cache device: add it to the global l2cache tree and
+ * set vdev_isl2cache, all under spa_l2cache_lock.  The vdev must not
+ * already be flagged as a cache device.
+ */
+void
+spa_l2cache_add(vdev_t *vd)
+{
+	mutex_enter(&spa_l2cache_lock);
+
+	ASSERT(!vd->vdev_isl2cache);
+	spa_aux_add(vd, &spa_l2cache_avl);
+	vd->vdev_isl2cache = B_TRUE;
+
+	mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * Undo spa_l2cache_add(): drop vd's reference in the global l2cache tree
+ * and clear vdev_isl2cache, all under spa_l2cache_lock.
+ */
+void
+spa_l2cache_remove(vdev_t *vd)
+{
+	mutex_enter(&spa_l2cache_lock);
+
+	ASSERT(vd->vdev_isl2cache);
+	spa_aux_remove(vd, &spa_l2cache_avl);
+	vd->vdev_isl2cache = B_FALSE;
+
+	mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * Locked wrapper around spa_aux_exists() for the l2cache tree.  Returns
+ * whether 'guid' is a known cache device and, if 'pool' is non-NULL,
+ * reports the guid of the pool actively using it (0 if none).
+ */
+boolean_t
+spa_l2cache_exists(uint64_t guid, uint64_t *pool)
+{
+	boolean_t known;
+
+	mutex_enter(&spa_l2cache_lock);
+	known = spa_aux_exists(guid, pool, &spa_l2cache_avl);
+	mutex_exit(&spa_l2cache_lock);
+
+	return (known);
+}
+
+/*
+ * Record, under spa_l2cache_lock, that cache device 'vd' is now in active
+ * use by its pool.  The vdev must already be flagged as a cache device.
+ */
+void
+spa_l2cache_activate(vdev_t *vd)
+{
+	mutex_enter(&spa_l2cache_lock);
+
+	ASSERT(vd->vdev_isl2cache);
+	spa_aux_activate(vd, &spa_l2cache_avl);
+
+	mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * Forward a space/alloc delta for cache device 'vd' to vdev_space_update(),
+ * always passing B_FALSE as the final argument.
+ */
+void
+spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
+{
+	vdev_space_update(vd, space, alloc, B_FALSE);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ *
+ * Scrubbing is suspended in between the two lock acquisitions. Both locks
+ * are still held on return (the config lock is tagged with 'spa'), and the
+ * suspension and both locks are undone by spa_vdev_exit(). The returned
+ * value is spa_last_synced_txg(spa) + 1.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+
+ /*
+ * Suspend scrub activity while we mess with the config. We must do
+ * this after acquiring the namespace lock to avoid a 3-way deadlock
+ * with spa_scrub_stop() and the scrub thread.
+ */
+ spa_scrub_suspend(spa);
+
+ spa_config_enter(spa, RW_WRITER, spa);
+
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions
+ * have synced to disk, and then update the global configuration cache with
+ * the new information.
+ *
+ * 'txg' is the value returned by spa_vdev_enter() and must be later than
+ * the last synced txg. 'vd', if non-NULL, is freed with vdev_free() once
+ * the sync wait (if any) has completed. 'error' is returned unchanged; when
+ * it is non-zero the scrub restart, the txg wait and the config cache
+ * update are all skipped.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ int config_changed = B_FALSE;
+
+ ASSERT(txg > spa_last_synced_txg(spa));
+
+ /*
+ * Reassess the DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ /*
+ * If the config changed, notify the scrub thread that it must restart.
+ * A non-empty spa_dirty_list is what signals a changed config here.
+ */
+ if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
+ config_changed = B_TRUE;
+ spa_scrub_restart(spa, txg);
+ }
+
+ spa_config_exit(spa, spa);
+
+ /*
+ * Allow scrubbing to resume.
+ */
+ spa_scrub_resume(spa);
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+ vdev_free(vd);
+ }
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed)
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+/*
+ * Rename a spa_t.
+ *
+ * Opens the pool called 'name', re-inserts it into spa_namespace_avl under
+ * 'newname', dirties the root vdev so the labels are rewritten, waits for
+ * the change to sync and then refreshes the config cache. The namespace
+ * lock is held for the whole operation. Returns the error from spa_open()
+ * if the pool cannot be opened, otherwise 0.
+ *
+ * NOTE(review): nothing visible here checks that 'newname' is not already
+ * in the namespace before avl_add() -- presumably callers validate this;
+ * confirm.
+ */
+int
+spa_rename(const char *name, const char *newname)
+{
+ spa_t *spa;
+ int err;
+
+ /*
+ * Lookup the spa_t and grab the config lock for writing. We need to
+ * actually open the pool so that we can sync out the necessary labels.
+ * It's OK to call spa_open() with the namespace lock held because we
+ * allow recursive calls for other reasons.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((err = spa_open(name, &spa, FTAG)) != 0) {
+ mutex_exit(&spa_namespace_lock);
+ return (err);
+ }
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ avl_remove(&spa_namespace_avl, spa); /* the tree is keyed by spa_name */
+ spa_strfree(spa->spa_name);
+ spa->spa_name = spa_strdup(newname);
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Sync all labels to disk with the new names by marking the root vdev
+ * dirty and waiting for it to sync. It will pick up the new pool name
+ * during the sync.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, FTAG);
+
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * Sync the updated config cache.
+ */
+ spa_config_sync();
+
+ spa_close(spa, FTAG);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
+/*
+ * Scan the namespace for a pool whose guid is 'pool_guid'.  With a zero
+ * 'device_guid' any such pool is a match; otherwise the pool must also
+ * contain a vdev with that guid, either under its root vdev or under a
+ * pending (being-added) vdev.  Pools that are uninitialized or have no
+ * root vdev are ignored.  The spa_namespace_lock must be held.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+	avl_tree_t *tree = &spa_namespace_avl;
+	spa_t *spa;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	for (spa = avl_first(tree); spa != NULL; spa = AVL_NEXT(tree, spa)) {
+		if (spa->spa_state == POOL_STATE_UNINITIALIZED ||
+		    spa->spa_root_vdev == NULL)
+			continue;
+
+		if (spa_guid(spa) != pool_guid)
+			continue;
+
+		/* Pool matches; is a device match required as well? */
+		if (device_guid == 0)
+			break;
+
+		if (vdev_lookup_by_guid(spa->spa_root_vdev,
+		    device_guid) != NULL)
+			break;
+
+		/* Check any devices we may be in the process of adding. */
+		if (spa->spa_pending_vdev != NULL &&
+		    vdev_lookup_by_guid(spa->spa_pending_vdev,
+		    device_guid) != NULL)
+			break;
+	}
+
+	/* The loop only ends early, with spa non-NULL, on a match. */
+	return (spa != NULL);
+}
+
+/*
+ * Return a kmem-allocated copy of the NUL-terminated string 's'.  The copy
+ * occupies exactly strlen(s) + 1 bytes; release it with spa_strfree().
+ */
+char *
+spa_strdup(const char *s)
+{
+	size_t n = strlen(s);
+	char *copy = kmem_alloc(n + 1, KM_SLEEP);
+
+	bcopy(s, copy, n);
+	copy[n] = '\0';
+
+	return (copy);
+}
+
+/*
+ * Free a string obtained from spa_strdup().  The allocation size is
+ * recomputed from the string's current length, so the string must not have
+ * been shortened or lengthened since it was duplicated.
+ */
+void
+spa_strfree(char *s)
+{
+	size_t size = strlen(s) + 1;
+
+	kmem_free(s, size);
+}
+
+/*
+ * Return a pseudo-random value in [0, range).  'range' must be non-zero.
+ * The value is 64 pseudo-random bits reduced modulo 'range'.
+ */
+uint64_t
+spa_get_random(uint64_t range)
+{
+	uint64_t bits;
+
+	ASSERT(range != 0);
+
+	(void) random_get_pseudo_bytes((void *)&bits, sizeof (uint64_t));
+
+	return (bits % range);
+}
+
+/*
+ * Format a human-readable description of block pointer 'bp' into 'buf',
+ * which is 'len' bytes long.  A NULL pointer prints as "<NULL>" and a hole
+ * as "<hole>".  Otherwise the output is the level/type/size header, one
+ * "DVA[n]=<vdev:offset:asize>" group per DVA, and a trailer with checksum
+ * and compression names, byte order, gang flag, birth txg, fill count and
+ * the four checksum words.  Each piece is appended at the current end of
+ * the string with snprintf().
+ */
+void
+sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+{
+	size_t used;
+	int i;
+
+	if (bp == NULL) {
+		(void) snprintf(buf, len, "<NULL>");
+		return;
+	}
+
+	if (BP_IS_HOLE(bp)) {
+		(void) snprintf(buf, len, "<hole>");
+		return;
+	}
+
+	(void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
+	    (u_longlong_t)BP_GET_LEVEL(bp),
+	    dmu_ot[BP_GET_TYPE(bp)].ot_name,
+	    (u_longlong_t)BP_GET_LSIZE(bp),
+	    (u_longlong_t)BP_GET_PSIZE(bp));
+
+	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+		const dva_t *dva = &bp->blk_dva[i];
+
+		used = strlen(buf);
+		(void) snprintf(buf + used, len - used,
+		    "DVA[%d]=<%llu:%llx:%llx> ", i,
+		    (u_longlong_t)DVA_GET_VDEV(dva),
+		    (u_longlong_t)DVA_GET_OFFSET(dva),
+		    (u_longlong_t)DVA_GET_ASIZE(dva));
+	}
+
+	used = strlen(buf);
+	(void) snprintf(buf + used, len - used,
+	    "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
+	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
+	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
+	    BP_IS_GANG(bp) ? "gang" : "contiguous",
+	    (u_longlong_t)bp->blk_birth,
+	    (u_longlong_t)bp->blk_fill,
+	    (u_longlong_t)bp->blk_cksum.zc_word[0],
+	    (u_longlong_t)bp->blk_cksum.zc_word[1],
+	    (u_longlong_t)bp->blk_cksum.zc_word[2],
+	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+/*
+ * Freeze the pool.  If no freeze txg has been set yet (spa_freeze_txg is
+ * still UINT64_MAX), set it to the last synced txg plus TXG_SIZE under the
+ * config write lock, then wait for that txg to sync.  If a freeze txg was
+ * already set, nothing is changed and nothing is waited for.
+ */
+void
+spa_freeze(spa_t *spa)
+{
+	uint64_t target = 0;
+
+	spa_config_enter(spa, RW_WRITER, FTAG);
+	if (spa->spa_freeze_txg == UINT64_MAX) {
+		target = spa_last_synced_txg(spa) + TXG_SIZE;
+		spa->spa_freeze_txg = target;
+	}
+	spa_config_exit(spa, FTAG);
+
+	if (target == 0)
+		return;
+
+	txg_wait_synced(spa_get_dsl(spa), target);
+}
+
+/*
+ * Report an otherwise-fatal condition.  The printf-style message is issued
+ * through vcmn_err() at CE_PANIC level, or at CE_WARN level when the
+ * zfs_recover tunable is non-zero.
+ */
+void
+zfs_panic_recover(const char *fmt, ...)
+{
+	int level = zfs_recover ? CE_WARN : CE_PANIC;
+	va_list ap;
+
+	va_start(ap, fmt);
+	vcmn_err(level, fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+/* Accessor: address of the pool's traverse rwlock. */
+krwlock_t *
+spa_traverse_rwlock(spa_t *spa)
+{
+	krwlock_t *lock = &spa->spa_traverse_lock;
+
+	return (lock);
+}
+
+/* Accessor: current value of the pool's spa_traverse_wanted field. */
+int
+spa_traverse_wanted(spa_t *spa)
+{
+	int wanted = spa->spa_traverse_wanted;
+
+	return (wanted);
+}
+
+/* Accessor: the DSL pool attached to this spa_t. */
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+
+	return (dp);
+}
+
+/*
+ * Accessor: address of the root block pointer stored in spa_ubsync.  Note
+ * that spa_set_rootblkptr() writes a different uberblock, spa_uberblock.
+ */
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+	blkptr_t *rootbp = &spa->spa_ubsync.ub_rootbp;
+
+	return (rootbp);
+}
+
+/*
+ * Copy '*bp' into the root block pointer of spa_uberblock.  spa_ubsync,
+ * which spa_get_rootblkptr() reads, is not touched here.
+ */
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+	blkptr_t *dst = &spa->spa_uberblock.ub_rootbp;
+
+	*dst = *bp;
+}
+
+/*
+ * Copy the pool's alternate root into 'buf', which is 'buflen' bytes long.
+ * A pool without an alternate root yields the empty string.
+ *
+ * The result is always NUL-terminated; a root that does not fit is
+ * truncated to buflen - 1 characters.  A zero-length buffer is left
+ * untouched.
+ */
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+	if (buflen == 0)
+		return;
+
+	if (spa->spa_root == NULL) {
+		buf[0] = '\0';
+		return;
+	}
+
+	/*
+	 * strncpy() does not terminate the destination when the source is
+	 * buflen bytes or longer, so terminate explicitly.
+	 */
+	(void) strncpy(buf, spa->spa_root, buflen);
+	buf[buflen - 1] = '\0';
+}
+
+/* Accessor: current value of the pool's spa_sync_pass field. */
+int
+spa_sync_pass(spa_t *spa)
+{
+	int pass = spa->spa_sync_pass;
+
+	return (pass);
+}
+
+/*
+ * Accessor: the pool's name.  The returned pointer is the spa_t's own
+ * string, not a copy.
+ */
+char *
+spa_name(spa_t *spa)
+{
+	/*
+	 * A rename needs both the namespace lock and the config lock, so
+	 * holding either one is enough to keep the name stable while it is
+	 * being read.
+	 */
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    spa_config_held(spa, RW_READER));
+
+	return (spa->spa_name);
+}
+
+/*
+ * Accessor: the pool guid.  Normally this is the guid of the root vdev.
+ */
+uint64_t
+spa_guid(spa_t *spa)
+{
+	/*
+	 * A failure to parse the config in spa_load() sends us down an
+	 * error path (which posts an ereport) that can reach this function
+	 * before any root vdev exists.  The original pool guid is stashed
+	 * in 'spa_load_guid' for exactly that case.
+	 */
+	if (spa->spa_root_vdev == NULL)
+		return (spa->spa_load_guid);
+
+	return (spa->spa_root_vdev->vdev_guid);
+}
+
+/* Accessor: the txg recorded in spa_ubsync. */
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+	uint64_t txg = spa->spa_ubsync.ub_txg;
+
+	return (txg);
+}
+
+/* Accessor: current value of the pool's spa_first_txg field. */
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+	uint64_t txg = spa->spa_first_txg;
+
+	return (txg);
+}
+
+/* Accessor: the pool's state (a POOL_STATE_* value), returned as an int. */
+int
+spa_state(spa_t *spa)
+{
+	int state = spa->spa_state;
+
+	return (state);
+}
+
+/*
+ * Accessor: the pool's freeze txg.  This is UINT64_MAX until spa_freeze()
+ * sets it.
+ */
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+	uint64_t txg = spa->spa_freeze_txg;
+
+	return (txg);
+}
+
+/*
+ * Allocated space in the pool (i.e. the sum of all asize), taken from the
+ * root vdev's statistics.
+ */
+uint64_t
+spa_get_alloc(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	return (rvd->vdev_stat.vs_alloc);
+}
+
+/*
+ * Total (raid-z inflated) space in the pool, taken from the root vdev's
+ * statistics.
+ */
+uint64_t
+spa_get_space(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	return (rvd->vdev_stat.vs_space);
+}
+
+/*
+ * Raid-z-deflated space in the pool.  When spa_deflate is not set, the
+ * inflated figure (vs_space) is returned instead.
+ */
+uint64_t
+spa_get_dspace(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	if (!spa->spa_deflate)
+		return (rvd->vdev_stat.vs_space);
+
+	return (rvd->vdev_stat.vs_dspace);
+}
+
+/*
+ * Worst-case allocated size for 'lsize' logical bytes.  'spa' is unused.
+ */
+/* ARGSUSED */
+uint64_t
+spa_get_asize(spa_t *spa, uint64_t lsize)
+{
+	/*
+	 * The worst case today is RAID-Z with 512-byte blocks, which costs
+	 * exactly 2x.  A block pointer can also carry up to 3 DVAs, so the
+	 * combined factor is 2 * 3 = 6.
+	 */
+	uint64_t worst = lsize * 6;
+
+	return (worst);
+}
+
+/*
+ * Accessor: the failure mode configured for this pool.  By default all
+ * I/Os block when a complete failure occurs.
+ */
+uint8_t
+spa_get_failmode(spa_t *spa)
+{
+	uint8_t mode = spa->spa_failmode;
+
+	return (mode);
+}
+
+/* Accessor: the on-disk version recorded in spa_ubsync. */
+uint64_t
+spa_version(spa_t *spa)
+{
+	uint64_t version = spa->spa_ubsync.ub_version;
+
+	return (version);
+}
+
+/*
+ * Maximum number of DVAs that may be allocated per block pointer in this
+ * pool.
+ */
+int
+spa_max_replication(spa_t *spa)
+{
+	/*
+	 * BPs with more than one DVA are only supported from
+	 * SPA_VERSION_DITTO_BLOCKS onward.  At or above that version the
+	 * limit is SPA_DVAS_PER_BP, optionally lowered through
+	 * spa_max_replication_override.
+	 */
+	if (spa_version(spa) >= SPA_VERSION_DITTO_BLOCKS)
+		return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
+
+	return (1);
+}
+
+/*
+ * Return the deflated allocated size of block pointer 'bp'.
+ *
+ * When the pool does not use deflated accounting (spa_deflate is clear)
+ * this is simply BP_GET_ASIZE(bp).  Otherwise each DVA's asize, expressed
+ * in SPA_MINBLOCKSHIFT units, is scaled by the deflate ratio of its
+ * top-level vdev and the results are summed.  DVAs whose top-level vdev
+ * cannot be found contribute nothing.  The config lock is held as reader
+ * during the vdev lookups.
+ *
+ * The sum is kept in a uint64_t, matching the return type, so large sizes
+ * are neither truncated nor sign-extended.
+ */
+uint64_t
+bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+{
+	uint64_t sz = 0;
+	int i;
+
+	if (!spa->spa_deflate)
+		return (BP_GET_ASIZE(bp));
+
+	spa_config_enter(spa, RW_READER, FTAG);
+	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+		vdev_t *vd =
+		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
+
+		if (vd != NULL)
+			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
+			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+	}
+	spa_config_exit(spa, FTAG);
+
+	return (sz);
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+/*
+ * AVL comparator for spa_namespace_avl: orders spa_t's by spa_name using
+ * strcmp() and normalizes the result to -1, 0 or 1.
+ */
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+	const spa_t *lhs = a1;
+	const spa_t *rhs = a2;
+	int rc = strcmp(lhs->spa_name, rhs->spa_name);
+
+	if (rc == 0)
+		return (0);
+
+	return (rc < 0 ? -1 : 1);
+}
+
+/*
+ * Return spa_active_count, which counts the spa_t's that were added with
+ * an alternate root.  Non-zero therefore means at least one such pool.
+ */
+int
+spa_busy(void)
+{
+	int active = spa_active_count;
+
+	return (active);
+}
+
+/*
+ * Boot-time initialization hook: loads the pool configuration via
+ * spa_config_load().  Takes no arguments; declared with an explicit (void)
+ * parameter list like spa_busy() and spa_fini().
+ */
+void
+spa_boot_init(void)
+{
+	spa_config_load();
+}
+
+/*
+ * One-time SPA initialization.  Sets up the global locks, the namespace
+ * condition variable and the three global AVL trees (pool namespace,
+ * spares, l2cache devices), records 'mode' in spa_mode, initializes the
+ * dependent subsystems and finally loads the pool configuration.
+ */
+void
+spa_init(int mode)
+{
+	/* Global synchronization primitives. */
+	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+	/* Pools are keyed by name; auxiliary devices by guid. */
+	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+	    offsetof(spa_t, spa_avl));
+	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+	    offsetof(spa_aux_t, aux_avl));
+	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+	    offsetof(spa_aux_t, aux_avl));
+
+	spa_mode = mode;
+
+	/* Subsystems; spa_fini() tears these down in reverse order. */
+	refcount_init();
+	unique_init();
+	zio_init();
+	dmu_init();
+	zil_init();
+	vdev_cache_stat_init();
+	zfs_prop_init();
+	zpool_prop_init();
+	spa_config_load();
+}
+
+/*
+ * Global SPA teardown, the counterpart of spa_init().  Evicts every pool,
+ * shuts down the subsystems in the reverse of their initialization order,
+ * and then destroys the global AVL trees, condition variable and locks.
+ */
+void
+spa_fini(void)
+{
+	spa_evict_all();
+
+	/* Subsystems, in reverse order of spa_init(). */
+	vdev_cache_stat_fini();
+	zil_fini();
+	dmu_fini();
+	zio_fini();
+	unique_fini();
+	refcount_fini();
+
+	/* Global trees. */
+	avl_destroy(&spa_namespace_avl);
+	avl_destroy(&spa_spare_avl);
+	avl_destroy(&spa_l2cache_avl);
+
+	/* Global synchronization primitives. */
+	cv_destroy(&spa_namespace_cv);
+	mutex_destroy(&spa_namespace_lock);
+	mutex_destroy(&spa_spare_lock);
+	mutex_destroy(&spa_l2cache_lock);
+}
+
+/*
+ * Tell whether the pool has separate log devices, i.e. whether the log
+ * metaslab class has a rotor.  No lock is taken: the answer is only used
+ * as a performance hint, so an occasional wrong answer does not affect
+ * correctness.
+ */
+boolean_t
+spa_has_slogs(spa_t *spa)
+{
+	boolean_t has_rotor = (spa->spa_log_class->mc_rotor != NULL);
+
+	return (has_rotor);
+}
diff --git a/zfs/lib/libzpool/space_map.c b/zfs/lib/libzpool/space_map.c
new file mode 100644
index 000000000..b1e6bc15f
--- /dev/null
+++ b/zfs/lib/libzpool/space_map.c
@@ -0,0 +1,506 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)space_map.c 1.7 08/03/11 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/space_map.h>
+
+/*
+ * Space map routines.
+ * NOTE: caller is responsible for all locking.
+ */
+/*
+ * Comparator for space_seg_t nodes.  Two segments compare equal when they
+ * start at the same offset or when their [ss_start, ss_end) ranges
+ * overlap; otherwise the one with the smaller ss_start sorts first.
+ * Segments that merely touch (one ends where the other starts) are not
+ * equal.
+ */
+static int
+space_map_seg_compare(const void *x1, const void *x2)
+{
+	const space_seg_t *a = x1;
+	const space_seg_t *b = x2;
+
+	if (a->ss_start == b->ss_start)
+		return (0);
+
+	if (a->ss_start < b->ss_start)
+		return (a->ss_end > b->ss_start ? 0 : -1);
+
+	return (a->ss_start < b->ss_end ? 0 : 1);
+}
+
+/*
+ * Initialize 'sm' as an empty map.  The structure is zeroed, then the
+ * range start, size, shift and lock pointer are recorded and the segment
+ * AVL tree is created with space_map_seg_compare() as its ordering.
+ */
+void
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
+    kmutex_t *lp)
+{
+	/* Every field not assigned below starts out as zero. */
+	bzero(sm, sizeof (*sm));
+
+	sm->sm_lock = lp;
+	sm->sm_shift = shift;
+	sm->sm_size = size;
+	sm->sm_start = start;
+
+	avl_create(&sm->sm_root, space_map_seg_compare,
+	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+}
+
+/*
+ * Destroy the map's AVL tree.  The map must be neither loaded nor loading
+ * (ASSERT) and must account for no space (VERIFY), so it has to be emptied
+ * before this is called.
+ */
+void
+space_map_destroy(space_map_t *sm)
+{
+ ASSERT(!sm->sm_loaded && !sm->sm_loading);
+ VERIFY3U(sm->sm_space, ==, 0);
+ avl_destroy(&sm->sm_root);
+}
+
+/*
+ * Add the range [start, start + size) to the map, merging it with the
+ * segment that ends at 'start' and/or the segment that begins at the new
+ * end when they exist.  sm_space grows by 'size'.
+ *
+ * The range must be non-empty, aligned to 1 << sm_shift, inside
+ * [sm_start, sm_start + sm_size) and must not push sm_space past sm_size;
+ * each of these is enforced with VERIFY.  If an existing segment already
+ * covers the whole range, zfs_panic_recover() is called and the map is
+ * left untouched; any other overlap fails the VERIFY below.
+ * The caller must hold sm_lock.
+ */
+void
+space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	avl_index_t where;
+	space_seg_t ssearch, *ss_before, *ss_after, *ss;
+	uint64_t end = start + size;
+	int merge_before, merge_after;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(size != 0);
+	VERIFY3U(start, >=, sm->sm_start);
+	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
+	VERIFY(sm->sm_space + size <= sm->sm_size);
+	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+	/* The comparator reports "equal" for any segment overlapping us. */
+	ssearch.ss_start = start;
+	ssearch.ss_end = end;
+	ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+	if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
+		/*
+		 * The two literals are concatenated, so the first one needs
+		 * its trailing space to keep "segment" and "(offset" apart.
+		 */
+		zfs_panic_recover("zfs: allocating allocated segment "
+		    "(offset=%llu size=%llu)\n",
+		    (longlong_t)start, (longlong_t)size);
+		return;
+	}
+
+	/* Make sure we don't overlap with either of our neighbors */
+	VERIFY(ss == NULL);
+
+	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
+	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
+
+	merge_before = (ss_before != NULL && ss_before->ss_end == start);
+	merge_after = (ss_after != NULL && ss_after->ss_start == end);
+
+	if (merge_before && merge_after) {
+		/* Bridge the gap: fold ss_before into ss_after. */
+		avl_remove(&sm->sm_root, ss_before);
+		ss_after->ss_start = ss_before->ss_start;
+		kmem_free(ss_before, sizeof (*ss_before));
+	} else if (merge_before) {
+		ss_before->ss_end = end;
+	} else if (merge_after) {
+		ss_after->ss_start = start;
+	} else {
+		/* No adjacent neighbor: insert a brand new segment. */
+		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+		ss->ss_start = start;
+		ss->ss_end = end;
+		avl_insert(&sm->sm_root, ss, where);
+	}
+
+	sm->sm_space += size;
+}
+
+/*
+ * Remove the range [start, start + size) from the map.  The range must lie
+ * entirely within one existing segment; that segment is split in two,
+ * trimmed on one side, or freed, depending on what is left over.
+ * sm_space shrinks by 'size'.  If no segment overlaps the range at all,
+ * zfs_panic_recover() is called and the map is left unchanged.
+ * 'size' must be non-zero and both values aligned to 1 << sm_shift.
+ * The caller must hold sm_lock.
+ */
+void
+space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss, *newseg;
+ uint64_t end = start + size;
+ int left_over, right_over;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ if (ss == NULL) {
+ zfs_panic_recover("zfs: freeing free segment "
+ "(offset=%llu size=%llu)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+ VERIFY3U(ss->ss_start, <=, start);
+ VERIFY3U(ss->ss_end, >=, end);
+ /* with unsigned arithmetic, size > sm_space wraps and trips this */
+ VERIFY(sm->sm_space - size <= sm->sm_size);
+
+ left_over = (ss->ss_start != start);
+ right_over = (ss->ss_end != end);
+
+ if (left_over && right_over) {
+ /* hole in the middle: ss keeps the left part, newseg the right */
+ newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+ newseg->ss_start = end;
+ newseg->ss_end = ss->ss_end;
+ ss->ss_end = start;
+ avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ } else if (left_over) {
+ ss->ss_end = start;
+ } else if (right_over) {
+ ss->ss_start = end;
+ } else {
+ /* exact match: the whole segment goes away */
+ avl_remove(&sm->sm_root, ss);
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ sm->sm_space -= size;
+}
+
+/*
+ * Return non-zero if a single segment of the map covers all of
+ * [start, start + size), zero otherwise (including when the range is only
+ * partially covered).  'size' must be non-zero and both values aligned to
+ * 1 << sm_shift.  The caller must hold sm_lock.
+ */
+int
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss;
+ uint64_t end = start + size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ /* 'where' is filled in by avl_find() but not used afterwards */
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
+}
+
+/*
+ * Empty the map.  Each segment is passed to func(mdest, start, length)
+ * when func is non-NULL, then freed; sm_space is reset to 0 at the end.
+ * The caller must hold sm_lock.
+ */
+void
+space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+ void *cookie = NULL;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ if (func != NULL)
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+ kmem_free(ss, sizeof (*ss));
+ }
+ sm->sm_space = 0;
+}
+
+/*
+ * Call func(mdest, start, length) for every segment of 'sm', visiting the
+ * segments from avl_first() onward with AVL_NEXT().  The map itself is
+ * not modified here, and func must be non-NULL.
+ */
+void
+space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+	avl_tree_t *tree = &sm->sm_root;
+	space_seg_t *seg = avl_first(tree);
+
+	while (seg != NULL) {
+		func(mdest, seg->ss_start, seg->ss_end - seg->ss_start);
+		seg = AVL_NEXT(tree, seg);
+	}
+}
+
+/*
+ * Remove from the map whatever parts of its segments intersect
+ * [start, start + size).  Unlike space_map_remove(), the range does not
+ * have to be covered by the map.  The caller must hold sm_lock.
+ */
+void
+space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, search;
+ uint64_t end = start + size;
+ uint64_t rm_start, rm_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ /* a zero-length probe at 'start' matches a segment containing it */
+ search.ss_start = start;
+ search.ss_end = start;
+
+ for (;;) {
+ ss = avl_find(t, &search, &where);
+
+ /* nothing contains 'start': take the next segment after it */
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ if (ss == NULL || ss->ss_start >= end)
+ break;
+
+ /* clip the segment to the excised range and drop that piece */
+ rm_start = MAX(ss->ss_start, start);
+ rm_end = MIN(ss->ss_end, end);
+
+ /* the tree changes here, so the next pass searches it afresh */
+ space_map_remove(sm, rm_start, rm_end - rm_start);
+ }
+}
+
+/*
+ * Replace smd with the union of smd and sms.  sms is only read.
+ * smd's lock must be held by the caller; nothing is asserted about
+ * sms's lock here.
+ */
+void
+space_map_union(space_map_t *smd, space_map_t *sms)
+{
+ avl_tree_t *t = &sms->sm_root;
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(smd->sm_lock));
+
+ /*
+ * For each source segment, remove any intersections with the
+ * destination, then add the source segment to the destination.
+ * Excising first keeps space_map_add() from seeing an overlap.
+ */
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
+ space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ }
+}
+
+/*
+ * Wait for any in-progress space_map_load() to complete.
+ * The caller must hold sm_lock; cv_wait() is given that lock while
+ * waiting on sm_load_cv, which space_map_load() broadcasts when it
+ * clears sm_loading.
+ */
+void
+space_map_load_wait(space_map_t *sm)
+{
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while (sm->sm_loading)
+ cv_wait(&sm->sm_load_cv, sm->sm_lock);
+}
+
+/*
+ * Build the in-core map from the on-disk object described by 'smo' in
+ * objset 'os'.  Returns 0 on success (or if the map was already loaded),
+ * otherwise the error from dmu_read(), in which case the map is emptied.
+ *
+ * 'maptype' selects what the in-core map represents: entries whose type
+ * equals maptype are added, all others are removed.  For SM_FREE the map
+ * starts out covering the whole range.  On success sm_ops is set to 'ops'
+ * and, if ops is non-NULL, ops->smop_load() is called.
+ *
+ * Note: space_map_load() will drop sm_lock across dmu_read() calls.
+ * The caller must be OK with this.
+ */
+int
+space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os)
+{
+ uint64_t *entry, *entry_map, *entry_map_end;
+ uint64_t bufsize, size, offset, end, space;
+ uint64_t mapstart = sm->sm_start;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ /* let a concurrent load finish; it may have done our work for us */
+ space_map_load_wait(sm);
+
+ if (sm->sm_loaded)
+ return (0);
+
+ sm->sm_loading = B_TRUE;
+ end = smo->smo_objsize;
+ space = smo->smo_alloc;
+
+ ASSERT(sm->sm_ops == NULL);
+ VERIFY3U(sm->sm_space, ==, 0);
+
+ if (maptype == SM_FREE) {
+ /* start with everything free; expect sm_size - smo_alloc at the end */
+ space_map_add(sm, sm->sm_start, sm->sm_size);
+ space = sm->sm_size - space;
+ }
+
+ bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+ entry_map = zio_buf_alloc(bufsize);
+
+ /* prefetch everything past the first buffer's worth of the object */
+ mutex_exit(sm->sm_lock);
+ if (end > bufsize)
+ dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+ mutex_enter(sm->sm_lock);
+
+ for (offset = 0; offset < end; offset += bufsize) {
+ size = MIN(end - offset, bufsize);
+ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+ VERIFY(size != 0);
+
+ dprintf("object=%llu offset=%llx size=%llx\n",
+ smo->smo_object, offset, size);
+
+ mutex_exit(sm->sm_lock);
+ error = dmu_read(os, smo->smo_object, offset, size, entry_map);
+ mutex_enter(sm->sm_lock);
+ if (error != 0)
+ break;
+
+ entry_map_end = entry_map + (size / sizeof (uint64_t));
+ for (entry = entry_map; entry < entry_map_end; entry++) {
+ uint64_t e = *entry;
+
+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
+ continue;
+
+ /* decoded offset and run are in units of 1 << sm_shift */
+ (SM_TYPE_DECODE(e) == maptype ?
+ space_map_add : space_map_remove)(sm,
+ (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
+ SM_RUN_DECODE(e) << sm->sm_shift);
+ }
+ }
+
+ if (error == 0) {
+ /* the replayed log must agree with the recorded allocation */
+ VERIFY3U(sm->sm_space, ==, space);
+
+ sm->sm_loaded = B_TRUE;
+ sm->sm_ops = ops;
+ if (ops != NULL)
+ ops->smop_load(sm);
+ } else {
+ /* discard the partial result */
+ space_map_vacate(sm, NULL, NULL);
+ }
+
+ zio_buf_free(entry_map, bufsize);
+
+ sm->sm_loading = B_FALSE;
+
+ /* wake anyone parked in space_map_load_wait() */
+ cv_broadcast(&sm->sm_load_cv);
+
+ return (error);
+}
+
+/*
+ * Drop the in-core contents of the map.  If the map is loaded and has an
+ * ops vector, smop_unload() is called first; then the loaded flag and
+ * sm_ops are cleared and all segments are freed via space_map_vacate().
+ * The caller must hold sm_lock.
+ */
+void
+space_map_unload(space_map_t *sm)
+{
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_loaded && sm->sm_ops != NULL)
+ sm->sm_ops->smop_unload(sm);
+
+ sm->sm_loaded = B_FALSE;
+ sm->sm_ops = NULL;
+
+ space_map_vacate(sm, NULL, NULL);
+}
+
+/*
+ * Ask the map's ops vector for 'size' bytes via smop_alloc().  If it
+ * returns an offset other than -1ULL, that range is removed from the map.
+ * The value from smop_alloc() is returned either way, so -1ULL tells the
+ * caller nothing was allocated.  sm_ops must be non-NULL.
+ */
+uint64_t
+space_map_alloc(space_map_t *sm, uint64_t size)
+{
+	uint64_t offset = sm->sm_ops->smop_alloc(sm, size);
+
+	if (offset == -1ULL)
+		return (offset);
+
+	space_map_remove(sm, offset, size);
+	return (offset);
+}
+
+/*
+ * Take the specific range [start, start + size) out of the map.  The ops
+ * vector's smop_claim() is called before the range is removed.
+ * sm_ops must be non-NULL.
+ */
+void
+space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ sm->sm_ops->smop_claim(sm, start, size);
+ space_map_remove(sm, start, size);
+}
+
+/*
+ * Put the range [start, start + size) back into the map.  The range is
+ * added first and the ops vector's smop_free() is called afterwards.
+ * sm_ops must be non-NULL.
+ */
+void
+space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ space_map_add(sm, start, size);
+ sm->sm_ops->smop_free(sm, start, size);
+}
+
+/*
+ * Append the contents of the in-core map to the on-disk object described
+ * by 'smo', as entries of type 'maptype', and empty the in-core map while
+ * doing so.  smo_alloc is adjusted by sm_space (up for SM_ALLOC, down
+ * otherwise) and smo_objsize grows by the number of bytes written.
+ * Nothing is written when the map holds no space.
+ *
+ * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ */
+void
+space_map_sync(space_map_t *sm, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ void *cookie = NULL;
+ space_seg_t *ss;
+ uint64_t bufsize, start, size, run_len;
+ uint64_t *entry, *entry_map, *entry_map_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_space == 0)
+ return;
+
+ dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
+ smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
+ maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
+ sm->sm_space);
+
+ if (maptype == SM_ALLOC)
+ smo->smo_alloc += sm->sm_space;
+ else
+ smo->smo_alloc -= sm->sm_space;
+
+ /* one word per segment plus slack, capped at one space map block */
+ bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
+ bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
+ entry_map = zio_buf_alloc(bufsize);
+ entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+ entry = entry_map;
+
+ /* lead with a debug entry recording the action, sync pass and txg */
+ *entry++ = SM_DEBUG_ENCODE(1) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ size = ss->ss_end - ss->ss_start;
+ /* on disk, offsets are relative to sm_start in 1 << sm_shift units */
+ start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+
+ sm->sm_space -= size;
+ size >>= sm->sm_shift;
+
+ /* a segment longer than SM_RUN_MAX takes several entries */
+ while (size) {
+ run_len = MIN(size, SM_RUN_MAX);
+
+ if (entry == entry_map_end) {
+ /* buffer full: flush it and start refilling from the top */
+ mutex_exit(sm->sm_lock);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ bufsize, entry_map, tx);
+ mutex_enter(sm->sm_lock);
+ smo->smo_objsize += bufsize;
+ entry = entry_map;
+ }
+
+ *entry++ = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+
+ start += run_len;
+ size -= run_len;
+ }
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ /* write out whatever is left in a partially filled buffer */
+ if (entry != entry_map) {
+ size = (entry - entry_map) * sizeof (uint64_t);
+ mutex_exit(sm->sm_lock);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ size, entry_map, tx);
+ mutex_enter(sm->sm_lock);
+ smo->smo_objsize += size;
+ }
+
+ zio_buf_free(entry_map, bufsize);
+
+ VERIFY3U(sm->sm_space, ==, 0);
+}
+
+/*
+ * Discard the on-disk contents of the space map object: free its whole
+ * range (offset 0, length -1ULL) in transaction 'tx' and reset the
+ * recorded object size and allocation count to zero.  A failing
+ * dmu_free_range() trips the VERIFY.
+ */
+void
+space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+{
+ VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
+
+ smo->smo_objsize = 0;
+ smo->smo_alloc = 0;
+}
diff --git a/zfs/lib/libzpool/taskq.c b/zfs/lib/libzpool/taskq.c
new file mode 100644
index 000000000..6f6cfc269
--- /dev/null
+++ b/zfs/lib/libzpool/taskq.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <sys/zfs_context.h>
+
+/*
+ * When non-zero, taskq_dispatch() runs the function synchronously in the
+ * caller and taskq_member() always returns 1.
+ */
+int taskq_now;
+
+/*
+ * One queued unit of work.  Pending tasks sit on a circular doubly linked
+ * list headed by tq_task; cached structures on tq_freelist are chained
+ * through task_next only.
+ */
+typedef struct task {
+ struct task *task_next;
+ struct task *task_prev;
+ task_func_t *task_func; /* function to call */
+ void *task_arg; /* its single argument */
+} task_t;
+
+/* tq_flags bit: set by taskq_create(), cleared by taskq_destroy() */
+#define TASKQ_ACTIVE 0x00010000
+
+struct taskq {
+ kmutex_t tq_lock; /* held around queue, freelist and counter updates */
+ krwlock_t tq_threadlock; /* held as reader while a task function runs */
+ kcondvar_t tq_dispatch_cv; /* signalled when work is queued or on destroy */
+ kcondvar_t tq_wait_cv; /* broadcast when threads go idle or exit */
+ thread_t *tq_threadlist; /* ids of the worker threads */
+ int tq_flags; /* creation flags, plus TASKQ_ACTIVE */
+ int tq_active; /* threads not currently waiting for work */
+ int tq_nthreads; /* worker threads that have not exited */
+ int tq_nalloc; /* task_t's allocated, including cached ones */
+ int tq_minalloc; /* task_free() caches up to this many */
+ int tq_maxalloc; /* task_alloc() throttles at this many */
+ task_t *tq_freelist; /* cached task_t structures */
+ task_t tq_task; /* list head for pending tasks */
+};
+
+/*
+ * Obtain a task_t for a new dispatch.  Called and returns with tq_lock
+ * held; the lock is dropped around delay() and kmem_alloc().
+ *
+ * A cached structure is taken from tq_freelist when one is available and
+ * tq_nalloc >= tq_minalloc.  Otherwise a new one is allocated, with
+ * 'tqflags' passed through to kmem_alloc().  If tq_nalloc has already
+ * reached tq_maxalloc, a caller without KM_SLEEP gets NULL and a KM_SLEEP
+ * caller is held back for one second before allocating anyway.
+ *
+ * Returns the task_t, or NULL if the limit was hit without KM_SLEEP or
+ * kmem_alloc() returned NULL.
+ */
+static task_t *
+task_alloc(taskq_t *tq, int tqflags)
+{
+	task_t *t;
+	int at_limit;
+
+	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+		tq->tq_freelist = t->task_next;
+		return (t);
+	}
+
+	/*
+	 * Sample the allocation count while tq_lock is still held; every
+	 * other access to tq_nalloc is made under the lock.
+	 */
+	at_limit = (tq->tq_nalloc >= tq->tq_maxalloc);
+	if (at_limit && !(tqflags & KM_SLEEP))
+		return (NULL);
+
+	mutex_exit(&tq->tq_lock);
+	if (at_limit) {
+		/*
+		 * We don't want to exceed tq_maxalloc, but we can't
+		 * wait for other tasks to complete (and thus free up
+		 * task structures) without risking deadlock with
+		 * the caller.  So, we just delay for one second
+		 * to throttle the allocation rate.
+		 */
+		delay(hz);
+	}
+	t = kmem_alloc(sizeof (task_t), tqflags);
+	mutex_enter(&tq->tq_lock);
+	if (t != NULL)
+		tq->tq_nalloc++;
+
+	return (t);
+}
+
+/*
+ * Release a task_t.  While tq_nalloc is no greater than tq_minalloc the
+ * structure is cached on tq_freelist; otherwise it is returned to the
+ * allocator.  Called and returns with tq_lock held; the lock is dropped
+ * around kmem_free().
+ */
+static void
+task_free(taskq_t *tq, task_t *t)
+{
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ t->task_next = tq->tq_freelist;
+ tq->tq_freelist = t;
+ } else {
+ tq->tq_nalloc--;
+ mutex_exit(&tq->tq_lock);
+ kmem_free(t, sizeof (task_t));
+ mutex_enter(&tq->tq_lock);
+ }
+}
+
+/*
+ * Queue func(arg) for execution by one of the queue's worker threads.
+ * 'tqflags' is handed to task_alloc().  Returns 1 once the task is queued
+ * (or, when taskq_now is set, after running it inline) and 0 if no task
+ * structure could be obtained.
+ */
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
+{
+ task_t *t;
+
+ if (taskq_now) {
+ func(arg);
+ return (1);
+ }
+
+ mutex_enter(&tq->tq_lock);
+ ASSERT(tq->tq_flags & TASKQ_ACTIVE);
+ if ((t = task_alloc(tq, tqflags)) == NULL) {
+ mutex_exit(&tq->tq_lock);
+ return (0);
+ }
+ /* link in just before the list head, i.e. at the tail of the queue */
+ t->task_next = &tq->tq_task;
+ t->task_prev = tq->tq_task.task_prev;
+ t->task_next->task_prev = t;
+ t->task_prev->task_next = t;
+ t->task_func = func;
+ t->task_arg = arg;
+ cv_signal(&tq->tq_dispatch_cv);
+ mutex_exit(&tq->tq_lock);
+ return (1);
+}
+
+/*
+ * Block until the pending list is empty and no worker thread is active
+ * (tq_active == 0).
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ mutex_enter(&tq->tq_lock);
+ while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0)
+ cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+ mutex_exit(&tq->tq_lock);
+}
+
+/*
+ * Worker thread body; 'arg' is the taskq_t.  While TASKQ_ACTIVE is set the
+ * thread takes the task at the head of the pending list, runs it with
+ * tq_lock dropped, and recycles the task structure.  With nothing pending
+ * it counts itself out of tq_active and sleeps on tq_dispatch_cv.  On the
+ * way out it decrements tq_nthreads and wakes tq_wait_cv waiters.
+ */
+static void *
+taskq_thread(void *arg)
+{
+ taskq_t *tq = arg;
+ task_t *t;
+
+ mutex_enter(&tq->tq_lock);
+ while (tq->tq_flags & TASKQ_ACTIVE) {
+ if ((t = tq->tq_task.task_next) == &tq->tq_task) {
+ /* queue empty: the last thread to go idle wakes taskq_wait() */
+ if (--tq->tq_active == 0)
+ cv_broadcast(&tq->tq_wait_cv);
+ cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
+ tq->tq_active++;
+ continue;
+ }
+ /* unlink the task from the pending list */
+ t->task_prev->task_next = t->task_next;
+ t->task_next->task_prev = t->task_prev;
+ mutex_exit(&tq->tq_lock);
+
+ rw_enter(&tq->tq_threadlock, RW_READER);
+ t->task_func(t->task_arg);
+ rw_exit(&tq->tq_threadlock);
+
+ mutex_enter(&tq->tq_lock);
+ task_free(tq, t);
+ }
+ tq->tq_nthreads--;
+ cv_broadcast(&tq->tq_wait_cv);
+ mutex_exit(&tq->tq_lock);
+ return (NULL);
+}
+
+/*
+ * Create a task queue served by 'nthreads' worker threads.  'minalloc'
+ * and 'maxalloc' bound the task structure cache (see task_alloc() and
+ * task_free()); with TASKQ_PREPOPULATE in 'flags' the cache is filled with
+ * minalloc structures up front.  'name' and 'pri' are not used by this
+ * implementation.  Returns the new queue.
+ */
+/*ARGSUSED*/
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP);
+ int t;
+
+ rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+ tq->tq_flags = flags | TASKQ_ACTIVE;
+ /* every thread starts out counted as active; see taskq_thread() */
+ tq->tq_active = nthreads;
+ tq->tq_nthreads = nthreads;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ /* empty circular list: the head points at itself */
+ tq->tq_task.task_next = &tq->tq_task;
+ tq->tq_task.task_prev = &tq->tq_task;
+ tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP);
+
+ if (flags & TASKQ_PREPOPULATE) {
+ /* allocate-then-free parks each new structure on tq_freelist */
+ mutex_enter(&tq->tq_lock);
+ while (minalloc-- > 0)
+ task_free(tq, task_alloc(tq, KM_SLEEP));
+ mutex_exit(&tq->tq_lock);
+ }
+
+ for (t = 0; t < nthreads; t++)
+ VERIFY(thr_create(0, 0, taskq_thread,
+ tq, THR_BOUND, &tq->tq_threadlist[t]) == 0);
+
+ return (tq);
+}
+
+/*
+ * Shut down and free a task queue: wait for outstanding work, clear
+ * TASKQ_ACTIVE so the workers leave their loop, wait for them to exit,
+ * release every cached task structure, join the threads and destroy the
+ * synchronization objects.
+ */
+void
+taskq_destroy(taskq_t *tq)
+{
+ int t;
+ /* saved now: tq_nthreads is counted down to 0 as the workers exit */
+ int nthreads = tq->tq_nthreads;
+
+ taskq_wait(tq);
+
+ mutex_enter(&tq->tq_lock);
+
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ cv_broadcast(&tq->tq_dispatch_cv);
+
+ while (tq->tq_nthreads != 0)
+ cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+
+ /* with minalloc at 0, task_free() frees instead of caching */
+ tq->tq_minalloc = 0;
+ while (tq->tq_nalloc != 0) {
+ ASSERT(tq->tq_freelist != NULL);
+ task_free(tq, task_alloc(tq, KM_SLEEP));
+ }
+
+ mutex_exit(&tq->tq_lock);
+
+ for (t = 0; t < nthreads; t++)
+ VERIFY(thr_join(tq->tq_threadlist[t], NULL, NULL) == 0);
+
+ kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t));
+
+ rw_destroy(&tq->tq_threadlock);
+ mutex_destroy(&tq->tq_lock);
+ cv_destroy(&tq->tq_dispatch_cv);
+ cv_destroy(&tq->tq_wait_cv);
+
+ kmem_free(tq, sizeof (taskq_t));
+}
+
+/*
+ * Return 1 if 't', converted to a thread_t, matches one of the first
+ * tq_nthreads entries of tq_threadlist, else 0.  Always returns 1 when
+ * taskq_now is set.  tq_lock is not taken here.
+ */
+int
+taskq_member(taskq_t *tq, void *t)
+{
+	thread_t wanted = (thread_t)(uintptr_t)t;
+	int idx = 0;
+
+	if (taskq_now)
+		return (1);
+
+	while (idx < tq->tq_nthreads) {
+		if (tq->tq_threadlist[idx] == wanted)
+			return (1);
+		idx++;
+	}
+
+	return (0);
+}
diff --git a/zfs/lib/libzpool/txg.c b/zfs/lib/libzpool/txg.c
new file mode 100644
index 000000000..f810a0dc6
--- /dev/null
+++ b/zfs/lib/libzpool/txg.c
@@ -0,0 +1,661 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)txg.c 1.4 08/03/20 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/callb.h>
+
+/*
+ * Pool-wide transaction groups.
+ */
+
+/* Thread bodies, started by txg_sync_start() and defined further down. */
+static void txg_sync_thread(dsl_pool_t *dp);
+static void txg_quiesce_thread(dsl_pool_t *dp);
+
+/* Both are multiplied by hz in txg_sync_thread() to get tick counts. */
+int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */
+int zfs_txg_synctime = 5; /* target seconds to sync a txg */
+
+int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
+
+uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
+uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
+/* zfs_write_limit_max passed through spa_get_asize(); set in txg_sync_thread() */
+uint64_t zfs_write_limit_inflated = 0;
+
+/*
+ * Prepare the txg subsystem.
+ * Zeroes the pool's tx_state_t, allocates max_ncpus tx_cpu_t slots with
+ * their lock and per-txg condition variables, initializes tx_suspend and
+ * tx_sync_lock, and makes 'txg' the open transaction group.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+ bzero(tx, sizeof (tx_state_t));
+
+ tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
+ NULL);
+ }
+ }
+
+ rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
+ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem.
+ * Destroys what txg_init() created, frees the per-cpu array and zeroes
+ * the tx_state_t.  The sync threads must already be stopped.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+
+ ASSERT(tx->tx_threads == 0);
+
+ rw_destroy(&tx->tx_suspend);
+ mutex_destroy(&tx->tx_sync_lock);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_destroy(&tx->tx_cpu[c].tc_lock);
+ for (i = 0; i < TXG_SIZE; i++)
+ cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ }
+
+ kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+ bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups.
+ * Creates the quiesce thread and the sync thread for this pool and sets
+ * tx_threads to 2.  No threads may be running already.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+
+ dprintf("pool %p\n", dp);
+
+ ASSERT(tx->tx_threads == 0);
+
+ tx->tx_threads = 2;
+
+ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Common prologue for the txg threads: initialize the thread's CPR state
+ * against tx_sync_lock and return with tx_sync_lock held.
+ */
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&tx->tx_sync_lock);
+}
+
+/*
+ * Common epilogue for the txg threads, entered with tx_sync_lock held.
+ * Clears the caller's thread pointer (*tpp), drops the thread count, wakes
+ * txg_sync_stop() through tx_exit_cv and terminates the thread.  The
+ * callers carry on with no return statement after calling this, so they
+ * depend on thread_exit() not returning.
+ */
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+ ASSERT(*tpp != NULL);
+ *tpp = NULL;
+ tx->tx_threads--;
+ cv_broadcast(&tx->tx_exit_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
+ thread_exit();
+}
+
+/*
+ * Sleep on 'cv' with tx_sync_lock, bracketed by the CPR safe-point macros.
+ * A non-zero 'time' is a relative tick count, turned into an absolute
+ * deadline by adding lbolt; zero means wait with no timeout.
+ */
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+{
+ CALLB_CPR_SAFE_BEGIN(cpr);
+
+ if (time)
+ (void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + time);
+ else
+ cv_wait(cv, &tx->tx_sync_lock);
+
+ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ * Waits for the currently open txg to be synced, then sets tx_exiting,
+ * wakes both txg threads and waits until each has gone through
+ * txg_thread_exit().  tx_exiting is cleared again before returning.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ dprintf("pool %p\n", dp);
+ /*
+ * Finish off any work in progress.
+ */
+ ASSERT(tx->tx_threads == 2);
+ txg_wait_synced(dp, 0);
+
+ /*
+ * Wake all sync threads and wait for them to die.
+ */
+ mutex_enter(&tx->tx_sync_lock);
+
+ ASSERT(tx->tx_threads == 2);
+
+ tx->tx_exiting = 1;
+
+ /* cover every condition variable the two threads may be sleeping on */
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ cv_broadcast(&tx->tx_sync_more_cv);
+
+ while (tx->tx_threads != 0)
+ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+ tx->tx_exiting = 0;
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Take a hold on the currently open txg and return its number.  The hold
+ * is counted in the tx_cpu_t slot selected by CPU_SEQID, and the slot and
+ * txg are recorded in 'th'.
+ *
+ * This returns with that slot's tc_lock still held; the caller releases
+ * it with txg_rele_to_quiesce() and later drops the hold itself with
+ * txg_rele_to_sync().
+ */
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+ uint64_t txg;
+
+ mutex_enter(&tc->tc_lock);
+
+ txg = tx->tx_open_txg;
+ tc->tc_count[txg & TXG_MASK]++;
+
+ th->th_cpu = tc;
+ th->th_txg = txg;
+
+ return (txg);
+}
+
+/*
+ * Release the tc_lock that txg_hold_open() left held.  The hold count on
+ * the txg is not touched, so txg_quiesce() can advance tx_open_txg but
+ * still waits for this hold to be dropped by txg_rele_to_sync().
+ */
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+
+ mutex_exit(&tc->tc_lock);
+}
+
+/*
+ * Drop the hold taken by txg_hold_open().  When the count for that txg on
+ * this slot reaches zero, waiters on the matching tc_cv (txg_quiesce())
+ * are woken.  th_cpu is cleared afterwards.
+ */
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ ASSERT(tc->tc_count[g] != 0);
+ if (--tc->tc_count[g] == 0)
+ cv_broadcast(&tc->tc_cv[g]);
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = NULL; /* defensive */
+}
+
+/*
+ * Close the open transaction group 'txg' and wait for it to drain.
+ * tx_open_txg is advanced while every tx_cpu lock is held, then each
+ * slot is waited on until its hold count for 'txg' drops to zero.
+ * Called by txg_quiesce_thread() with tx_sync_lock dropped.
+ */
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int g = txg & TXG_MASK;
+ int c;
+
+ /*
+ * Grab all tx_cpu locks so nobody else can get into this txg.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_enter(&tx->tx_cpu[c].tc_lock);
+
+ ASSERT(txg == tx->tx_open_txg);
+ tx->tx_open_txg++;
+
+ /*
+ * Now that we've incremented tx_open_txg, we can let threads
+ * enter the next transaction group.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_exit(&tx->tx_cpu[c].tc_lock);
+
+ /*
+ * Quiesce the transaction group by waiting for every holder to call
+ * txg_rele_to_sync(), which broadcasts tc_cv when a count hits zero.
+ */
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ mutex_enter(&tc->tc_lock);
+ while (tc->tc_count[g] != 0)
+ cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+ mutex_exit(&tc->tc_lock);
+ }
+}
+
+/*
+ * Body of the sync thread.  Each pass waits (for at most zfs_txg_timeout
+ * seconds, measured from when the previous sync began) for a reason to
+ * sync, takes the quiesced txg handed off by the quiesce thread, calls
+ * spa_sync() on it with tx_sync_lock dropped, retunes dp_write_limit from
+ * how long the sync took, and then publishes the txg as synced.  The
+ * thread leaves through txg_thread_exit() once tx_exiting is set.
+ */
+static void
+txg_sync_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+	uint64_t timeout, start, delta, timer;
+	int target;
+
+	txg_thread_enter(tx, &cpr);
+
+	start = delta = 0;
+	timeout = zfs_txg_timeout * hz;
+	for (;;) {
+		uint64_t txg, written;
+
+		/*
+		 * We sync when there's someone waiting on us, or the
+		 * quiesce thread has handed off a txg to us, or we have
+		 * reached our timeout.
+		 */
+		timer = (delta >= timeout ? 0 : timeout - delta);
+		while (!tx->tx_exiting && timer > 0 &&
+		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+		    tx->tx_quiesced_txg == 0) {
+			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
+			delta = lbolt - start;
+			timer = (delta > timeout ? 0 : timeout - delta);
+		}
+
+		/*
+		 * Wait until the quiesce thread hands off a txg to us,
+		 * prompting it to do so if necessary.
+		 */
+		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+
+		/*
+		 * Consume the quiesced txg which has been handed off to
+		 * us.  This may cause the quiescing thread to now be
+		 * able to quiesce another txg, so we must signal it.
+		 */
+		txg = tx->tx_quiesced_txg;
+		tx->tx_quiesced_txg = 0;
+		tx->tx_syncing_txg = txg;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		rw_exit(&tx->tx_suspend);
+
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+		mutex_exit(&tx->tx_sync_lock);
+		start = lbolt;
+		spa_sync(dp->dp_spa, txg);
+		delta = lbolt - start;
+
+		written = dp->dp_space_towrite[txg & TXG_MASK];
+		dp->dp_space_towrite[txg & TXG_MASK] = 0;
+		ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
+
+		/*
+		 * If the write limit max has not been explicitly set, set it
+		 * to a fraction of available physical memory (default 1/8th).
+		 * Note that we must inflate the limit because the spa
+		 * inflates write sizes to account for data replication.
+		 * Check this each sync phase to catch changing memory size.
+		 */
+		if (zfs_write_limit_inflated == 0 ||
+		    (zfs_write_limit_shift && zfs_write_limit_max !=
+		    physmem * PAGESIZE >> zfs_write_limit_shift)) {
+			zfs_write_limit_max =
+			    physmem * PAGESIZE >> zfs_write_limit_shift;
+			zfs_write_limit_inflated =
+			    spa_get_asize(dp->dp_spa, zfs_write_limit_max);
+			if (zfs_write_limit_min > zfs_write_limit_inflated)
+				zfs_write_limit_inflated = zfs_write_limit_min;
+		}
+
+		/*
+		 * Attempt to keep the sync time consistent by adjusting the
+		 * amount of write traffic allowed into each transaction group.
+		 */
+		target = zfs_txg_synctime * hz;
+		if (delta > target) {
+			uint64_t old = MIN(dp->dp_write_limit, written);
+
+			dp->dp_write_limit = MAX(zfs_write_limit_min,
+			    old * target / delta);
+		} else if (written >= dp->dp_write_limit &&
+		    delta >> 3 < target >> 3) {
+			/*
+			 * delta is 0 when spa_sync() finished within one
+			 * tick; clamp the divisor so the rescale cannot
+			 * divide by zero (the result is capped at 200
+			 * either way).
+			 */
+			uint64_t rescale =
+			    MIN((100 * target) / MAX(delta, 1), 200);
+
+			dp->dp_write_limit = MIN(zfs_write_limit_inflated,
+			    written * rescale / 100);
+		}
+
+		mutex_enter(&tx->tx_sync_lock);
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+		tx->tx_synced_txg = txg;
+		tx->tx_syncing_txg = 0;
+		rw_exit(&tx->tx_suspend);
+		cv_broadcast(&tx->tx_sync_done_cv);
+	}
+}
+
+/*
+ * Body of the quiesce thread.  Each pass waits until someone has asked
+ * for a txg beyond the open one and no quiesced txg is still waiting for
+ * the sync thread, quiesces the open txg with tx_sync_lock dropped, and
+ * hands it to the sync thread through tx_quiesced_txg.  The thread leaves
+ * through txg_thread_exit() once tx_exiting is set.
+ */
+static void
+txg_quiesce_thread(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We quiesce when there's someone waiting on us.
+ * However, we can only have one txg in "quiescing" or
+ * "quiesced, waiting to sync" state. So we wait until
+ * the "quiesced, waiting to sync" txg has been consumed
+ * by the sync thread.
+ */
+ while (!tx->tx_exiting &&
+ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+ tx->tx_quiesced_txg != 0))
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+ txg = tx->tx_open_txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ /* txg_quiesce() can block for a long time; don't hold the lock */
+ mutex_exit(&tx->tx_sync_lock);
+ txg_quiesce(dp, txg);
+ mutex_enter(&tx->tx_sync_lock);
+
+ /*
+ * Hand this txg off to the sync thread.
+ */
+ dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiesced_txg = txg;
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ }
+}
+
+/*
+ * Delay this thread by 'ticks' if we are still in the open transaction
+ * group and there is already a waiting txg quiescing or quiesced.  Abort
+ * the delay if this txg stalls or enters the quiescing state.
+ *
+ * The deadline is kept in a clock_t, the type cv_timedwait() takes for
+ * its absolute timeout, so that lbolt + ticks is not truncated to int.
+ */
+void
+txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	clock_t timeout = lbolt + ticks;
+
+	/*
+	 * don't delay if this txg could transition to quiescing immediately
+	 * (unlocked peek; the first and last tests are repeated under
+	 * tx_sync_lock below)
+	 */
+	if (tx->tx_open_txg > txg ||
+	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
+		return;
+
+	mutex_enter(&tx->tx_sync_lock);
+	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
+		mutex_exit(&tx->tx_sync_lock);
+		return;
+	}
+
+	while (lbolt < timeout &&
+	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
+		(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
+		    timeout);
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Block until transaction group 'txg' has been synced.  A txg of 0 means
+ * the transaction group that is open at the time of the call.  The sync
+ * thread is prodded through tx_sync_more_cv on every pass of the wait
+ * loop.  Both txg threads must be running.
+ */
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT(tx->tx_threads == 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg;
+ /* only ever raise the high-water mark */
+ if (tx->tx_sync_txg_waiting < txg)
+ tx->tx_sync_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_synced_txg < txg) {
+ dprintf("broadcasting sync more "
+ "tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Block until transaction group 'txg' is the open one (or has already
+ * been passed).  A txg of 0 means the one after the currently open txg.
+ * The quiesce thread is prodded through tx_quiesce_more_cv on every pass
+ * of the wait loop.  Both txg threads must be running.
+ */
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT(tx->tx_threads == 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + 1;
+ /* only ever raise the high-water mark */
+ if (tx->tx_quiesce_txg_waiting < txg)
+ tx->tx_quiesce_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_open_txg < txg) {
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Return 1 when a txg later than the open one has been requested
+ * (tx_quiesce_txg_waiting > tx_open_txg), 0 otherwise.  No lock is taken.
+ */
+int
+txg_stalled(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	if (tx->tx_quiesce_txg_waiting > tx->tx_open_txg)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Take tx_suspend as a reader.  txg_sync_thread() takes the same lock as
+ * a writer around its updates of tx_syncing_txg and tx_synced_txg.
+ * Released by txg_resume().
+ */
+void
+txg_suspend(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ /* XXX some code paths suspend when they are already suspended! */
+ rw_enter(&tx->tx_suspend, RW_READER);
+}
+
+/*
+ * Release the tx_suspend hold taken by txg_suspend().
+ */
+void
+txg_resume(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ rw_exit(&tx->tx_suspend);
+}
+
+/*
+ * Per-txg object lists.
+ */
+/*
+ * Initialize a per-txg list.  'offset' is the byte offset of the embedded
+ * txg_node_t within the objects that will be placed on the list.  All
+ * TXG_SIZE list heads start out empty.
+ */
+void
+txg_list_create(txg_list_t *tl, size_t offset)
+{
+	int slot = 0;
+
+	tl->tl_offset = offset;
+
+	while (slot < TXG_SIZE)
+		tl->tl_head[slot++] = NULL;
+
+	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Destroy a per-txg list.  Every one of the TXG_SIZE lists is expected to
+ * be empty already (checked by ASSERT only); the entries themselves are
+ * never freed here.
+ */
+void
+txg_list_destroy(txg_list_t *tl)
+{
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(txg_list_empty(tl, t));
+
+ mutex_destroy(&tl->tl_lock);
+}
+
+/*
+ * Return non-zero if the list for 'txg' (selected by txg & TXG_MASK) has
+ * no entries.  tl_lock is not taken.
+ */
+int
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+	int slot = txg & TXG_MASK;
+
+	return (tl->tl_head[slot] == NULL);
+}
+
+/*
+ * Add an entry to the list.
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ * 'p' is the containing object; its txg_node_t is found at tl_offset.
+ * New entries are pushed on the head of the list for 'txg'.
+ */
+int
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ int already_on_list;
+
+ mutex_enter(&tl->tl_lock);
+ already_on_list = tn->tn_member[t];
+ if (!already_on_list) {
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = tl->tl_head[t];
+ tl->tl_head[t] = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (already_on_list);
+}
+
+/*
+ * Remove the head of the list and return it.
+ * The value returned is the containing object (the node address minus
+ * tl_offset), or NULL if the list for 'txg' is empty.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+ void *p = NULL;
+
+ mutex_enter(&tl->tl_lock);
+ if ((tn = tl->tl_head[t]) != NULL) {
+ p = (char *)tn - tl->tl_offset;
+ tl->tl_head[t] = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it.
+ * Returns 'p' if it was found on the list for 'txg', NULL if it was not.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn, **tp;
+
+ mutex_enter(&tl->tl_lock);
+
+ /* tp trails the walk as the link to rewrite when the node is found */
+ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+ if ((char *)tn - tl->tl_offset == p) {
+ *tp = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ mutex_exit(&tl->tl_lock);
+ return (p);
+ }
+ }
+
+ mutex_exit(&tl->tl_lock);
+
+ return (NULL);
+}
+
+/*
+ * Return the membership flag for 'txg' stored in the txg_node_t that is
+ * embedded at tl_offset within 'p'.  tl_lock is not taken.
+ */
+int
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+	char *base = (char *)p;
+	txg_node_t *node = (txg_node_t *)(base + tl->tl_offset);
+	int slot = txg & TXG_MASK;
+
+	return (node->tn_member[slot]);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ * Returns the object at the head of the list for 'txg', or NULL if that
+ * list is empty.  tl_lock is not taken.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+	int slot = txg & TXG_MASK;
+	txg_node_t *first = tl->tl_head[slot];
+
+	if (first == NULL)
+		return (NULL);
+
+	return ((char *)first - tl->tl_offset);
+}
+
+/*
+ * Return the object that follows 'p' on the list for 'txg', or NULL if
+ * 'p' is the last one.  tl_lock is not taken.
+ */
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int slot = txg & TXG_MASK;
+	txg_node_t *cur = (txg_node_t *)((char *)p + tl->tl_offset);
+	txg_node_t *succ = cur->tn_next[slot];
+
+	if (succ == NULL)
+		return (NULL);
+
+	return ((char *)succ - tl->tl_offset);
+}
diff --git a/zfs/lib/libzpool/uberblock.c b/zfs/lib/libzpool/uberblock.c
new file mode 100644
index 000000000..3afb08a5b
--- /dev/null
+++ b/zfs/lib/libzpool/uberblock.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)uberblock.c 1.3 06/04/06 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+
+/*
+ * Validate an uberblock by its magic number.
+ *
+ * If the magic is stored byte-swapped, the whole uberblock is byte-swapped
+ * in place first.  Returns 0 when the (possibly swapped) magic equals
+ * UBERBLOCK_MAGIC and EINVAL otherwise.
+ */
+int
+uberblock_verify(uberblock_t *ub)
+{
+	uint64_t swapped_magic = BSWAP_64((uint64_t)UBERBLOCK_MAGIC);
+
+	if (ub->ub_magic == swapped_magic)
+		byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+	return (ub->ub_magic == UBERBLOCK_MAGIC ? 0 : EINVAL);
+}
+
+/*
+ * Stamp the uberblock for transaction group 'txg'.
+ *
+ * The magic, txg, guid sum (taken from the root vdev 'rvd') and timestamp
+ * are refreshed.  The return value is a boolean: nonzero if the root block
+ * pointer was born in this txg, i.e. something changed in this transaction
+ * group, and zero otherwise.
+ */
+int
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+	int changed;
+
+	ASSERT(ub->ub_txg < txg);
+
+	/*
+	 * ub_version is deliberately left alone so that pools on an older
+	 * version keep writing uberblocks with their previous version.
+	 */
+	ub->ub_magic = UBERBLOCK_MAGIC;
+	ub->ub_txg = txg;
+	ub->ub_guid_sum = rvd->vdev_guid_sum;
+	ub->ub_timestamp = gethrestime_sec();
+
+	changed = (ub->ub_rootbp.blk_birth == txg);
+
+	return (changed);
+}
diff --git a/zfs/lib/libzpool/unique.c b/zfs/lib/libzpool/unique.c
new file mode 100644
index 000000000..ea7ec8854
--- /dev/null
+++ b/zfs/lib/libzpool/unique.c
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)unique.c 1.3 07/08/02 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+/* AVL tree of every value currently registered, ordered by unique_compare(). */
+static avl_tree_t unique_avl;
+/* Held around every lookup, insert and removal on unique_avl. */
+static kmutex_t unique_mtx;
+
+/* One registered value; allocated in unique_insert(), freed in unique_remove(). */
+typedef struct unique {
+ avl_node_t un_link;
+ uint64_t un_value;
+} unique_t;
+
+/* Mask of the low UNIQUE_BITS bits; registered values never have other bits set. */
+#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+/*
+ * AVL comparator for unique_t nodes: orders by un_value and returns -1, 0
+ * or +1 as 'a' is less than, equal to, or greater than 'b'.
+ */
+static int
+unique_compare(const void *a, const void *b)
+{
+	uint64_t lhs = ((const unique_t *)a)->un_value;
+	uint64_t rhs = ((const unique_t *)b)->un_value;
+
+	if (lhs == rhs)
+		return (0);
+
+	return (lhs < rhs ? -1 : +1);
+}
+
+/*
+ * Set up the module state: create the (empty) AVL tree of registered values
+ * and initialize the mutex that protects it.
+ */
+void
+unique_init(void)
+{
+	size_t node_size = sizeof (unique_t);
+	size_t link_offset = offsetof(unique_t, un_link);
+
+	avl_create(&unique_avl, unique_compare, node_size, link_offset);
+	mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Tear down the module state created by unique_init(): destroy the AVL
+ * tree, then the mutex.  Nodes still in the tree are not freed here.
+ */
+void
+unique_fini(void)
+{
+	avl_destroy(&unique_avl);
+	mutex_destroy(&unique_mtx);
+}
+
+/*
+ * Pick a value that is not registered at the moment of the call.
+ *
+ * A fresh value is obtained by registering 0 (which makes unique_insert()
+ * choose a random one) and is then immediately unregistered again, so the
+ * returned value is NOT reserved for the caller.
+ */
+uint64_t
+unique_create(void)
+{
+	uint64_t candidate;
+
+	candidate = unique_insert(0);
+	unique_remove(candidate);
+
+	return (candidate);
+}
+
+/*
+ * Register a value in the set of unique values and return what was
+ * actually registered.
+ *
+ * 'value' is used as-is when it is nonzero, fits within UNIQUE_MASK and is
+ * not already registered.  Otherwise random replacements (masked with
+ * UNIQUE_MASK) are tried until one satisfies all three conditions.
+ *
+ * The mutex is dropped while random bytes are generated; the candidate is
+ * re-checked against the tree after the mutex is re-acquired, and the
+ * insertion point 'idx' always comes from the final, successful lookup.
+ */
+uint64_t
+unique_insert(uint64_t value)
+{
+	avl_index_t idx;
+	uint64_t registered;
+	unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+	un->un_value = value;
+
+	mutex_enter(&unique_mtx);
+	while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+	    avl_find(&unique_avl, un, &idx)) {
+		mutex_exit(&unique_mtx);
+		(void) random_get_pseudo_bytes((void*)&un->un_value,
+		    sizeof (un->un_value));
+		un->un_value &= UNIQUE_MASK;
+		mutex_enter(&unique_mtx);
+	}
+
+	avl_insert(&unique_avl, un, idx);
+	/*
+	 * Capture the value while the lock is still held.  Once the node is
+	 * in the tree and the lock is dropped, a concurrent unique_remove()
+	 * of the same value may free 'un', so it must not be touched again.
+	 */
+	registered = un->un_value;
+	mutex_exit(&unique_mtx);
+
+	return (registered);
+}
+
+/*
+ * Unregister 'value'.  If it is present in the tree its node is removed and
+ * freed; if it is not present this is a no-op.
+ */
+void
+unique_remove(uint64_t value)
+{
+	unique_t key;
+	unique_t *match;
+
+	/* Only un_value is consulted by the comparator. */
+	key.un_value = value;
+
+	mutex_enter(&unique_mtx);
+	if ((match = avl_find(&unique_avl, &key, NULL)) != NULL) {
+		avl_remove(&unique_avl, match);
+		kmem_free(match, sizeof (unique_t));
+	}
+	mutex_exit(&unique_mtx);
+}
diff --git a/zfs/lib/libzpool/util.c b/zfs/lib/libzpool/util.c
new file mode 100644
index 000000000..e7187d0d0
--- /dev/null
+++ b/zfs/lib/libzpool/util.c
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <assert.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/spa.h>
+#include <sys/fs/zfs.h>
+#include <sys/refcount.h>
+
+/*
+ * Routines needed by more than one client of libzpool.
+ */
+
+/*
+ * Format 'num' into 'buf' as a short human-readable string using binary
+ * (1024-based) unit suffixes K, M, G, T, P, E.
+ *
+ * Values below 1024 are printed as plain integers with no suffix.  Larger
+ * values are scaled; if the scaled value is below 10 it is printed with two
+ * decimals, below 100 with one decimal, otherwise as an integer.  Exact
+ * powers of two are always printed as integers.
+ *
+ * 'buf' is written with sprintf() and no length is passed in, so the caller
+ * must supply a buffer large enough for the result.
+ */
+void
+nicenum(uint64_t num, char *buf)
+{
+	static const char suffixes[] = " KMGTPE";
+	uint64_t scaled = num;
+	int unit = 0;
+	int is_pow2;
+	char suffix;
+
+	/* Divide by 1024, rounding to nearest, until it drops below 1024. */
+	for (; scaled >= 1024; unit++)
+		scaled = (scaled + (1024 / 2)) / 1024;
+
+	if (unit == 0) {
+		(void) sprintf(buf, "%llu", (u_longlong_t)scaled);
+		return;
+	}
+
+	suffix = suffixes[unit];
+	is_pow2 = ((num & (num - 1)) == 0);
+
+	if (!is_pow2 && scaled < 10) {
+		(void) sprintf(buf, "%.2f%c",
+		    (double)num / (1ULL << 10 * unit), suffix);
+	} else if (!is_pow2 && scaled < 100) {
+		(void) sprintf(buf, "%.1f%c",
+		    (double)num / (1ULL << 10 * unit), suffix);
+	} else {
+		(void) sprintf(buf, "%llu%c", (u_longlong_t)scaled, suffix);
+	}
+}
+
+/*
+ * Print one row of statistics for the vdev described by 'nv', then recurse
+ * into each of its children with two more columns of indentation.
+ *
+ *	desc	label printed in the description column
+ *	nv	vdev config nvlist; must contain ZPOOL_CONFIG_STATS
+ *	indent	current indentation; 0 also triggers the table header
+ *
+ * Operation and byte counters are shown as per-second rates, computed by
+ * dividing by vs_timestamp converted to seconds (at least 1).  The
+ * used/avail columns are left blank when the vdev reports no space.
+ */
+static void
+show_vdev_stats(const char *desc, nvlist_t *nv, int indent)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	vdev_stat_t *vs;
+	uint64_t sec;
+	uint64_t is_log = 0;
+	char used[6], avail[6];
+	char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
+	char *prefix = "";
+	int desc_width;
+
+	if (indent == 0) {
+		(void) printf(" "
+		    " capacity operations bandwidth ---- errors ----\n");
+		(void) printf("description "
+		    "used avail read write read write read write cksum\n");
+	}
+
+	VERIFY(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
+
+	if (is_log)
+		prefix = "log ";
+
+	sec = MAX(1, vs->vs_timestamp / NANOSEC);
+
+	nicenum(vs->vs_alloc, used);
+	nicenum(vs->vs_space - vs->vs_alloc, avail);
+	nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops);
+	nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops);
+	nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes);
+	nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes);
+	nicenum(vs->vs_read_errors, rerr);
+	nicenum(vs->vs_write_errors, werr);
+	nicenum(vs->vs_checksum_errors, cerr);
+
+	/*
+	 * A '*' field width must be passed as an int, but strlen() returns
+	 * size_t, so do the arithmetic in int.  The result is normally
+	 * negative, which printf treats as a left-justified field of that
+	 * magnitude.
+	 */
+	desc_width = indent + (int)strlen(prefix) - 19 -
+	    (vs->vs_space ? 0 : 12);
+
+	(void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n",
+	    indent, "",
+	    prefix,
+	    desc_width, desc,
+	    vs->vs_space ? 6 : 0, vs->vs_space ? used : "",
+	    vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
+	    rops, wops, rbytes, wbytes, rerr, werr, cerr);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	for (c = 0; c < children; c++) {
+		nvlist_t *cnv = child[c];
+		char *cname, *tname;
+		uint64_t np;
+
+		/* Label the child by its path, else its type, else a stub. */
+		if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) &&
+		    nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname))
+			cname = "<unknown>";
+
+		/* One spare byte for an optional parity digit, one for NUL. */
+		tname = calloc(1, strlen(cname) + 2);
+		VERIFY(tname != NULL);
+		(void) strcpy(tname, cname);
+		if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0)
+			tname[strlen(tname)] = '0' + np;
+		show_vdev_stats(tname, cnv, indent + 2);
+		free(tname);
+	}
+}
+
+/*
+ * Print a statistics table for every vdev in the pool.
+ *
+ * A config nvlist is generated for 'spa' while the config lock is held as
+ * reader; the vdev tree and pool name are looked up in it and handed to
+ * show_vdev_stats().  The generated nvlist is freed before returning.
+ */
+void
+show_pool_stats(spa_t *spa)
+{
+	nvlist_t *config, *nvroot;
+	char *name;
+
+	spa_config_enter(spa, RW_READER, FTAG);
+	config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+	spa_config_exit(spa, FTAG);
+
+	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &name) == 0);
+
+	show_vdev_stats(name, nvroot, 0);
+
+	/*
+	 * 'nvroot' and 'name' point into 'config', so it can only be
+	 * released once printing is done.
+	 */
+	nvlist_free(config);
+}
diff --git a/zfs/lib/libzpool/vdev.c b/zfs/lib/libzpool/vdev.c
new file mode 100644
index 000000000..ec92c2201
--- /dev/null
+++ b/zfs/lib/libzpool/vdev.c
@@ -0,0 +1,2207 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev.c 1.33 07/11/27 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device management.
+ */
+
+/*
+ * Every known vdev ops vector.  The array is NULL-terminated and is searched
+ * by type name in vdev_getops().
+ */
+static vdev_ops_t *vdev_ops_table[] = {
+ &vdev_root_ops,
+ &vdev_raidz_ops,
+ &vdev_mirror_ops,
+ &vdev_replacing_ops,
+ &vdev_spare_ops,
+ &vdev_disk_ops,
+ &vdev_file_ops,
+ &vdev_missing_ops,
+ NULL
+};
+
+/*
+ * maximum scrub/resilver I/O queue
+ *
+ * This amount is added to spa_scrub_maxinflight for every leaf vdev attached
+ * in vdev_add_child() and subtracted again in vdev_remove_child().
+ */
+int zfs_scrub_limit = 70;
+
+/*
+ * Look up the ops vector whose vdev_op_type string equals 'type'.
+ * Returns NULL when no entry of vdev_ops_table matches.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+	vdev_ops_t **slot = vdev_ops_table;
+
+	/* Stop at the first match or at the NULL terminator. */
+	while (*slot != NULL && strcmp((*slot)->vdev_op_type, type) != 0)
+		slot++;
+
+	return (*slot);
+}
+
+/*
+ * Default asize function, used by anything other than RAID-Z.
+ *
+ * The result is the largest of: 'psize' rounded up to the top-level vdev's
+ * alignment (1 << ashift), and the asize each child reports for the same
+ * psize.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+	uint64_t i;
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		uint64_t child_asize =
+		    vdev_psize_to_asize(vd->vdev_child[i], psize);
+
+		if (child_asize > largest)
+			largest = child_asize;
+	}
+
+	return (largest);
+}
+
+/*
+ * Get the replaceable or attachable device size.
+ * If the parent is a mirror or raidz, the replaceable size is the minimum
+ * psize of all its children. For the rest, just return our own psize.
+ *
+ * e.g.
+ *			psize	rsize
+ * root			-	-
+ *	mirror/raidz	-	-
+ *	    disk1	20g	20g
+ *	    disk2	40g	20g
+ *	disk3		80g	80g
+ */
+uint64_t
+vdev_get_rsize(vdev_t *vd)
+{
+	vdev_t *parent = vd->vdev_parent;
+	uint64_t smallest_less_one;
+	uint64_t i;
+
+	/*
+	 * A vdev with no parent, or directly below the root, is only
+	 * limited by its own size.
+	 */
+	if (parent == NULL || parent->vdev_parent == NULL)
+		return (vd->vdev_psize);
+
+	/*
+	 * Track (minimum - 1) in unsigned arithmetic.  Starting from the
+	 * largest uint64_t makes the first sibling always win, and a psize
+	 * of 0 wraps to the largest value so it never lowers the minimum.
+	 */
+	smallest_less_one = -1ULL;
+
+	for (i = 0; i < parent->vdev_children; i++) {
+		uint64_t candidate = parent->vdev_child[i]->vdev_psize - 1;
+
+		if (candidate < smallest_less_one)
+			smallest_less_one = candidate;
+	}
+
+	return (smallest_less_one + 1);
+}
+
+/*
+ * Return the top-level vdev with index 'vdev', i.e. that child of the
+ * pool's root vdev, or NULL if the index is out of range.  The caller must
+ * hold the config lock as reader or be the scrub thread.
+ */
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+	vdev_t *root = spa->spa_root_vdev;
+
+	ASSERT(spa_config_held(spa, RW_READER) ||
+	    curthread == spa->spa_scrub_thread);
+
+	return (vdev < root->vdev_children ? root->vdev_child[vdev] : NULL);
+}
+
+/*
+ * Depth-first search of the subtree rooted at 'vd' for a vdev whose guid is
+ * 'guid'.  Returns the first match ('vd' itself is checked first), or NULL
+ * if the subtree contains no such vdev.
+ */
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+	int c;
+
+	if (vd->vdev_guid == guid)
+		return (vd);
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		vdev_t *hit = vdev_lookup_by_guid(vd->vdev_child[c], guid);
+
+		if (hit != NULL)
+			return (hit);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Attach 'cvd' to 'pvd' at child index cvd->vdev_id.
+ *
+ * 'cvd' must currently have no parent and the config lock must be held as
+ * writer.  With a NULL 'pvd' only cvd->vdev_parent is (re)set to NULL.
+ * Otherwise the parent's child array is reallocated (growing it if the
+ * index lies beyond the current end), cvd's top-level vdev is derived from
+ * the parent, cvd's guid sum is added to every ancestor, and for a leaf
+ * vdev the pool's scrub in-flight limit is raised by zfs_scrub_limit.
+ */
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+	uint64_t slot = cvd->vdev_id;
+	size_t old_bytes, new_bytes;
+	vdev_t **grown;
+	vdev_t *ancestor;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+	ASSERT(cvd->vdev_parent == NULL);
+
+	cvd->vdev_parent = pvd;
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(slot >= pvd->vdev_children || pvd->vdev_child[slot] == NULL);
+
+	old_bytes = pvd->vdev_children * sizeof (vdev_t *);
+	if (slot + 1 > pvd->vdev_children)
+		pvd->vdev_children = slot + 1;
+	new_bytes = pvd->vdev_children * sizeof (vdev_t *);
+
+	/* Always move to a fresh zeroed array, even if the size is equal. */
+	grown = kmem_zalloc(new_bytes, KM_SLEEP);
+	if (pvd->vdev_child != NULL) {
+		bcopy(pvd->vdev_child, grown, old_bytes);
+		kmem_free(pvd->vdev_child, old_bytes);
+	}
+	grown[slot] = cvd;
+	pvd->vdev_child = grown;
+
+	/* A parent without a top (the root) makes the child a top-level. */
+	if (pvd->vdev_top)
+		cvd->vdev_top = pvd->vdev_top;
+	else
+		cvd->vdev_top = cvd;
+	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (ancestor = pvd; ancestor != NULL; ancestor = ancestor->vdev_parent)
+		ancestor->vdev_guid_sum += cvd->vdev_guid_sum;
+
+	if (cvd->vdev_ops->vdev_op_leaf)
+		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
+}
+
+/*
+ * Detach 'cvd' from its parent 'pvd'; a NULL 'pvd' is a no-op.
+ *
+ * The child's slot is set to NULL (leaving a hole) and its parent pointer
+ * is cleared.  If that leaves the parent with no children at all, the child
+ * array is freed.  cvd's guid sum is subtracted from every ancestor, and
+ * for a leaf vdev the pool's scrub in-flight limit is lowered by
+ * zfs_scrub_limit.
+ */
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+	uint_t id = cvd->vdev_id;
+	vdev_t *ancestor;
+	int c;
+
+	ASSERT(cvd->vdev_parent == pvd);
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(id < pvd->vdev_children);
+	ASSERT(pvd->vdev_child[id] == cvd);
+
+	pvd->vdev_child[id] = NULL;
+	cvd->vdev_parent = NULL;
+
+	/* Find the first occupied slot, if any. */
+	c = 0;
+	while (c < pvd->vdev_children && !pvd->vdev_child[c])
+		c++;
+
+	/* Every slot is empty: drop the array itself. */
+	if (c == pvd->vdev_children) {
+		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+		pvd->vdev_child = NULL;
+		pvd->vdev_children = 0;
+	}
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (ancestor = pvd; ancestor != NULL; ancestor = ancestor->vdev_parent)
+		ancestor->vdev_guid_sum -= cvd->vdev_guid_sum;
+
+	if (cvd->vdev_ops->vdev_op_leaf)
+		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
+}
+
+/*
+ * Remove any holes in the child array.
+ *
+ * The surviving children are copied, in order, into a new array sized to
+ * fit exactly, and each one's vdev_id is renumbered to its new index.  The
+ * old array is freed.  The config lock must be held as writer.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+	int old_count = pvd->vdev_children;
+	int live = 0;
+	int src, dst;
+	vdev_t **packed;
+
+	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
+
+	/* First pass: count the occupied slots. */
+	for (src = 0; src < old_count; src++) {
+		if (pvd->vdev_child[src])
+			live++;
+	}
+
+	packed = kmem_alloc(live * sizeof (vdev_t *), KM_SLEEP);
+
+	/* Second pass: copy them down and renumber. */
+	dst = 0;
+	for (src = 0; src < old_count; src++) {
+		vdev_t *cvd = pvd->vdev_child[src];
+
+		if (cvd == NULL)
+			continue;
+
+		packed[dst] = cvd;
+		cvd->vdev_id = dst;
+		dst++;
+	}
+
+	kmem_free(pvd->vdev_child, old_count * sizeof (vdev_t *));
+	pvd->vdev_child = packed;
+	pvd->vdev_children = dst;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ *
+ *	spa	pool the vdev belongs to
+ *	id	child index to record in vdev_id
+ *	guid	guid to use, or 0 to have a random unused one generated
+ *	ops	ops vector for the vdev type
+ *
+ * If the pool has no root vdev yet, the new vdev becomes the root (and
+ * 'ops' must be the root ops).  The vdev starts out CLOSED, with its locks,
+ * DTL space maps, per-txg lists, I/O queue and cache initialized.  It is
+ * not linked to any parent.
+ */
+static vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+	vdev_t *vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+
+	if (spa->spa_root_vdev == NULL) {
+		ASSERT(ops == &vdev_root_ops);
+		spa->spa_root_vdev = vd;
+	}
+
+	if (guid == 0) {
+		if (spa->spa_root_vdev == vd) {
+			/*
+			 * The root vdev's guid will also be the pool guid,
+			 * which must be unique among all pools.
+			 */
+			do {
+				guid = spa_get_random(-1ULL);
+			} while (guid == 0 || spa_guid_exists(guid, 0));
+		} else {
+			/*
+			 * Any other vdev's guid must be unique within the
+			 * pool.
+			 */
+			do {
+				guid = spa_get_random(-1ULL);
+			} while (guid == 0 ||
+			    spa_guid_exists(spa_guid(spa), guid));
+		}
+		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+	}
+
+	vd->vdev_spa = spa;
+	vd->vdev_id = id;
+	vd->vdev_guid = guid;
+	/* With no children yet, the guid sum is just our own guid. */
+	vd->vdev_guid_sum = guid;
+	vd->vdev_ops = ops;
+	vd->vdev_state = VDEV_STATE_CLOSED;
+
+	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Both DTL space maps share vdev_dtl_lock. */
+	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+
+	txg_list_create(&vd->vdev_ms_list,
+	    offsetof(struct metaslab, ms_txg_node));
+	txg_list_create(&vd->vdev_dtl_list,
+	    offsetof(struct vdev, vdev_dtl_node));
+
+	vd->vdev_stat.vs_timestamp = gethrtime();
+	vdev_queue_init(vd);
+	vdev_cache_init(vd);
+
+	return (vd);
+}
+
+/*
+ * Allocate a new vdev. The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ *
+ * The vdev is described by the nvlist 'nv', is given child index 'id' and is
+ * attached to 'parent' (which may be NULL). On success the new vdev is
+ * stored in *vdp and 0 is returned. EINVAL is returned for a missing or
+ * unknown type, a missing or mismatched id/guid where one is required, a
+ * non-root vdev allocated before the root, or an invalid or missing RAID-Z
+ * parity. ENOTSUP is returned when the pool version is too old for a log
+ * device or for double parity. Nothing is allocated on any error path.
+ */
+int
+vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
+ int alloctype)
+{
+ vdev_ops_t *ops;
+ char *type;
+ uint64_t guid = 0, islog, nparity;
+ vdev_t *vd;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (EINVAL);
+
+ if ((ops = vdev_getops(type)) == NULL)
+ return (EINVAL);
+
+ /*
+ * If this is a load, get the vdev guid from the nvlist.
+ * Otherwise, vdev_alloc_common() will generate one for us.
+ * Spares and l2cache devices must also supply their guid.
+ */
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ uint64_t label_id;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+ label_id != id)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_SPARE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_L2CACHE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ }
+
+ /*
+ * The first allocated vdev must be of type 'root'.
+ */
+ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
+ return (EINVAL);
+
+ /*
+ * Determine whether we're a log vdev.
+ */
+ islog = 0;
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
+ if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
+ return (ENOTSUP);
+
+ /*
+ * Set the nparity property for RAID-Z vdevs.
+ */
+ nparity = -1ULL;
+ if (ops == &vdev_raidz_ops) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &nparity) == 0) {
+ /*
+ * Currently, we can only support 2 parity devices.
+ */
+ if (nparity == 0 || nparity > 2)
+ return (EINVAL);
+ /*
+ * Older versions can only support 1 parity device.
+ */
+ if (nparity == 2 &&
+ spa_version(spa) < SPA_VERSION_RAID6)
+ return (ENOTSUP);
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= SPA_VERSION_RAID6)
+ return (EINVAL);
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ nparity = 1;
+ }
+ } else {
+ nparity = 0;
+ }
+ ASSERT(nparity != -1ULL);
+
+ /* All validation is done; from here on nothing fails. */
+ vd = vdev_alloc_common(spa, id, guid, ops);
+
+ vd->vdev_islog = islog;
+ vd->vdev_nparity = nparity;
+
+ /*
+ * The looked-up strings point into the nvlist, so each one that is
+ * present is replaced by a private copy (released in vdev_free()).
+ */
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+ vd->vdev_path = spa_strdup(vd->vdev_path);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+ vd->vdev_devid = spa_strdup(vd->vdev_devid);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ &vd->vdev_physpath) == 0)
+ vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+
+ /*
+ * Set the whole_disk property. If it's not specified, leave the value
+ * as -1.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &vd->vdev_wholedisk) != 0)
+ vd->vdev_wholedisk = -1ULL;
+
+ /*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
+ * Get the alignment requirement.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+ /*
+ * If we're a top-level vdev, try to load the allocation parameters.
+ * (Top-level means our parent exists and has no parent of its own.)
+ */
+ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ &vd->vdev_ms_array);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ &vd->vdev_ms_shift);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ &vd->vdev_asize);
+ }
+
+ /*
+ * If we're a leaf vdev, try to load the DTL object and other state.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+ &vd->vdev_dtl.smo_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ &vd->vdev_offline);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+ &vd->vdev_unspare);
+ /*
+ * When importing a pool, we want to ignore the persistent fault
+ * state, as the diagnosis made on another system may not be
+ * valid in the current context.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
+ &vd->vdev_faulted);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+ &vd->vdev_degraded);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
+ &vd->vdev_removed);
+ }
+ }
+
+ /*
+ * Add ourselves to the parent's list of children.
+ */
+ vdev_add_child(parent, vd);
+
+ *vdp = vd;
+
+ return (0);
+}
+
+/*
+ * Close and destroy a vdev together with its entire subtree.
+ *
+ * The vdev is closed, its children are freed recursively, its metaslabs are
+ * torn down if it is a top-level vdev, it is unlinked from its parent, and
+ * finally all per-vdev resources (queue, cache, path strings, spare/l2cache
+ * registration, txg lists, DTL space maps, locks) are released before the
+ * structure itself is freed. If it was the pool's root vdev, the pool's
+ * root pointer is cleared.
+ */
+void
+vdev_free(vdev_t *vd)
+{
+ int c;
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * vdev_free() implies closing the vdev first. This is simpler than
+ * trying to ensure complicated semantics for all callers.
+ */
+ vdev_close(vd);
+
+
+ ASSERT(!list_link_active(&vd->vdev_dirty_node));
+
+ /*
+ * Free all children. Each child unlinks itself from us via
+ * vdev_remove_child(), which frees our child array and zeroes
+ * vdev_children once the last slot is emptied.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_free(vd->vdev_child[c]);
+
+ ASSERT(vd->vdev_child == NULL);
+ ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd == vd->vdev_top)
+ vdev_metaslab_fini(vd);
+
+ ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
+ ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
+ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+
+ /*
+ * Remove this vdev from its parent's child list.
+ */
+ vdev_remove_child(vd->vdev_parent, vd);
+
+ ASSERT(vd->vdev_parent == NULL);
+
+ /*
+ * Clean up vdev structure.
+ */
+ vdev_queue_fini(vd);
+ vdev_cache_fini(vd);
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+ if (vd->vdev_isl2cache)
+ spa_l2cache_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+ /* Both DTL space maps were created with vdev_dtl_lock as their lock. */
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_unload(&vd->vdev_dtl_map);
+ space_map_destroy(&vd->vdev_dtl_map);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ space_map_destroy(&vd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ *
+ * Everything that belongs to a top-level vdev is moved across and cleared
+ * on the source: the metaslab array/shift/count, the metaslab group and
+ * metaslab pointers, the space statistics, the per-txg dirty lists, the
+ * config-dirty state, the deflate ratio and the log flag. tvd must already
+ * be its own top-level vdev.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+ spa_t *spa = svd->vdev_spa;
+ metaslab_t *msp;
+ vdev_t *vd;
+ int t;
+
+ ASSERT(tvd == tvd->vdev_top);
+
+ tvd->vdev_ms_array = svd->vdev_ms_array;
+ tvd->vdev_ms_shift = svd->vdev_ms_shift;
+ tvd->vdev_ms_count = svd->vdev_ms_count;
+
+ svd->vdev_ms_array = 0;
+ svd->vdev_ms_shift = 0;
+ svd->vdev_ms_count = 0;
+
+ tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_ms = svd->vdev_ms;
+
+ svd->vdev_mg = NULL;
+ svd->vdev_ms = NULL;
+
+ /* Point the metaslab group back at its new owner. */
+ if (tvd->vdev_mg != NULL)
+ tvd->vdev_mg->mg_vd = tvd;
+
+ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
+
+ svd->vdev_stat.vs_alloc = 0;
+ svd->vdev_stat.vs_space = 0;
+ svd->vdev_stat.vs_dspace = 0;
+
+ /*
+ * For every txg slot, move the dirty metaslabs and dirty DTL vdevs to
+ * tvd's lists, and if svd itself was on the pool's per-txg vdev list,
+ * put tvd there in its place.
+ */
+ for (t = 0; t < TXG_SIZE; t++) {
+ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+ }
+
+ /* Carry over a pending config-dirty marking. */
+ if (list_link_active(&svd->vdev_dirty_node)) {
+ vdev_config_clean(svd);
+ vdev_config_dirty(tvd);
+ }
+
+ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ svd->vdev_deflate_ratio = 0;
+
+ tvd->vdev_islog = svd->vdev_islog;
+ svd->vdev_islog = 0;
+}
+
+/*
+ * Set vdev_top to 'tvd' on 'vd' and, recursively, on every vdev below it.
+ * A NULL 'vd' (such as a hole in a child array) is ignored.
+ */
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+	int i;
+
+	if (vd != NULL) {
+		vd->vdev_top = tvd;
+
+		for (i = 0; i < vd->vdev_children; i++)
+			vdev_top_update(tvd, vd->vdev_child[i]);
+	}
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.
+ *
+ * A new interior vdev using 'ops' is allocated with a freshly generated
+ * guid and inherits cvd's id, asize, ashift and state. It takes cvd's place
+ * under cvd's old parent, and cvd is re-attached beneath it. If the new
+ * vdev ends up being a top-level vdev, the top-level state is moved from
+ * cvd to it. Returns the new vdev. The config lock must be held as writer.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+ spa_t *spa = cvd->vdev_spa;
+ vdev_t *pvd = cvd->vdev_parent;
+ vdev_t *mvd;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+ mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_state = cvd->vdev_state;
+
+ /* Swap mvd into cvd's slot, then hang cvd below mvd. */
+ vdev_remove_child(pvd, cvd);
+ vdev_add_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_children;
+ vdev_add_child(mvd, cvd);
+ /* Re-propagate the top-level pointer through the whole subtree. */
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (mvd == mvd->vdev_top)
+ vdev_top_transfer(cvd, mvd);
+
+ return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ *
+ * cvd must be the only child of its parent, which must be a mirror,
+ * replacing or spare vdev. cvd takes over the parent's id, ashift and slot
+ * in the grandparent; if that makes cvd a top-level vdev it also takes over
+ * the parent's guid and top-level state. The now childless parent is freed.
+ * The config lock must be held as writer.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+ vdev_t *mvd = cvd->vdev_parent;
+ vdev_t *pvd = mvd->vdev_parent;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+
+ ASSERT(mvd->vdev_children == 1);
+ ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+ mvd->vdev_ops == &vdev_replacing_ops ||
+ mvd->vdev_ops == &vdev_spare_ops);
+ cvd->vdev_ashift = mvd->vdev_ashift;
+
+ /* Splice cvd into mvd's slot under pvd. */
+ vdev_remove_child(mvd, cvd);
+ vdev_remove_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_id;
+ vdev_add_child(pvd, cvd);
+ /*
+ * If we created a new toplevel vdev, then we need to change the child's
+ * vdev GUID to match the old toplevel vdev. Otherwise, we could have
+ * detached an offline device, and when we go to import the pool we'll
+ * think we have two toplevel vdevs, instead of a different version of
+ * the same toplevel vdev.
+ */
+ if (cvd->vdev_top == cvd) {
+ /* Keep both guid sums consistent across the guid change. */
+ pvd->vdev_guid_sum -= cvd->vdev_guid;
+ cvd->vdev_guid_sum -= cvd->vdev_guid;
+ cvd->vdev_guid = mvd->vdev_guid;
+ cvd->vdev_guid_sum += mvd->vdev_guid;
+ pvd->vdev_guid_sum += cvd->vdev_guid;
+ }
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (cvd == cvd->vdev_top)
+ vdev_top_transfer(mvd, cvd);
+
+ ASSERT(mvd->vdev_children == 0);
+ vdev_free(mvd);
+}
+
+/*
+ * Create, or grow, the array of metaslabs for a vdev.
+ *
+ * The target count is vdev_asize >> vdev_ms_shift; existing metaslabs are
+ * kept and only the new tail [oldc, newc) is initialized. When 'txg' is 0
+ * each new metaslab's space map object is looked up in the vdev's metaslab
+ * array object in the MOS and, if one exists, its header is read from that
+ * object's bonus buffer. Otherwise the metaslab starts from an empty
+ * space map object.
+ *
+ * Returns 0 on success (including when vdev_ms_shift is 0, i.e. nothing to
+ * do), or the error from dmu_read()/dmu_bonus_hold(). On such an error the
+ * array has already been resized and the remaining new slots stay NULL.
+ */
+int
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_class_t *mc;
+ uint64_t m;
+ uint64_t oldc = vd->vdev_ms_count;
+ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+ metaslab_t **mspp;
+ int error;
+
+ if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ return (0);
+
+ dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
+
+ ASSERT(oldc <= newc);
+
+ /* Log vdevs allocate from the log class, all others from normal. */
+ if (vd->vdev_islog)
+ mc = spa->spa_log_class;
+ else
+ mc = spa->spa_normal_class;
+
+ if (vd->vdev_mg == NULL)
+ vd->vdev_mg = metaslab_group_create(mc, vd);
+
+ mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+ if (oldc != 0) {
+ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+ }
+
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = newc;
+
+ for (m = oldc; m < newc; m++) {
+ space_map_obj_t smo = { 0, 0, 0 };
+ if (txg == 0) {
+ uint64_t object = 0;
+ /* Entry m of the metaslab array holds the object number. */
+ error = dmu_read(mos, vd->vdev_ms_array,
+ m * sizeof (uint64_t), sizeof (uint64_t), &object);
+ if (error)
+ return (error);
+ if (object != 0) {
+ dmu_buf_t *db;
+ error = dmu_bonus_hold(mos, object, FTAG, &db);
+ if (error)
+ return (error);
+ ASSERT3U(db->db_size, >=, sizeof (smo));
+ bcopy(db->db_data, &smo, sizeof (smo));
+ ASSERT3U(smo.smo_object, ==, object);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
+ m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+ }
+
+ return (0);
+}
+
+/*
+ * Destroy every metaslab of a vdev and free the metaslab pointer array.
+ * Empty (NULL) slots are skipped.  Does nothing if the vdev has no array.
+ * vdev_ms_count is left unchanged.
+ */
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+	uint64_t nslabs = vd->vdev_ms_count;
+	uint64_t i;
+
+	if (vd->vdev_ms == NULL)
+		return;
+
+	for (i = 0; i < nslabs; i++) {
+		if (vd->vdev_ms[i] != NULL)
+			metaslab_fini(vd->vdev_ms[i]);
+	}
+
+	kmem_free(vd->vdev_ms, nslabs * sizeof (metaslab_t *));
+	vd->vdev_ms = NULL;
+}
+
+/*
+ * Run the type-specific probe on a vdev.
+ *
+ * Returns EINVAL for a NULL vdev, the result of the ops vector's probe
+ * routine for a leaf vdev, and 0 for any non-leaf vdev, because right now
+ * only leaf vdevs support status checks.
+ */
+int
+vdev_probe(vdev_t *vd)
+{
+	if (vd == NULL)
+		return (EINVAL);
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (0);
+
+	return (vd->vdev_ops->vdev_op_probe(vd));
+}
+
+/*
+ * Prepare a virtual device for access.
+ *
+ * The vdev must be CLOSED, CANT_OPEN or OFFLINE on entry. Persistently
+ * faulted or offlined vdevs are not opened (ENXIO). Otherwise the
+ * type-specific open routine is called, the resulting size and alignment
+ * are validated against what was previously recorded, and the device is
+ * probed. Returns 0 on success; on failure the vdev state is set
+ * accordingly and an errno value is returned (ENXIO, EOVERFLOW for a device
+ * that is too small, EINVAL for a grown alignment or shrunken device, or
+ * whatever the open/probe routine reported).
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ int error;
+ int c;
+ uint64_t osize = 0;
+ uint64_t asize, psize;
+ uint64_t ashift = 0;
+
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
+ vd->vdev_fault_arg >>= 1;
+ else
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+
+ if (!vd->vdev_removed && vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+ return (ENXIO);
+ } else if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
+ return (ENXIO);
+ }
+
+ /* The type-specific open reports the size and alignment it found. */
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, ENXIO);
+
+ if (error) {
+ if (vd->vdev_removed &&
+ vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
+ vd->vdev_removed = B_FALSE;
+
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ return (error);
+ }
+
+ vd->vdev_removed = B_FALSE;
+
+ if (vd->vdev_degraded) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ } else {
+ vd->vdev_state = VDEV_STATE_HEALTHY;
+ }
+
+ /* Any child that is not healthy makes this vdev degraded. */
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+
+ /*
+ * For a vdev without children the label areas are subtracted from the
+ * allocatable size. For one with children, osize is used as the
+ * allocatable size as-is and no physical size is recorded.
+ */
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (EOVERFLOW);
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ } else {
+ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (EOVERFLOW);
+ }
+ psize = 0;
+ asize = osize;
+ }
+
+ vd->vdev_psize = psize;
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ * For testing purposes, a higher ashift can be requested.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
+ } else {
+ /*
+ * Make sure the alignment requirement hasn't increased.
+ */
+ if (ashift > vd->vdev_top->vdev_ashift) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure the device hasn't shrunk.
+ */
+ if (asize < vd->vdev_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ asize > vd->vdev_asize) {
+ vd->vdev_asize = asize;
+ }
+ }
+
+ /*
+ * Ensure we can issue some IO before declaring the
+ * vdev open for business.
+ */
+ error = vdev_probe(vd);
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (error);
+ }
+
+ /*
+ * If this is a top-level vdev, compute the raidz-deflation
+ * ratio. Note, we hard-code in 128k (1<<17) because it is the
+ * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
+ * changes, this algorithm must never change, or we will
+ * inconsistently account for existing bp's.
+ */
+ if (vd->vdev_top == vd) {
+ vd->vdev_deflate_ratio = (1<<17) /
+ (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
+ }
+
+ /*
+ * This allows the ZFS DE to close cases appropriately. If a device
+ * goes away and later returns, we want to close the associated case.
+ * But it's not enough to simply post this only when a device goes from
+ * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
+ * back, we also need to close the case (otherwise we will try to replay
+ * it). So we have to post this notifier every time. Since this only
+ * occurs during pool open or error recovery, this should not be an
+ * issue.
+ */
+ zfs_post_ok(vd->vdev_spa, vd);
+
+ return (0);
+}
+
+/*
+ * Called once the vdevs are all opened, this routine validates the label
+ * contents. This needs to be done before vdev_load() so that we don't
+ * inadvertently do repair I/Os to the wrong device.
+ *
+ * This function will only return failure if one of the vdevs indicates that it
+ * has since been destroyed or exported. This is only possible if
+ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
+ * will be updated but the function will return 0.
+ *
+ * Children are validated first; if any child fails, EBADF is returned
+ * immediately. Label checks are only performed on leaf vdevs that are
+ * not already dead.
+ */
+int
+vdev_validate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+ nvlist_t *label;
+ uint64_t guid;
+ uint64_t state;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vdev_validate(vd->vdev_child[c]) != 0)
+ return (EBADF);
+
+ /*
+ * If the device has already failed, or was marked offline, don't do
+ * any further validation. Otherwise, label I/O will fail and we will
+ * overwrite the previous state.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {
+
+ /*
+ * An unreadable label marks the vdev CANT_OPEN/BAD_LABEL but is
+ * not reported to the caller as a failure.
+ */
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (0);
+ }
+
+ /* The label must carry this pool's guid ... */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &guid) != 0 || guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ /* ... and this vdev's own guid. */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &guid) != 0 || guid != vd->vdev_guid) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ /* A label without a pool state entry is treated as corrupt. */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * This is the only leaf-level failure return: during a plain
+ * open the label must say the pool is ACTIVE.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE)
+ return (EBADF);
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was previously
+ * marked permanently unavailable, clear that state now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
+
+/*
+ * Close a virtual device: invoke the type-specific close routine, purge the
+ * vdev's cache, and leave the vdev in the OFFLINE or CLOSED state with no
+ * auxiliary state.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+	vd->vdev_ops->vdev_op_close(vd);
+	vdev_cache_purge(vd);
+
+	/*
+	 * Remember the state we had before closing.  A subsequent reopen()
+	 * uses it to avoid generating FMA ereports for a device that was
+	 * already known to be faulted.
+	 */
+	vd->vdev_prevstate = vd->vdev_state;
+
+	vd->vdev_state = vd->vdev_offline ?
+	    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
+	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+}
+
+/*
+ * Close and reopen a vdev, re-validate its label, and then propagate the
+ * resulting state to its ancestors. The caller must hold the spa config
+ * lock as writer. The return values of vdev_open() and vdev_validate()
+ * are deliberately discarded here.
+ */
+void
+vdev_reopen(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ vdev_close(vd);
+ (void) vdev_open(vd);
+
+ /*
+ * Call vdev_validate() here to make sure we have the same device.
+ * Otherwise, a device with an invalid label could be successfully
+ * opened in response to vdev_reopen().
+ */
+ (void) vdev_validate(vd);
+
+ /*
+ * Reassess parent vdev's health.
+ */
+ vdev_propagate_state(vd);
+}
+
+/*
+ * Open a vdev that is being created (or used as a replacement) and write
+ * its labels.
+ *
+ * Returns 0 on success.  On failure the vdev is closed again and the error
+ * from vdev_open() or vdev_label_init() is returned; a vdev that opens
+ * without error but is not HEALTHY yields ENXIO.
+ */
+int
+vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
+{
+	int error;
+
+	/*
+	 * Partial opens (e.g. of a mirror) are normally tolerated, but a
+	 * create must fail if any component cannot be opened, so anything
+	 * short of HEALTHY counts as an error.
+	 */
+	error = vdev_open(vd);
+	if (error == 0 && vd->vdev_state != VDEV_STATE_HEALTHY)
+		error = ENXIO;
+
+	/*
+	 * Recursively initialize all labels.
+	 */
+	if (error == 0) {
+		error = vdev_label_init(vd, txg, isreplacing ?
+		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
+	}
+
+	if (error != 0)
+		vdev_close(vd);
+
+	return (error);
+}
+
+/*
+ * This is the latter half of vdev_create(). It is distinct because it
+ * involves initiating transactions in order to do metaslab creation.
+ * For creation, we want to try to create all vdevs at once and then undo it
+ * if anything fails; this is much harder if we have pending transactions.
+ *
+ * Sets vd->vdev_ms_shift from vd->vdev_asize and then creates the vdev's
+ * metaslabs in 'txg'.
+ */
+void
+vdev_init(vdev_t *vd, uint64_t txg)
+{
+ /*
+ * Aim for roughly 200 metaslabs per vdev.
+ */
+ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+ /* Never use a metaslab shift smaller than SPA_MAXBLOCKSHIFT. */
+ vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+
+ /*
+ * Initialize the vdev's metaslabs. This can't fail because
+ * there's nothing to read when creating all new metaslabs.
+ */
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+}
+
+/*
+ * Mark a top-level vdev dirty in 'txg'. Depending on 'flags', 'arg' is
+ * queued on the vdev's metaslab list (VDD_METASLAB) or DTL list (VDD_DTL)
+ * for that txg; the vdev itself is always added to the spa's per-txg vdev
+ * list. 'vd' must be a top-level vdev and 'flags' is asserted to satisfy
+ * ISP2().
+ */
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ ASSERT(ISP2(flags));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+/*
+ * Add the range (txg, size) to the DTL space map 'sm' unless
+ * space_map_contains() reports that it is already there. The check and
+ * the insertion are done together under the space map's lock.
+ */
+void
+vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ mutex_enter(sm->sm_lock);
+ if (!space_map_contains(sm, txg, size))
+ space_map_add(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+}
+
+/*
+ * Report whether the DTL space map 'sm' contains the range (txg, size).
+ * Returns the result of space_map_contains(), or 0 when the map is empty.
+ */
+int
+vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+	int found = 0;
+
+	/*
+	 * An empty map is by far the common case, so peek at sm_space
+	 * without the lock and only take the lock when there is actually
+	 * something to search.
+	 */
+	if (sm->sm_space != 0) {
+		mutex_enter(sm->sm_lock);
+		found = space_map_contains(sm, txg, size);
+		mutex_exit(sm->sm_lock);
+	}
+
+	return (found);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ *
+ * For a vdev without children the DTL is updated in place. For any other
+ * vdev both DTL maps are emptied and rebuilt as the union of the children's
+ * maps, after recursing into each child. The caller must hold the spa
+ * config lock as writer.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ /*
+ * We've successfully scrubbed everything up to scrub_txg.
+ * Therefore, excise all old DTLs up to that point, then
+ * fold in the DTLs for everything we couldn't scrub.
+ */
+ if (scrub_txg != 0) {
+ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
+ space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+ }
+ /* The scrub map is only emptied once the scrub is done. */
+ if (scrub_done)
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+ /* With a non-zero txg, queue this vdev's DTL for syncing. */
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ /*
+ * Make sure the DTLs are always correct under the scrub lock.
+ */
+ if (vd == spa->spa_root_vdev)
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /* Start from empty maps; they are rebuilt from the children below. */
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
+ space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+
+ if (vd == spa->spa_root_vdev)
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Load the on-disk DTL of a childless vdev into vd->vdev_dtl_map.
+ *
+ * Returns 0 if the vdev has no DTL object or the load succeeds; otherwise
+ * the error from dmu_bonus_hold() or space_map_load().
+ */
+static int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *db;
+ int error;
+
+ ASSERT(vd->vdev_children == 0);
+
+ /* No DTL object has been allocated, so there is nothing to load. */
+ if (smo->smo_object == 0)
+ return (0);
+
+ if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
+ return (error);
+
+ /* Refresh the in-core space map object from the bonus buffer. */
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(db->db_data, smo, sizeof (*smo));
+ dmu_buf_rele(db, FTAG);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (error);
+}
+
+/*
+ * Write a vdev's in-core DTL out to its space map object in 'txg'.
+ *
+ * If the vdev has been detached, its DTL object (if any) is freed instead.
+ * Otherwise the object is allocated on first use, the in-core map is
+ * copied into a private space map under vdev_dtl_lock, and that copy is
+ * what gets written, followed by the updated space map header in the
+ * object's bonus buffer.
+ */
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	space_map_obj_t *smo = &vd->vdev_dtl;
+	space_map_t *sm = &vd->vdev_dtl_map;
+	objset_t *mos = spa->spa_meta_objset;
+	space_map_t smsync;
+	kmutex_t smlock;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+
+	dprintf("%s in txg %llu pass %d\n",
+	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+	if (vd->vdev_detached) {
+		if (smo->smo_object != 0) {
+			int err = dmu_object_free(mos, smo->smo_object, tx);
+			ASSERT3U(err, ==, 0);
+			smo->smo_object = 0;
+		}
+		dmu_tx_commit(tx);
+		/* Cast to match %llu, as the dprintf() above does. */
+		dprintf("detach %s committed in txg %llu\n",
+		    vdev_description(vd), (u_longlong_t)txg);
+		return;
+	}
+
+	if (smo->smo_object == 0) {
+		ASSERT(smo->smo_objsize == 0);
+		ASSERT(smo->smo_alloc == 0);
+		smo->smo_object = dmu_object_alloc(mos,
+		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+		ASSERT(smo->smo_object != 0);
+		vdev_config_dirty(vd->vdev_top);
+	}
+
+	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+
+	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
+	    &smlock);
+
+	mutex_enter(&smlock);
+
+	/* Snapshot the live DTL into smsync while holding vdev_dtl_lock. */
+	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_walk(sm, space_map_add, &smsync);
+	mutex_exit(&vd->vdev_dtl_lock);
+
+	/* Replace the on-disk contents with the snapshot. */
+	space_map_truncate(smo, mos, tx);
+	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
+
+	space_map_destroy(&smsync);
+
+	mutex_exit(&smlock);
+	mutex_destroy(&smlock);
+
+	/* Persist the updated space map header in the bonus buffer. */
+	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	ASSERT3U(db->db_size, >=, sizeof (*smo));
+	bcopy(smo, db->db_data, sizeof (*smo));
+	dmu_buf_rele(db, FTAG);
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Load the persistent state of a vdev tree: children are loaded first,
+ * then metaslabs for a top-level vdev and the DTL for a leaf vdev.  Any
+ * failure marks the vdev CANT_OPEN with VDEV_AUX_CORRUPT_DATA.
+ */
+void
+vdev_load(vdev_t *vd)
+{
+	int i;
+
+	/* Depth-first: every child is loaded before its parent. */
+	for (i = 0; i < vd->vdev_children; i++)
+		vdev_load(vd->vdev_child[i]);
+
+	/*
+	 * A top-level vdev needs its metaslabs.  A zero ashift or asize, or
+	 * a failure to initialize the metaslabs, means the vdev is corrupt.
+	 */
+	if (vd == vd->vdev_top) {
+		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
+		    vdev_metaslab_init(vd, 0) != 0) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+		}
+	}
+
+	/* A leaf vdev needs its DTL. */
+	if (vd->vdev_ops->vdev_op_leaf) {
+		if (vdev_dtl_load(vd) != 0) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+		}
+	}
+}
+
+/*
+ * The special vdev case is used for hot spares and l2cache devices. Its
+ * sole purpose is to set the vdev state for the associated vdev. To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
+ *
+ * Returns 0 if the label is acceptable. Otherwise the vdev is marked
+ * CANT_OPEN with VDEV_AUX_CORRUPT_DATA and -1 is returned.
+ */
+int
+vdev_validate_aux(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ /*
+ * The label must have a version no newer than SPA_VERSION, carry
+ * this vdev's guid, and contain a pool state entry.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ version > SPA_VERSION ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested.
+ */
+ nvlist_free(label);
+ return (0);
+}
+
+/*
+ * Finish syncing 'txg' for this vdev: drain every metaslab queued on the
+ * TXG_CLEAN(txg) slot of vdev_ms_list and call metaslab_sync_done() on it.
+ */
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+	metaslab_t *msp;
+
+	/* uint64_t is not necessarily unsigned long long; cast for %llu. */
+	dprintf("%s txg %llu\n", vdev_description(vd), (u_longlong_t)txg);
+
+	while ((msp = txg_list_remove(&vd->vdev_ms_list,
+	    TXG_CLEAN(txg))) != NULL)
+		metaslab_sync_done(msp, txg);
+}
+
+/*
+ * Sync a vdev's dirty metaslabs and DTLs for 'txg'.
+ *
+ * Allocates the metaslab array object on first use, syncs every metaslab
+ * queued for this txg (re-queueing each on the TXG_CLEAN(txg) slot), syncs
+ * every queued DTL, and finally re-queues the vdev itself on the spa's
+ * TXG_CLEAN(txg) vdev list.
+ */
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+ dmu_tx_t *tx;
+
+ dprintf("%s txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ /*
+ * The metaslab shift is set but no metaslab array object exists yet:
+ * allocate it now and dirty the config so the new object is recorded.
+ */
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
+ ASSERT(vd == vd->vdev_top);
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+ ASSERT(vd->vdev_ms_array != 0);
+ vdev_config_dirty(vd);
+ dmu_tx_commit(tx);
+ }
+
+ /* Each synced metaslab is handed on to the TXG_CLEAN(txg) slot. */
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
+ metaslab_sync(msp, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+ }
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+}
+
+/*
+ * Convert a physical size to an allocated size by dispatching to the
+ * vdev type's vdev_op_asize routine.
+ */
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+
+/*
+ * Return a printable name for a vdev: "<unknown>" for a NULL vdev or one
+ * without ops, otherwise its path if it has one, the pool name for a vdev
+ * with no parent, or else the vdev type name.
+ */
+const char *
+vdev_description(vdev_t *vd)
+{
+	const char *desc;
+
+	if (vd == NULL || vd->vdev_ops == NULL) {
+		desc = "<unknown>";
+	} else if (vd->vdev_path != NULL) {
+		desc = vd->vdev_path;
+	} else if (vd->vdev_parent == NULL) {
+		desc = spa_name(vd->vdev_spa);
+	} else {
+		desc = vd->vdev_ops->vdev_op_type;
+	}
+
+	return (desc);
+}
+
+/*
+ * Mark the given vdev faulted. A faulted vdev behaves as if the device could
+ * not be opened, and no I/O is attempted.
+ *
+ * Returns EIO without doing anything if the pool is in the IO_FAILURE
+ * state. An unknown guid or a non-leaf vdev ends the operation through
+ * spa_vdev_exit() with ENODEV or ENOTSUP respectively. Otherwise 0 is
+ * returned.
+ */
+int
+vdev_fault(spa_t *spa, uint64_t guid)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ /*
+ * Disregard a vdev fault request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * Faulted state takes precedence over degraded.
+ */
+ vd->vdev_faulted = 1ULL;
+ vd->vdev_degraded = 0ULL;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+
+ /*
+ * If marking the vdev as faulted causes the toplevel vdev to become
+ * unavailable, then back off and simply mark the vdev as degraded
+ * instead.
+ */
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_degraded = 1ULL;
+ vd->vdev_faulted = 0ULL;
+
+ /*
+ * If we reopen the device and it's not dead, only then do we
+ * mark it degraded.
+ */
+ vdev_reopen(vd);
+
+ if (vdev_readable(vd)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ }
+ }
+
+ /* Dirty the config so the new persistent flags get written out. */
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ return (0);
+}
+
+/*
+ * Mark the given vdev degraded. A degraded vdev is purely an indication to the
+ * user that something is wrong. The vdev continues to operate as normal as far
+ * as I/O is concerned.
+ *
+ * Returns EIO without doing anything if the pool is in the IO_FAILURE
+ * state. An unknown guid or a non-leaf vdev ends the operation through
+ * spa_vdev_exit() with ENODEV or ENOTSUP respectively. Otherwise 0 is
+ * returned, including when the vdev was already faulted or degraded.
+ */
+int
+vdev_degrade(spa_t *spa, uint64_t guid)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ /*
+ * Disregard a vdev degrade request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If the vdev is already faulted or degraded, then don't do anything.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+ return (0);
+ }
+
+ /*
+ * The persistent flag is always set; the visible state only changes
+ * if the vdev is not dead.
+ */
+ vd->vdev_degraded = 1ULL;
+ if (!vdev_is_dead(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ return (0);
+}
+
+/*
+ * Online the given vdev. If 'unspare' is set, it implies two things. First,
+ * any attached spare device should be detached when the device finishes
+ * resilvering. Second, the online should be treated like a 'test' online case,
+ * so no FMA events are generated if the device fails to open.
+ *
+ * The offline flags are cleared and the top-level vdev is reopened with
+ * vdev_checkremove and vdev_forcefault set from 'flags' for the duration
+ * of the reopen only. If 'newstate' is non-NULL it receives the vdev's
+ * state after the reopen. A resilver is then started via spa_scrub().
+ *
+ * Returns EIO if the pool is in the IO_FAILURE state. An unknown guid or
+ * a non-leaf vdev ends the operation through spa_vdev_exit() with ENODEV
+ * or ENOTSUP respectively. Otherwise 0 is returned.
+ */
+int
+vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+ vdev_state_t *newstate)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ /*
+ * Disregard a vdev online request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ vd->vdev_offline = B_FALSE;
+ vd->vdev_tmpoffline = B_FALSE;
+ vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
+ B_TRUE : B_FALSE;
+ vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
+ B_TRUE : B_FALSE;
+ vdev_reopen(vd->vdev_top);
+ /* These two flags only apply to the reopen above. */
+ vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+
+ if (newstate)
+ *newstate = vd->vdev_state;
+ /*
+ * Only request an unspare when the vdev came back alive and is the
+ * first child of a spare vdev.
+ */
+ if ((flags & ZFS_ONLINE_UNSPARE) &&
+ !vdev_is_dead(vd) && vd->vdev_parent &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ /*
+ * Must hold spa_namespace_lock in order to post resilver sysevent
+ * w/pool name.
+ */
+ mutex_enter(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Take the given leaf vdev offline. ZFS_OFFLINE_TEMPORARY in 'flags' is
+ * recorded in vdev_tmpoffline.
+ *
+ * Returns EIO if the pool is in the IO_FAILURE state. The operation ends
+ * through spa_vdev_exit() with ENODEV for an unknown guid, ENOTSUP for a
+ * non-leaf vdev, EBUSY if the top-level vdev has a non-empty DTL or would
+ * become unusable, and 0 on success.
+ */
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ /*
+ * Disregard a vdev offline request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If the device isn't already offline, try to offline it.
+ */
+ if (!vd->vdev_offline) {
+ /*
+ * If this device's top-level vdev has a non-empty DTL,
+ * don't allow the device to be offlined.
+ *
+ * XXX -- make this more precise by allowing the offline
+ * as long as the remaining devices don't have any DTL holes.
+ */
+ if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Offline this device and reopen its top-level vdev.
+ * If this action results in the top-level vdev becoming
+ * unusable, undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(vd->vdev_top);
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(vd->vdev_top);
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ }
+ }
+
+ /* Updated even when the device was already offline. */
+ vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
+ B_TRUE : B_FALSE;
+
+ vdev_config_dirty(vd->vdev_top);
+
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+}
+
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ * If 'reopen_wanted' is set, a vdev that is persistently faulted or degraded
+ * has that state cleared and is reopened.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted)
+{
+	int c;
+
+	if (vd == NULL)
+		vd = spa->spa_root_vdev;
+
+	vd->vdev_stat.vs_read_errors = 0;
+	vd->vdev_stat.vs_write_errors = 0;
+	vd->vdev_stat.vs_checksum_errors = 0;
+	vd->vdev_is_failing = B_FALSE;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_clear(spa, vd->vdev_child[c], reopen_wanted);
+
+	/*
+	 * If we're in the FAULTED or DEGRADED state, then clear the
+	 * persistent state and attempt to reopen the device.  We also mark
+	 * the vdev config dirty, so that the new state is written out to
+	 * disk.
+	 */
+	if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) {
+		/*
+		 * Capture the faulted flag before it is cleared.  Testing
+		 * vd->vdev_faulted after zeroing it could never be true, so
+		 * the resilver below was never requested.
+		 */
+		uint64_t was_faulted = vd->vdev_faulted;
+
+		vd->vdev_faulted = vd->vdev_degraded = 0;
+		vdev_reopen(vd);
+		vdev_config_dirty(vd->vdev_top);
+
+		if (was_faulted)
+			spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
+	}
+}
+
+/*
+ * Returns non-zero if the vdev may be read from. Currently this is simply
+ * the negation of vdev_is_dead().
+ */
+int
+vdev_readable(vdev_t *vd)
+{
+ /* XXPOLICY */
+ return (!vdev_is_dead(vd));
+}
+
+/*
+ * Returns non-zero if the vdev may be written to: it must not be dead and
+ * must not be flagged as failing (vdev_is_failing).
+ */
+int
+vdev_writeable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_is_failing);
+}
+
+/*
+ * Returns non-zero if the vdev's state is below VDEV_STATE_DEGRADED in the
+ * vdev_state_t ordering.
+ */
+int
+vdev_is_dead(vdev_t *vd)
+{
+ return (vd->vdev_state < VDEV_STATE_DEGRADED);
+}
+
+/*
+ * Decide whether an injected fault applies to 'zio' on this vdev.
+ *
+ * Returns EIO if a fault should be injected and 0 otherwise.  Nothing is
+ * injected when the fault mode is VDEV_FAULT_NONE or when the zio's I/O
+ * type is not selected by vdev_fault_mask.
+ */
+int
+vdev_error_inject(vdev_t *vd, zio_t *zio)
+{
+	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
+		return (0);
+
+	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
+		return (0);
+
+	if (vd->vdev_fault_mode == VDEV_FAULT_RANDOM) {
+		/* Fail only when the random draw comes up zero. */
+		if (spa_get_random(vd->vdev_fault_arg) == 0)
+			return (EIO);
+		return (0);
+	}
+
+	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) {
+		/*
+		 * Always fail; once the countdown reaches zero, switch
+		 * injection off for subsequent I/Os.
+		 */
+		if ((int64_t)--vd->vdev_fault_arg <= 0)
+			vd->vdev_fault_mode = VDEV_FAULT_NONE;
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Get statistics for the given vdev.
+ *
+ * Copies vd->vdev_stat into 'vs', converts vs_timestamp into the time
+ * elapsed since the stored timestamp, and fills in the current state and
+ * rsize.  For the root vdev, the I/O, error and scrub counters of every
+ * top-level vdev are added on top.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	int c, t;
+
+	mutex_enter(&vd->vdev_stat_lock);
+	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+	vs->vs_state = vd->vdev_state;
+	vs->vs_rsize = vdev_get_rsize(vd);
+	mutex_exit(&vd->vdev_stat_lock);
+
+	/*
+	 * If we're getting stats on the root vdev, aggregate the I/O counts
+	 * over all top-level vdevs (i.e. the direct children of the root).
+	 */
+	if (vd == rvd) {
+		for (c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *cvd = rvd->vdev_child[c];
+			vdev_stat_t *cvs = &cvd->vdev_stat;
+
+			/*
+			 * The counters being read belong to the child, so
+			 * it is the child's stat lock that must be held;
+			 * 'vs' is the caller's private copy.
+			 */
+			mutex_enter(&cvd->vdev_stat_lock);
+			for (t = 0; t < ZIO_TYPES; t++) {
+				vs->vs_ops[t] += cvs->vs_ops[t];
+				vs->vs_bytes[t] += cvs->vs_bytes[t];
+			}
+			vs->vs_read_errors += cvs->vs_read_errors;
+			vs->vs_write_errors += cvs->vs_write_errors;
+			vs->vs_checksum_errors += cvs->vs_checksum_errors;
+			vs->vs_scrub_examined += cvs->vs_scrub_examined;
+			vs->vs_scrub_errors += cvs->vs_scrub_errors;
+			mutex_exit(&cvd->vdev_stat_lock);
+		}
+	}
+}
+
+/*
+ * Reset the space accounting fields (vs_space, vs_dspace, vs_alloc) of a
+ * vdev's statistics under its stat lock. Other counters are left alone.
+ */
+void
+vdev_clear_stats(vdev_t *vd)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space = 0;
+ vd->vdev_stat.vs_dspace = 0;
+ vd->vdev_stat.vs_alloc = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Update a vdev's statistics for a completed zio.
+ *
+ * A successful I/O bumps the op and byte counters (unless it bypassed the
+ * device) and, for repair I/O, the repaired/self-healed byte counts. A
+ * failed, non-speculative I/O bumps the matching error counter if the vdev
+ * is readable, and a failed write to a childless vdev is additionally
+ * recorded in the DTLs of the vdev and all of its ancestors.
+ */
+void
+vdev_stat_update(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ if (zio->io_error == 0) {
+ if (!(flags & ZIO_FLAG_IO_BYPASS)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ /* Repair I/O with no delegate list counts as repaired bytes. */
+ if ((flags & ZIO_FLAG_IO_REPAIR) &&
+ zio->io_delegate_list == NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (flags & ZIO_FLAG_SCRUB_THREAD)
+ vs->vs_scrub_repaired += zio->io_size;
+ else
+ vs->vs_self_healed += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ return;
+ }
+
+ /* Failures of speculative I/O are not counted at all. */
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ if (vdev_readable(vd)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE)
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (type == ZIO_TYPE_WRITE) {
+ /* Only writes with a txg, on childless vdevs, touch the DTLs. */
+ if (txg == 0 || vd->vdev_children != 0)
+ return;
+ if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+ }
+ if (!(flags & ZIO_FLAG_IO_REPAIR)) {
+ /* Already recorded for this txg; nothing more to do. */
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+ return;
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+ }
+ }
+}
+
+/*
+ * Update the scrub statistics of a vdev and all of its descendants.
+ *
+ * A 'type' other than POOL_SCRUB_NONE marks the start of a scrub and
+ * resets the scrub counters; POOL_SCRUB_NONE marks the end of one and only
+ * records the completion flag and end time.
+ */
+void
+vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
+{
+	vdev_stat_t *stats = &vd->vdev_stat;
+	int i;
+
+	for (i = 0; i < vd->vdev_children; i++)
+		vdev_scrub_stat_update(vd->vdev_child[i], type, complete);
+
+	mutex_enter(&vd->vdev_stat_lock);
+
+	if (type != POOL_SCRUB_NONE) {
+		/* A scrub is starting: begin with a clean slate. */
+		stats->vs_scrub_type = type;
+		stats->vs_scrub_complete = 0;
+		stats->vs_scrub_examined = 0;
+		stats->vs_scrub_repaired = 0;
+		stats->vs_scrub_errors = 0;
+		stats->vs_scrub_start = gethrestime_sec();
+		stats->vs_scrub_end = 0;
+	} else {
+		/*
+		 * A scrub has ended.  Only the completion flag and end time
+		 * change, so that the results of the scrub that just
+		 * finished remain available for reporting.
+		 */
+		stats->vs_scrub_complete = complete;
+		stats->vs_scrub_end = gethrestime_sec();
+	}
+
+	mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Update the in-core space usage stats for this vdev and the root vdev.
+ *
+ * 'vd' must be a top-level vdev. 'space_delta' and 'alloc_delta' are added
+ * to vs_space and vs_alloc; the deflated equivalent of 'space_delta' is
+ * added to vs_dspace. The root vdev is only updated when 'update_root' is
+ * set and the vdev belongs to the spa's normal metaslab class.
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
+ boolean_t update_root)
+{
+ int64_t dspace_delta = space_delta;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
+ * factor. We must calculate this here and not at the root vdev
+ * because the root vdev's psize-to-asize is simply the max of its
+ * childrens', thus not accurate enough for us.
+ */
+ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
+ dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (update_root) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ /*
+ * Don't count non-normal (e.g. intent log) space as part of
+ * the pool's capacity.
+ */
+ if (vd->vdev_mg->mg_class != spa->spa_normal_class)
+ return;
+
+ /* The vdev's own lock was dropped above; the two never nest. */
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
+}
+
+/*
+ * Mark a top-level vdev's config as dirty by placing it on the spa's dirty
+ * list, so that it is written out the next time the vdev configuration is
+ * synced.  Passing the root vdev dirties every top-level vdev instead.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *root = spa->spa_root_vdev;
+	int i;
+
+	/*
+	 * The dirty list is protected by the config lock.  The caller must
+	 * either hold the config lock as writer, or must be the sync thread
+	 * (which holds the lock as reader).  There's only one sync thread,
+	 * so this is sufficient to ensure mutual exclusion.
+	 */
+	ASSERT(spa_config_held(spa, RW_WRITER) ||
+	    dsl_pool_sync_context(spa_get_dsl(spa)));
+
+	if (vd == root) {
+		for (i = 0; i < root->vdev_children; i++)
+			vdev_config_dirty(root->vdev_child[i]);
+		return;
+	}
+
+	ASSERT(vd == vd->vdev_top);
+
+	/* Already on the dirty list: nothing to do. */
+	if (list_link_active(&vd->vdev_dirty_node))
+		return;
+
+	list_insert_head(&spa->spa_dirty_list, vd);
+}
+
+/*
+ * Remove a vdev from the spa's dirty list. The vdev is asserted to be on
+ * the list, and the caller must either hold the config lock as writer or
+ * be running in sync context.
+ */
+void
+vdev_config_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, RW_WRITER) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ ASSERT(list_link_active(&vd->vdev_dirty_node));
+ list_remove(&spa->spa_dirty_list, vd);
+}
+
+/*
+ * Recompute the state of 'vd' from the states of its children and then
+ * continue up the tree through its parent.
+ *
+ * For a vdev with children, the faulted and degraded children are counted
+ * and handed to the vdev type's vdev_op_state_change routine. The walk
+ * upwards stops at the root, or at a vdev that has vdev_islog set.
+ */
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ int c;
+ vdev_t *child;
+
+ if (vd->vdev_children > 0) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+ /* Each child is counted as faulted or degraded, never both. */
+ if (vdev_is_dead(child) && !vdev_readable(child))
+ faulted++;
+ else if (child->vdev_state <= VDEV_STATE_DEGRADED)
+ degraded++;
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd &&
+ rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
+
+ if (vd->vdev_parent && !vd->vdev_islog)
+ vdev_propagate_state(vd->vdev_parent);
+}
+
+/*
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
+ *
+ * 'isopen' is true when called as part of an open, 'state' is the new
+ * state, and 'aux' is the auxiliary reason stored in vs_aux. If 'state'
+ * equals the current state, only vs_aux is updated and nothing else
+ * happens.
+ */
+void
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
+{
+ uint64_t save_state;
+
+ if (state == vd->vdev_state) {
+ vd->vdev_stat.vs_aux = aux;
+ return;
+ }
+
+ /* Kept so the ereport below can report the state we came from. */
+ save_state = vd->vdev_state;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ /*
+ * If we are setting the vdev state to anything but an open state, then
+ * always close the underlying device. Otherwise, we keep accessible
+ * but invalid devices open forever. We don't call vdev_close() itself,
+ * because that implies some extra checks (offline, etc) that we don't
+ * want here. This is limited to leaf devices, because otherwise
+ * closing the device will affect other children.
+ */
+ if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_removed &&
+ state == VDEV_STATE_CANT_OPEN &&
+ (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
+ /*
+ * If the vdev_removed flag is set, then this
+ * device was previously marked removed and someone attempted to
+ * reopen it. If this failed due to a nonexistent device, then
+ * keep the device in the REMOVED state. We also let this be if
+ * it is one of our special test online cases, which is only
+ * attempting to online the device and shouldn't generate an FMA
+ * fault.
+ */
+ vd->vdev_state = VDEV_STATE_REMOVED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ } else if (state == VDEV_STATE_REMOVED) {
+ /*
+ * Indicate to the ZFS DE that this device has been removed, and
+ * any recent errors should be ignored.
+ */
+ zfs_post_remove(vd->vdev_spa, vd);
+ vd->vdev_removed = B_TRUE;
+ } else if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import, we mark it as
+ * "not available", which signifies that it was never there to
+ * begin with. Failure to open such a device is not considered
+ * an error.
+ */
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+
+ /*
+ * Post the appropriate ereport. If the 'prevstate' field is
+ * set to something other than VDEV_STATE_UNKNOWN, it indicates
+ * that this is part of a vdev_reopen(). In this case, we don't
+ * want to post the ereport if the device was already in the
+ * CANT_OPEN state beforehand.
+ *
+ * If the 'checkremove' flag is set, then this is an attempt to
+ * online the device in response to an insertion event. If we
+ * hit this case, then we have detected an insertion event for a
+ * faulted or offline device that wasn't in the removed state.
+ * In this scenario, we don't post an ereport because we are
+ * about to replace the device, or attempt an online with
+ * vdev_forcefault, which will generate the fault for us.
+ */
+ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+ !vd->vdev_not_present && !vd->vdev_checkremove &&
+ vd != vd->vdev_spa->spa_root_vdev) {
+ const char *class;
+
+ /* Pick the ereport class that matches the aux reason. */
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, vd->vdev_spa,
+ vd, NULL, save_state, 0);
+ }
+
+ /* Erase any notion of persistent removed state */
+ vd->vdev_removed = B_FALSE;
+ } else {
+ vd->vdev_removed = B_FALSE;
+ }
+
+ /* During an open the parent is left alone (see above). */
+ if (!isopen)
+ vdev_propagate_state(vd);
+}
diff --git a/zfs/lib/libzpool/vdev_cache.c b/zfs/lib/libzpool/vdev_cache.c
new file mode 100644
index 000000000..370e8a890
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_cache.c
@@ -0,0 +1,435 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_cache.c 1.7 08/01/10 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/kstat.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 64k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ */
+/* Reads larger than this many bytes are not cached (see vdev_cache_read). */
+int zfs_vdev_cache_max = 1<<14; /* 16KB */
+/* Per-vdev cache budget in bytes; 0 prevents any new entry being allocated. */
+int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */
+/* log2 of the cache line size. */
+int zfs_vdev_cache_bshift = 16;
+
+/* Cache line size in bytes; re-evaluated from the tunable at every use. */
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB with the default bshift */
+
+/* Handle for the installed kstat; NULL until vdev_cache_stat_init(). */
+kstat_t *vdc_ksp = NULL;
+
+/*
+ * Named counters backing the "vdev_cache_stats" kstat.
+ */
+typedef struct vdc_stats {
+	/* reads attached to a fill I/O that was already in flight */
+	kstat_named_t vdc_stat_delegations;
+	/* reads satisfied directly from a filled cache line */
+	kstat_named_t vdc_stat_hits;
+	/* reads that allocated a new line and issued a fill I/O */
+	kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+	{ "delegations",	KSTAT_DATA_UINT64 },
+	{ "hits",		KSTAT_DATA_UINT64 },
+	{ "misses",		KSTAT_DATA_UINT64 }
+};
+
+/*
+ * Atomically increment one counter.  The expansion deliberately has no
+ * trailing semicolon: the caller supplies it, so the macro behaves as a
+ * single statement and is safe inside an unbraced if/else.
+ */
+#define	VDCSTAT_BUMP(stat)	atomic_add_64(&vdc_stats.stat.value.ui64, 1)
+
+/*
+ * AVL comparator that orders cache entries by ve_offset.
+ * Returns -1, 0 or 1.
+ */
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *lhs = a1;
+	const vdev_cache_entry_t *rhs = a2;
+
+	if (lhs->ve_offset == rhs->ve_offset)
+		return (0);
+
+	return (lhs->ve_offset < rhs->ve_offset ? -1 : 1);
+}
+
+/*
+ * AVL comparator that orders cache entries by ve_lastused, oldest first.
+ * Returns -1, 0 or 1.
+ */
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *lhs = a1;
+	const vdev_cache_entry_t *rhs = a2;
+
+	/*
+	 * Equally old entries are ordered by offset instead, which keeps
+	 * the ordering unique.
+	 */
+	if (lhs->ve_lastused == rhs->ve_lastused)
+		return (vdev_cache_offset_compare(a1, a2));
+
+	return (lhs->ve_lastused < rhs->ve_lastused ? -1 : 1);
+}
+
+/*
+ * Evict the specified entry from the cache.
+ *
+ * Removes 've' from both the LRU and the offset tree, then frees its
+ * VCBS-byte data buffer and the entry itself, so 've' is invalid on return.
+ * The asserts require the caller to hold vc_lock and the entry to have no
+ * fill I/O outstanding.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT(ve->ve_fill_io == NULL);
+ ASSERT(ve->ve_data != NULL);
+
+ dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
+ vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
+ ve->ve_hits, ve->ve_missed_update);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ zio_buf_free(ve->ve_data, VCBS);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache. At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ *
+ * The new entry covers the VCBS-aligned region containing zio->io_offset
+ * and is inserted into both trees with ve_fill_io still NULL (the entry is
+ * zero-allocated); attaching the fill I/O is left to the caller. Returns
+ * NULL if zfs_vdev_cache_size is 0, or if the LRU entry that would have to
+ * be evicted is still being filled. Caller must hold vc_lock.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL) {
+ dprintf("can't evict in %p, still filling\n", vc);
+ return (NULL);
+ }
+ /* A filled entry has served at least the read that created it. */
+ ASSERT(ve->ve_hits != 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = lbolt;
+ ve->ve_data = zio_buf_alloc(VCBS);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
+
+/*
+ * Satisfy 'zio' from cache entry 've': refresh the entry's position in the
+ * LRU tree, count the hit, and copy zio->io_size bytes, starting at the
+ * I/O's offset within the cache line, into zio->io_data. The asserts
+ * require the caller to hold vc_lock and the entry not to be mid-fill.
+ */
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT(ve->ve_fill_io == NULL);
+
+ /*
+ * ve_lastused is the LRU tree's sort key, so the entry has to be
+ * removed and re-added around the update.
+ */
+ if (ve->ve_lastused != lbolt) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = lbolt;
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ *
+ * vdev_cache_read() passes this function and the entry to
+ * zio_vdev_child_io() when it issues the fill I/O; the entry arrives here
+ * as zio->io_private. The fill I/O read directly into ve->ve_data, so
+ * "filling" consists of clearing ve_fill_io, serving every read queued on
+ * the delegate list, and evicting the line if it cannot be trusted.
+ */
+static void
+vdev_cache_fill(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = zio->io_private;
+ zio_t *dio;
+
+ ASSERT(zio->io_size == VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT(ve->ve_fill_io == zio);
+ ASSERT(ve->ve_offset == zio->io_offset);
+ ASSERT(ve->ve_data == zio->io_data);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
+ vdev_cache_hit(vc, ve, dio);
+
+ /*
+ * Drop the line if the fill failed or a write went by while it was
+ * in flight. 've' is freed by the eviction and is not used below.
+ */
+ if (zio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+
+ /*
+ * With the lock dropped, unlink each delegated read, give it the
+ * fill's error status, and resume it.
+ */
+ while ((dio = zio->io_delegate_list) != NULL) {
+ zio->io_delegate_list = dio->io_delegate_next;
+ dio->io_delegate_next = NULL;
+ dio->io_error = zio->io_error;
+ zio_execute(dio);
+ }
+}
+
+/*
+ * Read data from the cache.
+ *
+ * Returns 0 when the cache has taken over the I/O: an immediate hit, a
+ * read delegated to a fill that is already in flight, or a miss for which
+ * a new fill I/O was issued. Returns an errno when the cache declines the
+ * read:
+ *
+ *   EINVAL     ZIO_FLAG_DONT_CACHE is set
+ *   EOVERFLOW  the read is larger than zfs_vdev_cache_max
+ *   EXDEV      the read straddles a cache line boundary
+ *   ESTALE     the matching line is marked ve_missed_update
+ *   ENOMEM     vdev_cache_allocate() could not provide an entry
+ */
+int
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+ zio_t *fio;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (EINVAL);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (EOVERFLOW);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
+ return (EXDEV);
+
+ ASSERT(cache_phase + zio->io_size <= VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ /* Only ve_offset of the search key is initialized before the lookup. */
+ ve_search.ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (ESTALE);
+ }
+
+ /*
+ * The line is still being filled: queue this read on the fill
+ * I/O's delegate list; vdev_cache_fill() will complete it.
+ */
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio->io_delegate_next = fio->io_delegate_list;
+ fio->io_delegate_list = zio;
+ zio_vdev_io_bypass(zio);
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
+ return (0);
+ }
+
+ /* The line is filled: copy the data out and resume the zio. */
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_execute(zio);
+ VDCSTAT_BUMP(vdc_stat_hits);
+ return (0);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (ENOMEM);
+ }
+
+ /*
+ * Miss: issue a VCBS-sized read of the whole line straight into the
+ * entry's buffer, with this zio as its first delegate.
+ */
+ fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
+ vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ fio->io_delegate_list = zio;
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
+
+ return (0);
+}
+
+/*
+ * Update cache contents upon write completion.
+ *
+ * Visits every cache entry whose offset lies in the VCBS-aligned span
+ * covering the write and copies the overlapping bytes from zio->io_data
+ * into the entry. An entry with a fill I/O still in flight is not
+ * modified; it is flagged ve_missed_update instead, which makes
+ * vdev_cache_fill() evict it once the fill completes.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ /*
+ * Start at the entry for the first affected line or, if that line is
+ * not cached, at the next entry after it in offset order.
+ */
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ /* [start, end) is the intersection of the write and this line. */
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ bcopy((char *)zio->io_data + start - io_start,
+ ve->ve_data + start - ve->ve_offset, end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Evict every entry in the vdev's cache. vdev_cache_evict() asserts that
+ * an entry has no fill I/O outstanding, so no fills may be in flight when
+ * this is called.
+ */
+void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ /* Each eviction removes the first node, so the loop ends when empty. */
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Initialize the vdev's cache: its lock and the two AVL trees that index
+ * the same entries, one by offset (lookup) and one by last-used time (LRU).
+ */
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+/*
+ * Tear down the vdev's cache: evict all entries, then destroy both trees
+ * and the lock. Counterpart of vdev_cache_init().
+ */
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ /* The trees are emptied before they are destroyed. */
+ vdev_cache_purge(vd);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
+
+/*
+ * Create and install the "vdev_cache_stats" named kstat, backed by the
+ * static vdc_stats counters.  If kstat_create() fails, vdc_ksp is NULL and
+ * nothing is installed.
+ */
+void
+vdev_cache_stat_init(void)
+{
+	vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (vdc_ksp == NULL)
+		return;
+
+	vdc_ksp->ks_data = &vdc_stats;
+	kstat_install(vdc_ksp);
+}
+
+/*
+ * Delete the kstat created by vdev_cache_stat_init(), if there is one, and
+ * reset vdc_ksp to NULL.
+ */
+void
+vdev_cache_stat_fini(void)
+{
+	if (vdc_ksp == NULL)
+		return;
+
+	kstat_delete(vdc_ksp);
+	vdc_ksp = NULL;
+}
diff --git a/zfs/lib/libzpool/vdev_disk.c b/zfs/lib/libzpool/vdev_disk.c
new file mode 100644
index 000000000..5f73c9aa2
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_disk.c
@@ -0,0 +1,639 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_disk.c 1.15 08/04/09 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/refcount.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunldi.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+/*
+ * A buf_t bundled with the zio it serves. vdev_disk_io_intr() casts the
+ * buf_t pointer it receives back to a vdev_disk_buf_t pointer, so vdb_buf
+ * must remain the first member.
+ */
+typedef struct vdev_disk_buf {
+ buf_t vdb_buf;
+ zio_t *vdb_io;
+} vdev_disk_buf_t;
+
+/*
+ * Open the LDI handle for a disk vdev.
+ *
+ * Allocates a vdev_disk_t, stores it in vd->vdev_tsd, and tries to open
+ * the device by path, then by devid, then by physical path, and finally by
+ * path again.  Returns 0 on success or an errno.  On failure vs_aux is set
+ * to VDEV_AUX_BAD_LABEL (missing/relative path or undecodable devid) or
+ * VDEV_AUX_OPEN_FAILED (every open attempt failed).  Once allocated, the
+ * vdev_disk_t is not freed here on failure; vdev_disk_close() releases it.
+ */
+static int
+vdev_disk_open_common(vdev_t *vd)
+{
+	vdev_disk_t *dvd;
+	dev_t dev;
+	int error;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+	/*
+	 * When opening a disk device, we want to preserve the user's original
+	 * intent.  We always want to open the device by the path the user gave
+	 * us, even if it is one of multiple paths to the same device.  But we
+	 * also want to be able to survive disks being removed/recabled.
+	 * Therefore the sequence of opening devices is:
+	 *
+	 * 1. Try opening the device by path.  For legacy pools without the
+	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
+	 *
+	 * 2. If the devid of the device matches the stored value, return
+	 *    success.
+	 *
+	 * 3. Otherwise, the device may have moved.  Try opening the device
+	 *    by the devid instead.
+	 *
+	 */
+	if (vd->vdev_devid != NULL) {
+		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+		    &dvd->vd_minor) != 0) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (EINVAL);
+		}
+	}
+
+	error = EINVAL;		/* presume failure */
+
+	if (vd->vdev_path != NULL) {
+		ddi_devid_t devid;
+
+		if (vd->vdev_wholedisk == -1ULL) {
+			/* room for the path plus "s0" and the terminator */
+			size_t len = strlen(vd->vdev_path) + 3;
+			char *buf = kmem_alloc(len, KM_SLEEP);
+			ldi_handle_t lh;
+
+			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+			if (ldi_open_by_name(buf, spa_mode, kcred,
+			    &lh, zfs_li) == 0) {
+				/* The "s0" path works; adopt it. */
+				spa_strfree(vd->vdev_path);
+				vd->vdev_path = buf;
+				vd->vdev_wholedisk = 1ULL;
+				(void) ldi_close(lh, spa_mode, kcred);
+			} else {
+				kmem_free(buf, len);
+			}
+		}
+
+		error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
+		    &dvd->vd_lh, zfs_li);
+
+		/*
+		 * Compare the devid to the stored value.
+		 */
+		if (error == 0 && vd->vdev_devid != NULL &&
+		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+				error = EINVAL;
+				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+				dvd->vd_lh = NULL;
+			}
+			ddi_devid_free(devid);
+		}
+
+		/*
+		 * If we succeeded in opening the device, but 'vdev_wholedisk'
+		 * is not yet set, then this must be a slice.
+		 */
+		if (error == 0 && vd->vdev_wholedisk == -1ULL)
+			vd->vdev_wholedisk = 0;
+	}
+
+	/*
+	 * If we were unable to open by path, or the devid check fails, open by
+	 * devid instead.
+	 */
+	if (error != 0 && vd->vdev_devid != NULL)
+		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+	/*
+	 * If all else fails, then try opening by physical path (if available)
+	 * or the logical path (if we failed due to the devid check).  While not
+	 * as reliable as the devid, this will give us something, and the higher
+	 * level vdev validation will prevent us from opening the wrong device.
+	 */
+	if (error) {
+		/*
+		 * ddi_pathname_to_dev_t() reports failure with the dev_t
+		 * sentinel NODEV, not with the errno value ENODEV.
+		 */
+		if (vd->vdev_physpath != NULL &&
+		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
+			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode,
+			    kcred, &dvd->vd_lh, zfs_li);
+
+		/*
+		 * Note that we don't support the legacy auto-wholedisk support
+		 * as above.  This hasn't been used in a very long time and we
+		 * don't need to propagate its oddities to this edge condition.
+		 */
+		if (error && vd->vdev_path != NULL)
+			error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
+			    &dvd->vd_lh, zfs_li);
+	}
+
+	if (error)
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+
+	return (error);
+}
+
+/*
+ * Open a disk vdev. After vdev_disk_open_common() succeeds, this refreshes
+ * vd->vdev_physpath if it is out of date, reports the device size through
+ * *psize and the alignment shift through *ashift, and tries to enable the
+ * write cache on whole disks. Returns 0 on success or an errno.
+ */
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_disk_t *dvd;
+ struct dk_minfo dkm;
+ int error;
+ dev_t dev;
+ int otyp;
+
+ error = vdev_disk_open_common(vd);
+ if (error)
+ return (error);
+
+ dvd = vd->vdev_tsd;
+ /*
+ * Once a device is opened, verify that the physical device path (if
+ * available) is up to date.
+ */
+ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
+ ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
+ char *physpath, *minorname;
+
+ physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ minorname = NULL;
+ /*
+ * The comparison uses the bare device path; ":<minor name>" is
+ * appended only when building the replacement string.
+ */
+ if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
+ ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
+ (vd->vdev_physpath == NULL ||
+ strcmp(vd->vdev_physpath, physpath) != 0)) {
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+ (void) strlcat(physpath, ":", MAXPATHLEN);
+ (void) strlcat(physpath, minorname, MAXPATHLEN);
+ vd->vdev_physpath = spa_strdup(physpath);
+ }
+ if (minorname)
+ kmem_free(minorname, strlen(minorname) + 1);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+
+ /*
+ * Determine the actual size of the device.
+ */
+ if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (EINVAL);
+ }
+
+ /*
+ * If we own the whole disk, try to enable disk write caching.
+ * We ignore errors because it's OK if we can't do it.
+ */
+ if (vd->vdev_wholedisk == 1) {
+ int wce = 1;
+ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
+ FKIOCTL, kcred, NULL);
+ }
+
+ /*
+ * Determine the device's minimum transfer size.
+ * If the ioctl isn't supported, assume DEV_BSIZE.
+ */
+ if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
+ FKIOCTL, kcred, NULL) != 0)
+ dkm.dki_lbsize = DEV_BSIZE;
+
+ /* The block size used is never smaller than SPA_MINBLOCKSIZE. */
+ *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
+
+ /*
+ * Clear the nowritecache bit, so that on a vdev_reopen() we will
+ * try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ return (0);
+}
+
+/*
+ * Release everything hanging off vd->vdev_tsd: the decoded minor name and
+ * devid, the open LDI handle, and the vdev_disk_t itself. Does nothing if
+ * vdev_tsd is already NULL, and each member is released only if set, so a
+ * partially completed open can be cleaned up as well.
+ */
+static void
+vdev_disk_close(vdev_t *vd)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ if (dvd == NULL)
+ return;
+
+ if (dvd->vd_minor != NULL)
+ ddi_devid_str_free(dvd->vd_minor);
+
+ if (dvd->vd_devid != NULL)
+ ddi_devid_free(dvd->vd_devid);
+
+ if (dvd->vd_lh != NULL)
+ (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+
+ kmem_free(dvd, sizeof (vdev_disk_t));
+ vd->vdev_tsd = NULL;
+}
+
+/*
+ * Perform one synchronous transfer of 'size' bytes between 'data' and byte
+ * offset 'offset' of the device behind 'vd_lh', waiting for it to complete.
+ * 'flags' carries the transfer direction (B_READ or B_WRITE). Returns
+ * EINVAL for a NULL handle, the biowait() error if the transfer failed,
+ * EIO if it completed short (b_resid != 0), and 0 otherwise.
+ */
+int
+vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
+ uint64_t offset, int flags)
+{
+ buf_t *bp;
+ int error = 0;
+
+ if (vd_lh == NULL)
+ return (EINVAL);
+
+ ASSERT(flags & B_READ || flags & B_WRITE);
+
+ bp = getrbuf(KM_SLEEP);
+ bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
+ bp->b_bcount = size;
+ bp->b_un.b_addr = (void *)data;
+ bp->b_lblkno = lbtodb(offset);
+ bp->b_bufsize = size;
+
+ /* A ldi_strategy() failure is only asserted, not returned. */
+ error = ldi_strategy(vd_lh, bp);
+ ASSERT(error == 0);
+ if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
+ error = EIO;
+ freerbuf(bp);
+
+ return (error);
+}
+
+/*
+ * Issue one synchronous probe transfer to the device behind 'vd' via
+ * vdev_disk_physio().  Returns EINVAL if the vdev, its vdev_disk_t, or its
+ * LDI handle is missing; otherwise the physio result.  When fault injection
+ * is enabled, a successful transfer may be turned into an injected error.
+ */
+static int
+vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
+    int flags)
+{
+	vdev_disk_t *dvd;
+	int error;
+
+	/* Validate vd before it is dereferenced to fetch vdev_tsd. */
+	if (vd == NULL)
+		return (EINVAL);
+
+	dvd = vd->vdev_tsd;
+	if (dvd == NULL || dvd->vd_lh == NULL)
+		return (EINVAL);
+
+	error = vdev_disk_physio(dvd->vd_lh, data, size, offset, flags);
+
+	if (zio_injection_enabled && error == 0)
+		error = zio_handle_device_injection(vd, EIO);
+
+	return (error);
+}
+
+/*
+ * Determine if the underlying device is accessible by reading and writing
+ * to a known location. We must be able to do this during syncing context
+ * and thus we cannot set the vdev state directly.
+ *
+ * The probe reads the vl_pad area of a randomly chosen label and writes the
+ * same bytes back. If the first read fails, the device is reopened once
+ * through a temporary, private vdev_t and the read is retried on that.
+ * Returns 0 if the device responded (and clears vdev_is_failing), or an
+ * errno otherwise.
+ */
+static int
+vdev_disk_probe(vdev_t *vd)
+{
+ uint64_t offset;
+ vdev_t *nvd;
+ int l, error = 0, retries = 0;
+ char *vl_pad;
+
+ if (vd == NULL)
+ return (EINVAL);
+
+ /* Hijack the current vdev */
+ nvd = vd;
+
+ /*
+ * Pick a random label to rewrite.
+ */
+ l = spa_get_random(VDEV_LABELS);
+ ASSERT(l < VDEV_LABELS);
+
+ offset = vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_pad));
+
+ vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP);
+
+ /*
+ * Try to read and write to a special location on the
+ * label. We use the existing vdev initially and only
+ * try to create and reopen it if we encounter a failure.
+ */
+ /*
+ * The body runs at most once: 'retries' becomes 1 inside it, so a
+ * second read failure ends the loop with 'error' still set.
+ */
+ while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
+ offset, B_READ)) != 0 && retries == 0) {
+
+ /* Build a scratch vdev carrying copies of the identifying fields. */
+ nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ if (vd->vdev_path)
+ nvd->vdev_path = spa_strdup(vd->vdev_path);
+ if (vd->vdev_physpath)
+ nvd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (vd->vdev_devid)
+ nvd->vdev_devid = spa_strdup(vd->vdev_devid);
+ nvd->vdev_wholedisk = vd->vdev_wholedisk;
+ nvd->vdev_guid = vd->vdev_guid;
+ retries++;
+
+ error = vdev_disk_open_common(nvd);
+ if (error)
+ break;
+ }
+
+ /* The read succeeded; write the same bytes back to test writes. */
+ if (!error) {
+ error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
+ offset, B_WRITE);
+ }
+
+ /* Clean up if we allocated a new vdev */
+ if (retries) {
+ vdev_disk_close(nvd);
+ if (nvd->vdev_path)
+ spa_strfree(nvd->vdev_path);
+ if (nvd->vdev_physpath)
+ spa_strfree(nvd->vdev_physpath);
+ if (nvd->vdev_devid)
+ spa_strfree(nvd->vdev_devid);
+ kmem_free(nvd, sizeof (vdev_t));
+ }
+ kmem_free(vl_pad, VDEV_SKIP_SIZE);
+
+ /* Reset the failing flag */
+ if (!error)
+ vd->vdev_is_failing = B_FALSE;
+
+ return (error);
+}
+
+/*
+ * Completion routine installed as b_iodone by vdev_disk_io_start().
+ * Records the transfer's outcome in the zio, frees the vdev_disk_buf_t
+ * that wraps 'bp', and hands the zio to zio_interrupt().
+ */
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+	/* 'bp' is the first member of its enclosing vdev_disk_buf_t. */
+	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
+	zio_t *zio = vdb->vdb_io;
+	int error = geterror(bp);
+
+	/* A short transfer with no reported error still counts as EIO. */
+	if (error == 0 && bp->b_resid != 0)
+		error = EIO;
+	zio->io_error = error;
+
+	kmem_free(vdb, sizeof (vdev_disk_buf_t));
+
+	zio_interrupt(zio);
+}
+
+/*
+ * Completion callback registered by vdev_disk_io_start() in
+ * zio->io_dk_callback for the DKIOCFLUSHWRITECACHE ioctl. 'zio_arg' is the
+ * zio stored as the callback cookie; 'error' becomes its io_error.
+ */
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+ zio_t *zio = zio_arg;
+
+ zio->io_error = error;
+
+ zio_interrupt(zio);
+}
+
+/*
+ * Start an I/O on a disk vdev.
+ *
+ * Ioctls are handled inline; only DKIOCFLUSHWRITECACHE is supported. Reads
+ * are first offered to the vdev cache. Everything else goes through the
+ * vdev queue and is then issued with ldi_strategy(), completing in
+ * vdev_disk_io_intr(). Returns ZIO_PIPELINE_CONTINUE when the zio is
+ * finished here (the ioctl paths that do not go asynchronous), and
+ * ZIO_PIPELINE_STOP when it will be resumed later from elsewhere.
+ */
+static int
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_buf_t *vdb;
+ buf_t *bp;
+ int flags, error;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = ENXIO;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ switch (zio->io_cmd) {
+
+ case DKIOCFLUSHWRITECACHE:
+
+ /* Flushes are disabled globally: finish without error. */
+ if (zfs_nocacheflush)
+ break;
+
+ if (vd->vdev_nowritecache) {
+ zio->io_error = ENOTSUP;
+ break;
+ }
+
+ zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+ zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
+ zio->io_dk_callback.dkc_cookie = zio;
+
+ error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+ (uintptr_t)&zio->io_dk_callback,
+ FKIOCTL, kcred, NULL);
+
+ if (error == 0) {
+ /*
+ * The ioctl will be done asynchronously,
+ * and will call vdev_disk_ioctl_done()
+ * upon completion.
+ */
+ return (ZIO_PIPELINE_STOP);
+ }
+
+ if (error == ENOTSUP || error == ENOTTY) {
+ /*
+ * If we get ENOTSUP or ENOTTY, we know that
+ * no future attempts will ever succeed.
+ * In this case we set a persistent bit so
+ * that we don't bother with the ioctl in the
+ * future.
+ */
+ vd->vdev_nowritecache = B_TRUE;
+ }
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ /* A return of 0 means the vdev cache has taken over this read. */
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return (ZIO_PIPELINE_STOP);
+
+ /* The queue may hand back a different zio to issue, or none at all. */
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return (ZIO_PIPELINE_STOP);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ else
+ error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ /* A pending removal or a failing device overrides any result above. */
+ error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
+
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ return (ZIO_PIPELINE_STOP);
+ }
+
+ flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+ flags |= B_BUSY | B_NOCACHE;
+ if (zio->io_flags & ZIO_FLAG_FAILFAST)
+ flags |= B_FAILFAST;
+
+ /* Freed by vdev_disk_io_intr() when the transfer completes. */
+ vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+
+ vdb->vdb_io = zio;
+ bp = &vdb->vdb_buf;
+
+ bioinit(bp);
+ bp->b_flags = flags;
+ bp->b_bcount = zio->io_size;
+ bp->b_un.b_addr = zio->io_data;
+ bp->b_lblkno = lbtodb(zio->io_offset);
+ bp->b_bufsize = zio->io_size;
+ bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+ error = ldi_strategy(dvd->vd_lh, bp);
+ /* ldi_strategy() will return non-zero only on programming errors */
+ ASSERT(error == 0);
+
+ return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * Completion-stage processing for a disk vdev I/O: notify the vdev queue,
+ * push completed writes into the vdev cache, apply fault injection, and
+ * react to EIO by checking whether the device was removed or is failing.
+ * Always returns ZIO_PIPELINE_CONTINUE.
+ */
+static int
+vdev_disk_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+ /* Called for every write, whatever its io_error. */
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ /*
+ * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
+ * the device has been removed. If this is the case, then we trigger an
+ * asynchronous removal of the device. Otherwise, probe the device and
+ * make sure it's still accessible.
+ */
+ if (zio->io_error == EIO) {
+ vdev_t *vd = zio->io_vd;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ int state;
+
+ state = DKIO_NONE;
+ /*
+ * The probe branch is also taken when dvd is NULL or when the
+ * DKIOCSTATE ioctl itself fails.
+ */
+ if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
+ FKIOCTL, kcred, NULL) == 0 &&
+ state != DKIO_INSERTED) {
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (vdev_probe(vd) != 0) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vd->vdev_is_failing = B_TRUE;
+ }
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Operations vector for disk vdevs. The initializers are positional, so
+ * their order has to match the member order of vdev_ops_t, which is
+ * declared outside this file.
+ */
+vdev_ops_t vdev_disk_ops = {
+ vdev_disk_open,
+ vdev_disk_close,
+ vdev_disk_probe,
+ vdev_default_asize,
+ vdev_disk_io_start,
+ vdev_disk_io_done,
+ NULL, /* NOTE(review): no handler for this slot; confirm against vdev_ops_t */
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+/*
+ * Given the root disk device pathname, read the label from the device,
+ * and construct a configuration nvlist.
+ *
+ * Each of the VDEV_LABELS labels is tried in turn until one unpacks into a
+ * config whose pool state is below POOL_STATE_DESTROYED and whose txg is
+ * non-zero.  Returns that nvlist, which the caller must free, or NULL if
+ * the device cannot be opened or sized or no label qualifies.  The device
+ * handle opened here is closed again on every path.
+ */
+nvlist_t *
+vdev_disk_read_rootlabel(char *devpath)
+{
+	nvlist_t *config = NULL;
+	ldi_handle_t vd_lh;
+	vdev_label_t *label;
+	uint64_t s, size;
+	int l;
+
+	/*
+	 * Read the device label and build the nvlist.
+	 */
+	if (ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, zfs_li))
+		return (NULL);
+
+	if (ldi_get_size(vd_lh, &s)) {
+		/* Don't leak the handle opened above. */
+		(void) ldi_close(vd_lh, FREAD, kcred);
+		return (NULL);
+	}
+
+	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
+	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		uint64_t offset, state, txg = 0;
+
+		/* read vdev label */
+		offset = vdev_label_offset(size, l, 0);
+		if (vdev_disk_physio(vd_lh, (caddr_t)label,
+		    VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE +
+		    VDEV_PHYS_SIZE, offset, B_READ) != 0)
+			continue;
+
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), &config, 0) != 0) {
+			config = NULL;
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
+			nvlist_free(config);
+			config = NULL;
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0) {
+			nvlist_free(config);
+			config = NULL;
+			continue;
+		}
+
+		break;
+	}
+
+	kmem_free(label, sizeof (vdev_label_t));
+	(void) ldi_close(vd_lh, FREAD, kcred);
+	return (config);
+}
diff --git a/zfs/lib/libzpool/vdev_file.c b/zfs/lib/libzpool/vdev_file.c
new file mode 100644
index 000000000..974c4cdab
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_file.c
@@ -0,0 +1,340 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_file.c 1.7 07/11/27 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+/*
+ * Open the file named by vd->vdev_path and record its vnode in a newly
+ * allocated vdev_file_t stored in vd->vdev_tsd.
+ *
+ * Returns 0 on success, EINVAL for a missing or non-absolute path, the
+ * vn_openat() error if the open fails, or (kernel builds only) ENODEV when
+ * the vnode is not a regular file. vdev_stat.vs_aux is set on each failure.
+ */
+static int
+vdev_file_open_common(vdev_t *vd)
+{
+	vdev_file_t *vfile;
+	vnode_t *vnode;
+	int err;
+
+	/* A pathname is mandatory and it has to be absolute. */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	vfile = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+	vd->vdev_tsd = vfile;
+
+	/*
+	 * Files are always opened relative to the root of the global zone,
+	 * even from a local zone: by the time we get here the administrator
+	 * has already chosen to expose the pool, and hence its backing
+	 * devices, to local zone users.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+	err = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
+	    spa_mode | FOFFMAX, 0, &vnode, 0, 0, rootdir, -1);
+	if (err != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (err);
+	}
+
+	vfile->vf_vnode = vnode;
+
+#ifdef _KERNEL
+	/* Only regular files are acceptable. */
+	if (vnode->v_type != VREG) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (ENODEV);
+	}
+#endif
+
+	return (0);
+}
+
+/*
+ * Open a file vdev and report its geometry: *psize receives the file size
+ * from VOP_GETATTR() and *ashift is set to SPA_MINBLOCKSHIFT. Returns 0 on
+ * success or the error from the open or attribute lookup.
+ */
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vattr_t attrs;
+	vdev_file_t *vfile;
+	int err;
+
+	err = vdev_file_open_common(vd);
+	if (err != 0)
+		return (err);
+
+	vfile = vd->vdev_tsd;
+
+	/* Ask only for the size attribute. */
+	attrs.va_mask = AT_SIZE;
+	err = VOP_GETATTR(vfile->vf_vnode, &attrs, 0, kcred, NULL);
+	if (err != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (err);
+	}
+
+	*psize = attrs.va_size;
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Tear down the per-vdev file state: invalidate pages, close and release
+ * the vnode if one was opened, free the vdev_file_t and clear vd->vdev_tsd.
+ * A vdev with no vdev_tsd is left untouched.
+ */
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *vfile = vd->vdev_tsd;
+
+	if (vfile != NULL) {
+		if (vfile->vf_vnode != NULL) {
+			(void) VOP_PUTPAGE(vfile->vf_vnode, 0, 0, B_INVAL,
+			    kcred, NULL);
+			(void) VOP_CLOSE(vfile->vf_vnode, spa_mode, 1, 0,
+			    kcred, NULL);
+			VN_RELE(vfile->vf_vnode);
+		}
+
+		kmem_free(vfile, sizeof (vdev_file_t));
+		vd->vdev_tsd = NULL;
+	}
+}
+
+/*
+ * Perform one synchronous probe transfer of 'size' bytes at 'offset' on the
+ * file backing 'vd', in the direction given by 'rw' (UIO_READ or UIO_WRITE).
+ *
+ * Returns EINVAL if 'vd' is NULL or has no open vnode, EIO if vn_rdwr()
+ * fails or transfers fewer bytes than requested, and 0 otherwise.
+ */
+static int
+vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
+    enum uio_rw rw)
+{
+	vdev_file_t *vf;
+	ssize_t resid;
+	int error = 0;
+
+	/* Validate 'vd' before touching any of its fields. */
+	if (vd == NULL)
+		return (EINVAL);
+
+	vf = vd->vdev_tsd;
+	if (vf == NULL || vf->vf_vnode == NULL)
+		return (EINVAL);
+
+	ASSERT(rw == UIO_READ || rw == UIO_WRITE);
+
+	error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid);
+	/* 'resid' is only consulted when the transfer itself succeeded. */
+	if (error || resid != 0)
+		return (EIO);
+	return (0);
+}
+
+/*
+ * Determine if the underlying device is accessible by reading and writing
+ * to a known location. We must be able to do this during syncing context
+ * and thus we cannot set the vdev state directly.
+ *
+ * The known location is the boot header of one randomly chosen label. If
+ * the read through 'vd' fails, a temporary vdev_t is opened on the same
+ * path and the read is retried once through it. When spa_mode includes
+ * FWRITE and the read succeeded, the bytes just read are written back.
+ *
+ * Returns 0 and clears vd->vdev_is_failing on success. Returns EINVAL for
+ * a NULL 'vd', otherwise the error from the failing open, read or write.
+ */
+static int
+vdev_file_probe(vdev_t *vd)
+{
+ vdev_t *nvd;
+ char *vl_boot;
+ uint64_t offset;
+ int l, error = 0, retries = 0;
+
+ if (vd == NULL)
+ return (EINVAL);
+
+ /* Hijack the current vdev */
+ nvd = vd;
+
+ /*
+ * Pick a random label to rewrite.
+ */
+ l = spa_get_random(VDEV_LABELS);
+ ASSERT(l < VDEV_LABELS);
+
+ offset = vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_boot_header));
+
+ vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
+
+ while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
+ offset, UIO_READ)) != 0 && retries == 0) {
+
+ /*
+ * If we failed with the vdev that was passed in then
+ * try allocating a new one and try again.
+ * The 'retries == 0' term above limits this to one retry.
+ */
+ nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ if (vd->vdev_path)
+ nvd->vdev_path = spa_strdup(vd->vdev_path);
+ retries++;
+
+ error = vdev_file_open_common(nvd);
+ if (error)
+ break;
+ }
+
+ if ((spa_mode & FWRITE) && !error) {
+ error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
+ offset, UIO_WRITE);
+ }
+
+ if (retries) { /* nvd is the temporary vdev allocated in the loop */
+ vdev_file_close(nvd);
+ if (nvd->vdev_path)
+ spa_strfree(nvd->vdev_path);
+ kmem_free(nvd, sizeof (vdev_t));
+ }
+ kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
+
+ if (!error)
+ vd->vdev_is_failing = B_FALSE;
+
+ return (error);
+}
+
+/*
+ * Start an I/O on a file vdev.
+ *
+ * IOCTL zios are marked with zio_vdev_io_bypass() and handled inline:
+ * DKIOCFLUSHWRITECACHE becomes a VOP_FSYNC(), anything else gets ENOTSUP,
+ * and an unreadable vdev gets ENXIO. These return ZIO_PIPELINE_CONTINUE.
+ *
+ * Reads and writes go through vdev_queue_io(), are rejected with ENXIO (or
+ * an injected error) when the vdev is not usable, and are otherwise carried
+ * out synchronously with vn_rdwr(); a short transfer is reported as ENOSPC.
+ * These paths hand the zio to zio_interrupt() and return ZIO_PIPELINE_STOP.
+ */
+static int
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+	ssize_t resid;
+	int error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = ENXIO;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+			    kcred, NULL);
+			dprintf("fsync(%s) = %d\n", vdev_description(vd),
+			    zio->io_error);
+			break;
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	/*
+	 * In the kernel, don't bother double-caching, but in userland,
+	 * we want to test the vdev_cache code.
+	 */
+#ifndef _KERNEL
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return (ZIO_PIPELINE_STOP);
+#endif
+
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return (ZIO_PIPELINE_STOP);
+
+	/* XXPOLICY */
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+	else
+		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
+	if (error) {
+		zio->io_error = error;
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
+	}
+
+	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
+	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid);
+
+	/*
+	 * Check the error first: 'resid' may not have been written by a
+	 * failed vn_rdwr(), so it must only be read after a success.
+	 */
+	if (zio->io_error == 0 && resid != 0)
+		zio->io_error = ENOSPC;
+
+	zio_interrupt(zio);
+
+	return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * Completion stage for file vdev I/O: apply device error injection, probe
+ * the device after an EIO, notify the vdev queue and, in userland builds,
+ * push completed writes into the vdev cache. Always returns
+ * ZIO_PIPELINE_CONTINUE.
+ */
+static int
+vdev_file_io_done(zio_t *zio)
+{
+	vdev_t *vdev = zio->io_vd;
+
+	if (zio_injection_enabled && zio->io_error == 0)
+		zio->io_error = zio_handle_device_injection(vdev, EIO);
+
+	/*
+	 * After an EIO, probe the device; a failed probe marks the vdev
+	 * as failing.
+	 */
+	if (zio->io_error == EIO) {
+		if (vdev_probe(vdev) != 0)
+			vdev->vdev_is_failing = B_TRUE;
+	}
+
+	vdev_queue_io_done(zio);
+
+#ifndef _KERNEL
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+#endif
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+vdev_ops_t vdev_file_ops = { /* operations vector for file vdevs */
+ vdev_file_open, /* open handler */
+ vdev_file_close, /* close handler */
+ vdev_file_probe, /* probe handler */
+ vdev_default_asize, /* asize: shared default implementation */
+ vdev_file_io_start, /* I/O start handler */
+ vdev_file_io_done, /* I/O done handler */
+ NULL, /* slot left unset for this vdev type */
+ VDEV_TYPE_FILE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ *
+ * When _KERNEL is not defined, the disk operations vector is therefore
+ * built from the same vdev_file_* handlers as vdev_file_ops; only the type
+ * name (VDEV_TYPE_DISK) differs.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_file_probe,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+#endif
diff --git a/zfs/lib/libzpool/vdev_label.c b/zfs/lib/libzpool/vdev_label.c
new file mode 100644
index 000000000..7dcf1facd
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_label.c
@@ -0,0 +1,1045 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_label.c 1.18 07/12/12 SMI"
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure. The label includes 8k of padding to permit legacy
+ * VTOC disk labels, but is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+	uint64_t tail_skip = 0;
+
+	ASSERT(offset < sizeof (vdev_label_t));
+	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
+
+	/*
+	 * Labels in the first half of the set are laid out from offset 0;
+	 * the remaining ones are positioned so that the set ends at psize.
+	 */
+	if (!(l < VDEV_LABELS / 2))
+		tail_skip = psize - VDEV_LABELS * sizeof (vdev_label_t);
+
+	return (offset + l * sizeof (vdev_label_t) + tail_skip);
+}
+
+/*
+ * Issue an asynchronous, label-checksummed physical read of 'size' bytes at
+ * 'offset' within label 'l' of leaf vdev 'vd', as a child of 'zio'. The
+ * read may fail, is speculative and assumes the config lock is held.
+ */
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private)
+{
+	uint64_t dev_offset;
+	zio_t *child;
+
+	ASSERT(vd->vdev_children == 0);
+
+	dev_offset = vdev_label_offset(vd->vdev_psize, l, offset);
+	child = zio_read_phys(zio, vd, dev_offset, size, buf,
+	    ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    B_TRUE);
+	zio_nowait(child);
+}
+
+/*
+ * Issue an asynchronous, label-checksummed physical write of 'size' bytes at
+ * 'offset' within label 'l' of leaf vdev 'vd', as a child of 'zio', using
+ * the caller-supplied zio 'flags'.
+ */
+static void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+	uint64_t dev_offset;
+	zio_t *child;
+
+	ASSERT(vd->vdev_children == 0);
+
+	dev_offset = vdev_label_offset(vd->vdev_psize, l, offset);
+	child = zio_write_phys(zio, vd, dev_offset, size, buf,
+	    ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE,
+	    flags, B_TRUE);
+	zio_nowait(child);
+}
+
+/*
+ * Generate the nvlist representing this vdev's config.
+ *
+ * The type and guid are always recorded; path, devid, physpath, nparity,
+ * wholedisk, not-present, is-spare and the DTL object are recorded when
+ * set. The vdev id is written only when neither 'isspare' nor 'isl2cache'
+ * is set, and the metaslab array/shift, ashift, asize and is_log fields
+ * are additionally limited to top-level vdevs. With 'getstats', the
+ * vdev_stat_t from vdev_get_stats() is attached as a uint64 array.
+ *
+ * Non-leaf vdevs recurse into each child and attach the results as the
+ * children array; leaves record their offline (unless only temporarily
+ * offline), faulted, degraded, removed and unspare flags.
+ *
+ * Returns the newly allocated nvlist.
+ */
+nvlist_t *
+vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
+ boolean_t isspare, boolean_t isl2cache)
+{
+ nvlist_t *nv = NULL;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+ vd->vdev_ops->vdev_op_type) == 0);
+ if (!isspare && !isl2cache)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
+ == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+
+ if (vd->vdev_path != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
+ vd->vdev_path) == 0);
+
+ if (vd->vdev_devid != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
+ vd->vdev_devid) == 0);
+
+ if (vd->vdev_physpath != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath) == 0);
+
+ if (vd->vdev_nparity != 0) {
+ ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
+ VDEV_TYPE_RAIDZ) == 0);
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vd->vdev_nparity == 1 ||
+ (vd->vdev_nparity == 2 &&
+ spa_version(spa) >= SPA_VERSION_RAID6));
+
+ /*
+ * Note that we'll add the nparity tag even on storage pools
+ * that only support a single parity device -- older software
+ * will just ignore it.
+ */
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ vd->vdev_nparity) == 0);
+ }
+
+ if (vd->vdev_wholedisk != -1ULL) /* -1ULL is treated as "not set" */
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk) == 0);
+
+ if (vd->vdev_not_present)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+
+ if (vd->vdev_isspare)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+
+ if (!isspare && !isl2cache && vd == vd->vdev_top) {
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+ vd->vdev_ashift) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
+ vd->vdev_islog) == 0);
+ }
+
+ if (vd->vdev_dtl.smo_object != 0)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ vd->vdev_dtl.smo_object) == 0);
+
+ if (getstats) {
+ vdev_stat_t vs;
+ vdev_get_stats(vd, &vs);
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t **child;
+ int c;
+
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ child[c] = vdev_config_generate(spa, vd->vdev_child[c],
+ getstats, isspare, isl2cache);
+
+ VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, vd->vdev_children) == 0);
+
+ for (c = 0; c < vd->vdev_children; c++) /* temporaries freed once added */
+ nvlist_free(child[c]);
+
+ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+
+ } else {
+ if (vd->vdev_offline && !vd->vdev_tmpoffline)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ B_TRUE) == 0);
+ if (vd->vdev_faulted)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
+ B_TRUE) == 0);
+ if (vd->vdev_degraded)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+ B_TRUE) == 0);
+ if (vd->vdev_removed)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED,
+ B_TRUE) == 0);
+ if (vd->vdev_unspare)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+ B_TRUE) == 0);
+ }
+
+ return (nv);
+}
+
+/*
+ * Read and unpack the config nvlist stored in the labels of 'vd'. Labels
+ * are tried in order and the first one that both reads and unpacks
+ * successfully is returned; NULL is returned if the vdev is not readable or
+ * no label yields a config. The config lock must be held.
+ */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	nvlist_t *config = NULL;
+	vdev_phys_t *phys;
+	zio_t *root;
+	int label;
+
+	ASSERT(spa_config_held(spa, RW_READER) ||
+	    spa_config_held(spa, RW_WRITER));
+
+	if (!vdev_readable(vd))
+		return (NULL);
+
+	phys = zio_buf_alloc(sizeof (vdev_phys_t));
+
+	for (label = 0; label < VDEV_LABELS; label++) {
+
+		root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL |
+		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
+
+		vdev_label_read(root, vd, label, phys,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t), NULL, NULL);
+
+		/* Only attempt to unpack a label that was read cleanly. */
+		if (zio_wait(root) == 0) {
+			if (nvlist_unpack(phys->vp_nvlist,
+			    sizeof (phys->vp_nvlist), &config, 0) == 0)
+				break;
+		}
+
+		/* Discard anything a failed unpack may have left behind. */
+		if (config == NULL)
+			continue;
+		nvlist_free(config);
+		config = NULL;
+	}
+
+	zio_buf_free(phys, sizeof (vdev_phys_t));
+
+	return (config);
+}
+
+/*
+ * Determine if a device is in use. The 'spare_guid' parameter will be filled
+ * in with the device guid if this spare is active elsewhere on the system.
+ *
+ * 'spare_guid' and 'l2cache_guid' may be NULL; when non-NULL both are
+ * zeroed on entry. NOTE(review): nothing in this function stores a
+ * non-zero value through 'l2cache_guid' -- confirm that is intended.
+ *
+ * Returns B_FALSE when no label can be read or it lacks the fields checked
+ * below; otherwise the result depends on the label state, the known
+ * spare/l2cache lists and 'reason'.
+ */
+static boolean_t
+vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
+ uint64_t *spare_guid, uint64_t *l2cache_guid)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t state, pool_guid, device_guid, txg, spare_pool;
+ uint64_t vdtxg = 0;
+ nvlist_t *label;
+
+ if (spare_guid)
+ *spare_guid = 0ULL;
+ if (l2cache_guid)
+ *l2cache_guid = 0ULL;
+
+ /*
+ * Read the label, if any, and perform some basic sanity checks.
+ */
+ if ((label = vdev_label_read_config(vd)) == NULL)
+ return (B_FALSE);
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &vdtxg);
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) != 0) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0)) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Check to see if this device indeed belongs to the pool it claims to
+ * be a part of. The only way this is allowed is if the device is a hot
+ * spare (which we check for later on).
+ * pool_guid and txg are only looked up for states other than SPARE
+ * and L2CACHE, so every use of them below repeats that state test.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL) &&
+ !spa_l2cache_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+ * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add. We also check if it is an l2cache device.
+ * NOTE(review): spare_pool is written only by spa_spare_exists();
+ * presumably it stores a value even when it returns false -- confirm,
+ * since the REPLACE case reads it when only spa_has_spare() matched.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) { /* other reasons fall out to the checks below */
+ case VDEV_LABEL_CREATE:
+ case VDEV_LABEL_L2CACHE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ }
+ }
+
+ /*
+ * Check to see if this is an l2cache device.
+ */
+ if (spa_l2cache_exists(device_guid, NULL))
+ return (B_TRUE);
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use, and writable. We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ *
+ * The function recurses over all children first and only does work on
+ * leaves. 'crtxg' is stored in the label as the creation txg and 'reason'
+ * selects which kind of label (regular, spare or l2cache) is written.
+ *
+ * Returns 0 on success (including for non-leaf vdevs), EIO for a dead vdev,
+ * EBUSY if the device is in use, ENAMETOOLONG or EINVAL if the label cannot
+ * be packed, or the error from writing the labels. The config lock must be
+ * held as writer.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ vdev_boot_header_t *vb;
+ uberblock_t *ub;
+ zio_t *zio;
+ int l, c, n;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid, l2cache_guid;
+ int flags = ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (EIO);
+
+ /*
+ * Determine if the vdev is in use.
+ * spare_guid and l2cache_guid are only filled in by this call, which
+ * is skipped for VDEV_LABEL_REMOVE; the uses below are guarded by the
+ * same reason test.
+ */
+ if (reason != VDEV_LABEL_REMOVE &&
+ vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
+ return (EBUSY);
+
+ ASSERT(reason != VDEV_LABEL_REMOVE ||
+ vdev_inuse(vd, crtxg, reason, NULL, NULL));
+
+ /*
+ * If this is a request to add or replace a spare or l2cache device
+ * that is in use elsewhere on the system, then we must update the
+ * guid (which was initialized to a random value) to reflect the
+ * actual GUID (which is shared between multiple pools).
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
+ spare_guid != 0ULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ for (; pvd != NULL; pvd = pvd->vdev_parent) {
+ pvd->vdev_guid_sum -= vd->vdev_guid;
+ pvd->vdev_guid_sum += spare_guid;
+ }
+
+ vd->vdev_guid = vd->vdev_guid_sum = spare_guid;
+
+ /*
+ * If this is a replacement, then we want to fallthrough to the
+ * rest of the code. If we're adding a spare, then it's already
+ * labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_SPARE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
+ l2cache_guid != 0ULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ for (; pvd != NULL; pvd = pvd->vdev_parent) {
+ pvd->vdev_guid_sum -= vd->vdev_guid;
+ pvd->vdev_guid_sum += l2cache_guid;
+ }
+
+ vd->vdev_guid = vd->vdev_guid_sum = l2cache_guid;
+
+ /*
+ * If this is a replacement, then we want to fallthrough to the
+ * rest of the code. If we're adding an l2cache, then it's
+ * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_L2CACHE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ /*
+ * Initialize its label.
+ */
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ if (reason == VDEV_LABEL_SPARE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ /*
+ * For inactive hot spares, we generate a special label that
+ * identifies as a mutually shared hot spare. We write the
+ * label if we are adding a hot spare, or if we are removing an
+ * active hot spare (in which case we want to revert the
+ * labels).
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_SPARE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else if (reason == VDEV_LABEL_L2CACHE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
+ /*
+ * For level 2 ARC devices, add a special label.
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_L2CACHE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else {
+ label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+
+ /*
+ * Add our creation time. This allows us to detect multiple
+ * vdev uses as described above, and automatically expires if we
+ * fail.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ crtxg) == 0);
+ }
+
+ buf = vp->vp_nvlist; /* pack directly into the on-disk buffer */
+ buflen = sizeof (vp->vp_nvlist);
+
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(label);
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ /* EFAULT means nvlist_pack ran out of room */
+ return (error == EFAULT ? ENAMETOOLONG : EINVAL);
+ }
+
+ /*
+ * Initialize boot block header.
+ */
+ vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
+ bzero(vb, sizeof (vdev_boot_header_t));
+ vb->vb_magic = VDEV_BOOT_MAGIC;
+ vb->vb_version = VDEV_BOOT_VERSION;
+ vb->vb_offset = VDEV_BOOT_OFFSET;
+ vb->vb_size = VDEV_BOOT_SIZE;
+
+ /*
+ * Initialize uberblock template.
+ * It is a copy of the pool's current uberblock with the txg zeroed.
+ */
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
+ bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ *ub = spa->spa_uberblock;
+ ub->ub_txg = 0;
+
+ /*
+ * Write everything in parallel.
+ * Every label gets the packed config, the boot header and the
+ * template in each uberblock slot; zio_wait() collects the result.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL, flags);
+
+ vdev_label_write(zio, vd, l, vb,
+ offsetof(vdev_label_t, vl_boot_header),
+ sizeof (vdev_boot_header_t), NULL, NULL, flags);
+
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_write(zio, vd, l, ub,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
+ }
+ }
+
+ error = zio_wait(zio);
+
+ nvlist_free(label);
+ zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+ zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ /*
+ * If this vdev hasn't been previously identified as a spare, then we
+ * mark it as such only if a) we are labeling it as a spare, or b) it
+ * exists as a spare elsewhere in the system. Do the same for
+ * level 2 ARC devices.
+ */
+ if (error == 0 && !vd->vdev_isspare &&
+ (reason == VDEV_LABEL_SPARE ||
+ spa_spare_exists(vd->vdev_guid, NULL)))
+ spa_spare_add(vd);
+
+ if (error == 0 && !vd->vdev_isl2cache &&
+ (reason == VDEV_LABEL_L2CACHE ||
+ spa_l2cache_exists(vd->vdev_guid, NULL)))
+ spa_l2cache_add(vd);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+	/* Primary key: transaction group. */
+	if (ub1->ub_txg != ub2->ub_txg)
+		return (ub1->ub_txg < ub2->ub_txg ? -1 : 1);
+
+	/* Tie-breaker: timestamp. */
+	if (ub1->ub_timestamp != ub2->ub_timestamp)
+		return (ub1->ub_timestamp < ub2->ub_timestamp ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * Completion callback for one uberblock read. If the read succeeded and
+ * the uberblock verifies, it replaces the best candidate (io_private) when
+ * it compares greater, under spa_uberblock_lock. The read buffer is always
+ * freed here.
+ */
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	uberblock_t *candidate = zio->io_data;
+	uberblock_t *best = zio->io_private;
+
+	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+
+	if (zio->io_error == 0) {
+		if (uberblock_verify(candidate) == 0) {
+			mutex_enter(&spa->spa_uberblock_lock);
+			if (vdev_uberblock_compare(candidate, best) > 0)
+				*best = *candidate;
+			mutex_exit(&spa->spa_uberblock_lock);
+		}
+	}
+
+	zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Issue reads for every uberblock slot in every label of every live leaf
+ * beneath 'vd', as children of 'zio'. Each read gets its own buffer, which
+ * vdev_uberblock_load_done() frees; 'ubbest' accumulates the best result.
+ */
+void
+vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+{
+	int child, label, slot;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		vdev_uberblock_load(zio, vd->vdev_child[child], ubbest);
+
+	/* Only live leaf vdevs carry labels we can read. */
+	if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
+		return;
+
+	for (label = 0; label < VDEV_LABELS; label++) {
+		for (slot = 0; slot < VDEV_UBERBLOCK_COUNT(vd); slot++) {
+			void *ubuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
+
+			vdev_label_read(zio, vd, label, ubuf,
+			    VDEV_UBERBLOCK_OFFSET(vd, slot),
+			    VDEV_UBERBLOCK_SIZE(vd),
+			    vdev_uberblock_load_done, ubbest);
+		}
+	}
+}
+
+/*
+ * Completion callback for one uberblock write: bump the shared good-write
+ * counter (io_private) when the write succeeded. Writes only count for
+ * known-visible vdevs, i.e. those whose top-level vdev has a metaslab
+ * array; see spa_vdev_add().
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+	uint64_t *counter = zio->io_private;
+
+	if (zio->io_error != 0)
+		return;
+
+	if (zio->io_vd->vdev_top->vdev_ms_array != 0)
+		atomic_add_64(counter, 1);
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ *
+ * Children are visited recursively; non-leaf and dead vdevs are skipped.
+ * The slot written within each label's uberblock array is selected by
+ * masking ub_txg with (VDEV_UBERBLOCK_COUNT(vd) - 1). The uberblock is
+ * copied into a zeroed buffer of VDEV_UBERBLOCK_SIZE(vd) bytes first.
+ *
+ * NOTE(review): the buffer is freed immediately after the asynchronous
+ * writes are issued; presumably the label write path copies the data
+ * before returning -- confirm against zio_write_phys().
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd)
+{
+ int l, c, n;
+ uberblock_t *ubbuf;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_sync(zio, ub, vd->vdev_child[c]);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
+
+ ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
+ bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
+ *ubbuf = *ub;
+
+ for (l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ubbuf,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_sync_done, zio->io_private,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE);
+
+ zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
+}
+
+/*
+ * Write 'ub' to each of the 'svdcount' vdevs in svd[], wait for the
+ * writes, then flush those vdevs.  Returns 0 when at least one write was
+ * counted by vdev_uberblock_sync_done(), EIO otherwise.
+ */
+int
+vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
+{
+	spa_t *spa = svd[0]->vdev_spa;
+	uint64_t good_writes = 0;
+	zio_t *root;
+	int i;
+
+	root = zio_root(spa, NULL, &good_writes, flags);
+	for (i = 0; i < svdcount; i++)
+		vdev_uberblock_sync(root, ub, svd[i]);
+	(void) zio_wait(root);
+
+	/*
+	 * Flush the uberblocks to disk.  This ensures that the odd labels
+	 * are no longer needed (because the new uberblocks and the even
+	 * labels are safely on disk), so it is safe to overwrite them.
+	 */
+	root = zio_root(spa, NULL, NULL, flags);
+	for (i = 0; i < svdcount; i++)
+		zio_flush(root, svd[i]);
+	(void) zio_wait(root);
+
+	if (good_writes == 0)
+		return (EIO);
+
+	return (0);
+}
+
+/*
+ * Completion callback for a label write: a successful write adds one to
+ * the good-write counter passed through io_private (one counter per
+ * top-level vdev, allocated in vdev_label_sync_list()).
+ */
+static void
+vdev_label_sync_done(zio_t *zio)
+{
+	uint64_t *counter = zio->io_private;
+
+	if (zio->io_error != 0)
+		return;
+
+	atomic_add_64(counter, 1);
+}
+
+/*
+ * Completion callback for the per-vdev null zio created in
+ * vdev_label_sync_list().  If no label write was counted as good, report
+ * EIO on this zio; either way, release the counter.
+ */
+static void
+vdev_label_sync_top_done(zio_t *zio)
+{
+	uint64_t *counter = zio->io_private;
+
+	if (*counter == 0)
+		zio->io_error = EIO;
+
+	kmem_free(counter, sizeof (uint64_t));
+}
+
+/*
+ * Write the even labels (l == 0) or the odd labels (l == 1) of every live
+ * leaf below 'vd'.  The label contents are the packed nvlist produced by
+ * spa_config_generate() for 'txg'.  If packing fails, nothing is written
+ * for that leaf.
+ */
+static void
+vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
+{
+	nvlist_t *config;
+	vdev_phys_t *phys;
+	char *packed;
+	size_t packed_len;
+	int child, label;
+	int packed_ok;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		vdev_label_sync(zio, vd->vdev_child[child], l, txg);
+
+	if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
+		return;
+
+	/*
+	 * Generate a label describing the top-level config to which we belong.
+	 */
+	config = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
+
+	phys = zio_buf_alloc(sizeof (vdev_phys_t));
+	bzero(phys, sizeof (vdev_phys_t));
+
+	/* Pack directly into the nvlist area of the zeroed vdev_phys_t. */
+	packed = phys->vp_nvlist;
+	packed_len = sizeof (phys->vp_nvlist);
+	packed_ok = (nvlist_pack(config, &packed, &packed_len,
+	    NV_ENCODE_XDR, KM_SLEEP) == 0);
+
+	/* Step by two so only labels of the requested parity are written. */
+	for (label = l; packed_ok && label < VDEV_LABELS; label += 2) {
+		vdev_label_write(zio, vd, label, phys,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t),
+		    vdev_label_sync_done, zio->io_private,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE);
+	}
+
+	zio_buf_free(phys, sizeof (vdev_phys_t));
+	nvlist_free(config);
+}
+
+/*
+ * Write labels of parity 'l' for every vdev on the spa's dirty list, then
+ * flush those vdevs.  Returns the result of waiting on the label writes;
+ * the result of the flush is ignored.
+ */
+int
+vdev_label_sync_list(spa_t *spa, int l, int flags, uint64_t txg)
+{
+	list_t *dirty = &spa->spa_dirty_list;
+	zio_t *root;
+	vdev_t *vd;
+	int error;
+
+	/*
+	 * Write the new labels to disk.  Each dirty vdev gets its own null
+	 * zio and good-write counter; vdev_label_sync_top_done() frees the
+	 * counter.
+	 */
+	root = zio_root(spa, NULL, NULL, flags);
+
+	vd = list_head(dirty);
+	while (vd != NULL) {
+		uint64_t *good_writes;
+		zio_t *vio;
+
+		good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+		vio = zio_null(root, spa, vdev_label_sync_top_done,
+		    good_writes, flags);
+		vdev_label_sync(vio, vd, l, txg);
+		zio_nowait(vio);
+
+		vd = list_next(dirty, vd);
+	}
+
+	error = zio_wait(root);
+
+	/*
+	 * Flush the new labels to disk.
+	 */
+	root = zio_root(spa, NULL, NULL, flags);
+
+	for (vd = list_head(dirty); vd != NULL; vd = list_next(dirty, vd))
+		zio_flush(root, vd);
+
+	(void) zio_wait(root);
+
+	return (error);
+}
+
+/*
+ * Sync the uberblock and any changes to the vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
+ * at any time, you can just call it again, and it will resume its work.
+ *
+ * svd[] holds the svdcount vdevs that receive the new uberblock; txg is
+ * the transaction group being synced. Returns 0 on success (including
+ * the cases where there is nothing to write or txg is past the freeze
+ * txg), otherwise the error from the first label or uberblock sync step
+ * that failed. The result of the initial cache flush is ignored.
+ */
+int
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ uberblock_t *ub = &spa->spa_uberblock;
+ vdev_t *vd;
+ zio_t *zio;
+ int error;
+ int flags = ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL;
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors,
+ * and nothing changed in this transaction group,
+ * and the vdev configuration hasn't changed,
+ * then there's nothing to do.
+ *
+ * Note that uberblock_update() is only called when ub_txg < txg,
+ * because of the short-circuit evaluation below.
+ */
+ if (ub->ub_txg < txg &&
+ uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE &&
+ list_is_empty(&spa->spa_dirty_list))
+ return (0);
+
+ if (txg > spa_freeze_txg(spa))
+ return (0); /* past the freeze txg: write nothing */
+
+ ASSERT(txg <= spa->spa_final_txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid. We flush
+ * the new labels to disk to ensure that all even-label updates
+ * are committed to stable storage before the uberblock update.
+ */
+ if ((error = vdev_label_sync_list(spa, 0, flags, txg)) != 0)
+ return (error);
+
+ /*
+ * Sync the uberblocks to all vdevs in svd[].
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
+ return (error);
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date. We flush the new labels
+ * to disk to ensure that all odd-label updates are committed to
+ * stable storage before the next transaction group begins.
+ */
+ return (vdev_label_sync_list(spa, 1, flags, txg));
+}
diff --git a/zfs/lib/libzpool/vdev_mirror.c b/zfs/lib/libzpool/vdev_mirror.c
new file mode 100644
index 000000000..16063fa9a
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_mirror.c
@@ -0,0 +1,496 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_mirror.c 1.9 07/11/27 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+typedef struct mirror_child {
+ vdev_t *mc_vd; /* vdev that child I/Os for this entry are sent to */
+ uint64_t mc_offset; /* offset used for I/O on mc_vd */
+ int mc_error; /* last I/O result, or ENXIO/ESTALE if ruled out */
+ short mc_tried; /* set once an I/O completed or child is unreadable */
+ short mc_skipped; /* set when child selection passed over this child */
+} mirror_child_t;
+
+typedef struct mirror_map {
+ int mm_children; /* number of entries in mm_child[] */
+ int mm_replacing; /* nonzero for replacing and spare vdevs */
+ int mm_preferred; /* index where read child selection starts */
+ int mm_root; /* B_TRUE when built from the bp's DVAs (io_vd == NULL) */
+ mirror_child_t mm_child[1]; /* really mm_children long; sized via offsetof() */
+} mirror_map_t;
+
+int vdev_mirror_shift = 21; /* non-replacing reads prefer (io_offset >> shift) % children */
+
+/*
+ * Allocate and fill the mirror map for 'zio', store it in zio->io_vsd and
+ * return it.  With a vdev attached to the zio there is one map entry per
+ * child vdev; without one (io_vd == NULL) there is one entry per DVA of
+ * the block pointer.
+ */
+static mirror_map_t *
+vdev_mirror_map_alloc(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	mirror_map_t *mm;
+	mirror_child_t *mc;
+	int nchildren, i;
+
+	if (vd != NULL) {
+		nchildren = vd->vdev_children;
+
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchildren]),
+		    KM_SLEEP);
+		mm->mm_children = nchildren;
+		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+		    vd->vdev_ops == &vdev_spare_ops);
+		if (mm->mm_replacing) {
+			mm->mm_preferred = 0;
+		} else {
+			mm->mm_preferred =
+			    (zio->io_offset >> vdev_mirror_shift) % nchildren;
+		}
+		mm->mm_root = B_FALSE;
+
+		for (i = 0; i < mm->mm_children; i++) {
+			mc = &mm->mm_child[i];
+			mc->mc_vd = vd->vdev_child[i];
+			mc->mc_offset = zio->io_offset;
+		}
+	} else {
+		spa_t *spa = zio->io_spa;
+		dva_t *dva = zio->io_bp->blk_dva;
+		int picked;
+
+		nchildren = BP_GET_NDVAS(zio->io_bp);
+
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchildren]),
+		    KM_SLEEP);
+		mm->mm_children = nchildren;
+		mm->mm_replacing = B_FALSE;
+		mm->mm_preferred = spa_get_random(nchildren);
+		mm->mm_root = B_TRUE;
+
+		/*
+		 * Check the other, lower-index DVAs to see if they're on
+		 * the same vdev as the child we picked.  If they are, use
+		 * them since they are likely to have been allocated from
+		 * the primary metaslab in use at the time, and hence are
+		 * more likely to have locality with single-copy data.
+		 * The comparison is always against the DVA picked at
+		 * random, not against the running preference.
+		 */
+		picked = mm->mm_preferred;
+		for (i = picked - 1; i >= 0; i--) {
+			if (DVA_GET_VDEV(&dva[i]) == DVA_GET_VDEV(&dva[picked]))
+				mm->mm_preferred = i;
+		}
+
+		for (i = 0; i < mm->mm_children; i++) {
+			mc = &mm->mm_child[i];
+			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[i]));
+			mc->mc_offset = DVA_GET_OFFSET(&dva[i]);
+		}
+	}
+
+	zio->io_vsd = mm;
+	return (mm);
+}
+
+/*
+ * Release the mirror map attached to 'zio' and clear zio->io_vsd.  The
+ * size is recomputed from mm_children, matching the allocation made in
+ * vdev_mirror_map_alloc().
+ */
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+	mirror_map_t *map = zio->io_vsd;
+	size_t map_size = offsetof(mirror_map_t, mm_child[map->mm_children]);
+
+	kmem_free(map, map_size);
+	zio->io_vsd = NULL;
+}
+
+/*
+ * Open every child of the mirror.  On return *asize has been lowered to
+ * the smallest vdev_asize among the children that opened and *ashift
+ * raised to their largest vdev_ashift.  Returns EINVAL when there are no
+ * children, the last child error when every child failed to open, and 0
+ * otherwise.
+ */
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	uint64_t i;
+	int failed = 0;
+	int lasterror = 0;
+
+	if (vd->vdev_children == 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *cvd = vd->vdev_child[i];
+		int err = vdev_open(cvd);
+
+		if (err != 0) {
+			lasterror = err;
+			failed++;
+		} else {
+			/*
+			 * The -1/+1 dance relies on unsigned wraparound so
+			 * that an *asize of 0 behaves as "no limit yet".
+			 */
+			*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+			*ashift = MAX(*ashift, cvd->vdev_ashift);
+		}
+	}
+
+	if (failed == vd->vdev_children) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
+
+	return (0);
+}
+
+/*
+ * Close every child of the mirror, in index order.
+ */
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+	uint64_t child;
+
+	for (child = 0; child < vd->vdev_children; child++) {
+		vdev_close(vd->vdev_child[child]);
+	}
+}
+
+/*
+ * Completion callback for an ordinary mirror child I/O: record the result
+ * in the child's map entry (passed via io_private) and mark it as tried
+ * and not skipped.
+ */
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+	mirror_child_t *child = zio->io_private;
+
+	child->mc_error = zio->io_error;
+	child->mc_tried = 1;
+	child->mc_skipped = 0;
+}
+
+/*
+ * Completion callback for a scrub read of one mirror child.  A successful
+ * read copies its data into the parent's buffer under the parent's
+ * io_lock.  The per-child buffer allocated in vdev_mirror_io_start() is
+ * then freed and the result recorded in the child's map entry.
+ */
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+	mirror_child_t *child = zio->io_private;
+	zio_t *parent;
+
+	if (zio->io_error == 0) {
+		parent = zio->io_parent;
+
+		mutex_enter(&parent->io_lock);
+		ASSERT3U(zio->io_size, >=, parent->io_size);
+		bcopy(zio->io_data, parent->io_data, parent->io_size);
+		mutex_exit(&parent->io_lock);
+	}
+
+	zio_buf_free(zio->io_data, zio->io_size);
+
+	child->mc_error = zio->io_error;
+	child->mc_tried = 1;
+	child->mc_skipped = 0;
+}
+
+/*
+ * Completion callback for the null zio that parents all repair writes
+ * (see vdev_mirror_io_done()).  io_private is the original mirror zio,
+ * whose map is freed here.
+ */
+static void
+vdev_mirror_repair_done(zio_t *zio)
+{
+	zio_t *mirror_zio = zio->io_private;
+
+	ASSERT(mirror_zio == zio->io_parent);
+	vdev_mirror_map_free(mirror_zio);
+}
+
+/*
+ * Try to find a child whose DTL doesn't contain the block we want to read.
+ * If we can't, try the read on any vdev we haven't already tried.
+ *
+ * The search starts at mm_preferred and wraps around, visiting each child
+ * once. Returns the index of the chosen child in mm_child[], or -1 if
+ * every child has already been tried. Children passed over along the way
+ * are marked in place (mc_error, mc_skipped and, for unreadable children,
+ * mc_tried), so this function modifies the map stored in zio->io_vsd.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ uint64_t txg = zio->io_txg;
+ int i, c;
+
+ ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+
+ /*
+ * Try to find a child whose DTL doesn't contain the block to read.
+ * If a child is known to be completely inaccessible (indicated by
+ * vdev_readable() returning B_FALSE), don't even try.
+ *
+ * 'i' counts the children visited; 'c' is the index being examined.
+ */
+ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
+ if (c >= mm->mm_children)
+ c = 0; /* wrap around to the first child */
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
+ continue;
+ if (vdev_is_dead(mc->mc_vd) && !vdev_readable(mc->mc_vd)) {
+ mc->mc_error = ENXIO;
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
+ continue;
+ }
+ if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
+ return (c);
+ mc->mc_error = ESTALE; /* this child's DTL contains txg */
+ mc->mc_skipped = 1; /* mc_tried stays clear: still a fallback */
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * Look for any child we haven't already tried before giving up.
+ * This scan is in index order and ignores mm_preferred.
+ */
+ for (c = 0; c < mm->mm_children; c++)
+ if (!mm->mm_child[c].mc_tried)
+ return (c);
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+/*
+ * Start a mirror I/O.  Builds the mirror map and then issues child I/Os:
+ *
+ *  - scrub read on a non-replacing mirror: one read per child, each into
+ *    its own buffer;
+ *  - any other read: a single read from the child picked by
+ *    vdev_mirror_child_select(), or none if it returns -1;
+ *  - write: all children, except for a resilvering write to a replacing
+ *    vdev, which normally goes to the last child only.
+ *
+ * Returns the result of zio_wait_for_children_done().
+ */
+static int
+vdev_mirror_io_start(zio_t *zio)
+{
+	mirror_map_t *mm = vdev_mirror_map_alloc(zio);
+	mirror_child_t *mc;
+	int first, count, i;
+
+	if (zio->io_type == ZIO_TYPE_READ &&
+	    (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+		/*
+		 * For scrubbing reads we need to allocate a read
+		 * buffer for each child and issue reads to all
+		 * children.  If any child succeeds, it will copy its
+		 * data into zio->io_data in vdev_mirror_scrub_done.
+		 */
+		for (i = 0; i < mm->mm_children; i++) {
+			mc = &mm->mm_child[i];
+			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+			    mc->mc_vd, mc->mc_offset,
+			    zio_buf_alloc(zio->io_size), zio->io_size,
+			    zio->io_type, zio->io_priority,
+			    ZIO_FLAG_CANFAIL,
+			    vdev_mirror_scrub_done, mc));
+		}
+		return (zio_wait_for_children_done(zio));
+	}
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		/*
+		 * For normal reads just pick one child.
+		 */
+		first = vdev_mirror_child_select(zio);
+		count = (first >= 0) ? 1 : 0;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+		/*
+		 * If this is a resilvering I/O to a replacing vdev,
+		 * only the last child should be written -- unless the
+		 * first child happens to have a DTL entry here as well.
+		 * All other writes go to all children.
+		 */
+		if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
+		    !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
+		    zio->io_txg, 1)) {
+			first = mm->mm_children - 1;
+			count = 1;
+		} else {
+			first = 0;
+			count = mm->mm_children;
+		}
+	}
+
+	/* Issue 'count' child I/Os on consecutive children from 'first'. */
+	for (i = 0; i < count; i++) {
+		mc = &mm->mm_child[first + i];
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+		    mc->mc_vd, mc->mc_offset,
+		    zio->io_data, zio->io_size, zio->io_type,
+		    zio->io_priority, ZIO_FLAG_CANFAIL,
+		    vdev_mirror_child_done, mc));
+	}
+
+	return (zio_wait_for_children_done(zio));
+}
+
+/*
+ * Completion stage of a mirror I/O.
+ *
+ * Tallies the per-child results recorded in the mirror map, then:
+ *
+ *  - for writes, reports success if any child succeeded, frees the map
+ *    and continues the pipeline;
+ *  - for reads with no good copy, reissues the read on the next child
+ *    chosen by vdev_mirror_child_select(), if there is one;
+ *  - for reads with a good copy, optionally rewrites children that
+ *    failed or are stale (repair), deferring the map free to
+ *    vdev_mirror_repair_done();
+ *  - otherwise frees the map and continues the pipeline.
+ *
+ * Returns ZIO_PIPELINE_CONTINUE, or the result of
+ * zio_wait_for_children_done() when new child I/Os were issued.
+ */
+static int
+vdev_mirror_io_done(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	mirror_child_t *mc;
+	int c;
+	int good_copies = 0;		/* children tried without error */
+	int unexpected_errors = 0;	/* errors on children not skipped */
+
+	zio->io_error = 0;
+	zio->io_numerrors = 0;
+
+	for (c = 0; c < mm->mm_children; c++) {
+		mc = &mm->mm_child[c];
+
+		if (mc->mc_tried && mc->mc_error == 0) {
+			good_copies++;
+			continue;
+		}
+
+		/*
+		 * We preserve any EIOs because those may be worth retrying;
+		 * whereas ECKSUM and ENXIO are more likely to be persistent.
+		 */
+		if (mc->mc_error) {
+			if (zio->io_error != EIO)
+				zio->io_error = mc->mc_error;
+			if (!mc->mc_skipped)
+				unexpected_errors++;
+			zio->io_numerrors++;
+		}
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * XXX -- for now, treat partial writes as success.
+		 * XXX -- For a replacing vdev, we need to make sure the
+		 *	  new child succeeds.
+		 */
+		/* XXPOLICY */
+		if (good_copies != 0)
+			zio->io_error = 0;
+		vdev_mirror_map_free(zio);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	/*
+	 * If we don't have a good copy yet, keep trying other children.
+	 */
+	/* XXPOLICY */
+	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+		ASSERT(c >= 0 && c < mm->mm_children);
+		mc = &mm->mm_child[c];
+		dprintf("retrying i/o (err=%d) on child %s\n",
+		    zio->io_error, vdev_description(mc->mc_vd));
+		zio->io_error = 0;
+		zio_vdev_io_redone(zio);
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+		    ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+		    vdev_mirror_child_done, mc));
+		return (zio_wait_for_children_done(zio));
+	}
+
+	/* XXPOLICY */
+	if (good_copies)
+		zio->io_error = 0;
+	else
+		ASSERT(zio->io_error != 0);
+
+	if (good_copies && (spa_mode & FWRITE) &&
+	    (unexpected_errors ||
+	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
+	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
+		zio_t *rio;
+
+		/*
+		 * Use the good data we have in hand to repair damaged children.
+		 *
+		 * We issue all repair I/Os as children of 'rio' to arrange
+		 * that vdev_mirror_map_free(zio) will be invoked after all
+		 * repairs complete, but before we advance to the next stage.
+		 */
+		rio = zio_null(zio, zio->io_spa,
+		    vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
+
+		for (c = 0; c < mm->mm_children; c++) {
+			/*
+			 * Don't rewrite known good children.
+			 * Not only is it unnecessary, it could
+			 * actually be harmful: if the system lost
+			 * power while rewriting the only good copy,
+			 * there would be no good copies left!
+			 */
+			mc = &mm->mm_child[c];
+
+			if (mc->mc_error == 0) {
+				if (mc->mc_tried)
+					continue;
+				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+				    !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+				    zio->io_txg, 1))
+					continue;
+				mc->mc_error = ESTALE;
+			}
+
+			/*
+			 * mc_offset is a uint64_t; cast it so the argument
+			 * matches the %llx conversion on every data model.
+			 */
+			dprintf("resilvered %s @ 0x%llx error %d\n",
+			    vdev_description(mc->mc_vd),
+			    (unsigned long long)mc->mc_offset,
+			    mc->mc_error);
+
+			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
+			    mc->mc_offset, zio->io_data, zio->io_size,
+			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+		}
+
+		zio_nowait(rio);
+
+		return (zio_wait_for_children_done(zio));
+	}
+
+	vdev_mirror_map_free(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Derive the mirror's state from its children: CANT_OPEN when all
+ * children are faulted, DEGRADED when any child is faulted or degraded,
+ * HEALTHY otherwise.
+ */
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted == vd->vdev_children) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_NO_REPLICAS);
+		return;
+	}
+
+	if (degraded + faulted == 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+		return;
+	}
+
+	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_mirror_ops = {
+ vdev_mirror_open, /* opens every child */
+ vdev_mirror_close, /* closes every child */
+ NULL, /* slot left empty; vdev_missing_ops puts its probe routine here */
+ vdev_default_asize, /* same routine is used by every ops table here */
+ vdev_mirror_io_start, /* builds the mirror map and issues child I/Os */
+ vdev_mirror_io_done, /* tallies child results; retries and repairs reads */
+ vdev_mirror_state_change, /* maps faulted/degraded counts to a state */
+ VDEV_TYPE_MIRROR, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = { /* same routines as vdev_mirror_ops */
+ vdev_mirror_open,
+ vdev_mirror_close,
+ NULL, /* slot left empty, as in vdev_mirror_ops */
+ vdev_default_asize,
+ vdev_mirror_io_start, /* vdev_mirror_map_alloc() sets mm_replacing for this ops vector */
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_REPLACING, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = { /* same routines as vdev_mirror_ops */
+ vdev_mirror_open,
+ vdev_mirror_close,
+ NULL, /* slot left empty, as in vdev_mirror_ops */
+ vdev_default_asize,
+ vdev_mirror_io_start, /* vdev_mirror_map_alloc() sets mm_replacing for this ops vector */
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_SPARE, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/zfs/lib/libzpool/vdev_missing.c b/zfs/lib/libzpool/vdev_missing.c
new file mode 100644
index 000000000..2039f7380
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_missing.c
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_missing.c 1.3 07/11/27 SMI"
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt to be made to open all available
+ * devices. Because its GUID is always 0, we know that the guid sum will
+ * mismatch and we won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/*
+ * Open hook for the 'missing' vdev: always reports success, with the
+ * minimum device size and the minimum block shift.
+ */
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	/*
+	 * Failing here would be the honest answer, but it would leave the
+	 * root vdev faulted with VDEV_AUX_NO_REPLICAS when what we really
+	 * want is VDEV_AUX_BAD_GUID_SUM.  So we pretend to succeed, knowing
+	 * that the GUID sum check will fail before anyone tries to open
+	 * the pool.
+	 */
+	*ashift = SPA_MINBLOCKSHIFT;
+	*psize = SPA_MINDEVSIZE;
+
+	return (0);
+}
+
+/* ARGSUSED */ /* Close hook for the 'missing' vdev; the body is intentionally empty. */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */ /* I/O start hook for the 'missing' vdev: fails every I/O. */
+static int
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = ENOTSUP; /* no I/O is issued; the zio is just marked failed */
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/* ARGSUSED */ /* I/O done hook for the 'missing' vdev: does nothing to the zio. */
+static int
+vdev_missing_io_done(zio_t *zio)
+{
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/* ARGSUSED */ /* Probe hook for the 'missing' vdev: unconditionally returns 0. */
+static int
+vdev_missing_probe(vdev_t *vd)
+{
+ return (0);
+}
+
+vdev_ops_t vdev_missing_ops = {
+ vdev_missing_open, /* always succeeds with minimum size and shift */
+ vdev_missing_close, /* empty */
+ vdev_missing_probe, /* always returns 0 */
+ vdev_default_asize,
+ vdev_missing_io_start, /* sets io_error to ENOTSUP */
+ vdev_missing_io_done, /* continues the pipeline */
+ NULL, /* slot left empty; the mirror ops tables put a state-change routine here */
+ VDEV_TYPE_MISSING, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/zfs/lib/libzpool/vdev_queue.c b/zfs/lib/libzpool/vdev_queue.c
new file mode 100644
index 000000000..db79c8dd2
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_queue.c
@@ -0,0 +1,320 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_queue.c 1.6 07/11/27 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * zfs_vdev_max_pending is the maximum number of i/os concurrently
+ * pending to each device. zfs_vdev_min_pending is the initial number
+ * of i/os pending to each device (before it starts ramping up to
+ * max_pending).
+ *
+ * vdev_queue_io() passes min_pending as the pending limit when a new
+ * i/o is queued; vdev_queue_io_done() passes max_pending when a pending
+ * i/o completes.
+ */
+int zfs_vdev_max_pending = 35;
+int zfs_vdev_min_pending = 4;
+
+/* deadline = pri + (lbolt >> time_shift) */
+int zfs_vdev_time_shift = 6; /* vdev_queue_io() applies this shift to io_timestamp */
+
+/* exponential I/O issue ramp-up rate */
+int zfs_vdev_ramp_rate = 2; /* max i/os issued per completion in vdev_queue_io_done() */
+
+/*
+ * i/os will be aggregated into a single large i/o up to
+ * zfs_vdev_aggregation_limit bytes long.
+ */
+int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+/*
+ * AVL comparator for the deadline tree: order by io_deadline, then by
+ * io_offset, and finally by the address of the zio so that two distinct
+ * zios never compare equal.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+	const zio_t *a = x1;
+	const zio_t *b = x2;
+
+	if (a->io_deadline != b->io_deadline)
+		return (a->io_deadline < b->io_deadline ? -1 : 1);
+
+	if (a->io_offset != b->io_offset)
+		return (a->io_offset < b->io_offset ? -1 : 1);
+
+	if (a != b)
+		return (a < b ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * AVL comparator for the read, write and pending trees: order by
+ * io_offset, then by the address of the zio so that two distinct zios
+ * never compare equal.
+ */
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+	const zio_t *a = x1;
+	const zio_t *b = x2;
+
+	if (a->io_offset != b->io_offset)
+		return (a->io_offset < b->io_offset ? -1 : 1);
+
+	if (a != b)
+		return (a < b ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * Initialize the I/O queue embedded in 'vd': its lock, the deadline tree,
+ * and the three offset-ordered trees (read, write, pending).  The offset
+ * trees all link zios through io_offset_node.
+ */
+void
+vdev_queue_init(vdev_t *vd)
+{
+	vdev_queue_t *queue = &vd->vdev_queue;
+	size_t deadline_link = offsetof(struct zio, io_deadline_node);
+	size_t offset_link = offsetof(struct zio, io_offset_node);
+
+	mutex_init(&queue->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&queue->vq_deadline_tree, vdev_queue_deadline_compare,
+	    sizeof (zio_t), deadline_link);
+
+	avl_create(&queue->vq_read_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offset_link);
+
+	avl_create(&queue->vq_write_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offset_link);
+
+	avl_create(&queue->vq_pending_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offset_link);
+}
+
+/*
+ * Tear down the I/O queue embedded in 'vd': destroy the four trees
+ * created by vdev_queue_init(), then the lock.
+ */
+void
+vdev_queue_fini(vdev_t *vd)
+{
+	vdev_queue_t *queue = &vd->vdev_queue;
+
+	avl_destroy(&queue->vq_deadline_tree);
+	avl_destroy(&queue->vq_read_tree);
+	avl_destroy(&queue->vq_write_tree);
+	avl_destroy(&queue->vq_pending_tree);
+
+	mutex_destroy(&queue->vq_lock);
+}
+
+/*
+ * Queue 'zio': insert it into the deadline tree and into the offset tree
+ * it was assigned (zio->io_vdev_tree).
+ */
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+	avl_tree_t *offset_tree = zio->io_vdev_tree;
+
+	avl_add(&vq->vq_deadline_tree, zio);
+	avl_add(offset_tree, zio);
+}
+
+/*
+ * Dequeue 'zio': remove it from the deadline tree and from the offset
+ * tree it was assigned (zio->io_vdev_tree).
+ */
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+	avl_tree_t *offset_tree = zio->io_vdev_tree;
+
+	avl_remove(&vq->vq_deadline_tree, zio);
+	avl_remove(offset_tree, zio);
+}
+
+/*
+ * Completion callback for an aggregate i/o built by
+ * vdev_queue_io_to_issue().  Walks the delegate list in order: for reads,
+ * each delegate receives its slice of the aggregate buffer; every
+ * delegate inherits the aggregate's error and is then executed.  The
+ * aggregate buffer is freed at the end.
+ */
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+	uint64_t consumed = 0;
+	zio_t *dio;
+
+	for (dio = aio->io_delegate_list; dio != NULL;
+	    dio = aio->io_delegate_list) {
+		if (aio->io_type == ZIO_TYPE_READ) {
+			bcopy((char *)aio->io_data + consumed, dio->io_data,
+			    dio->io_size);
+		}
+		consumed += dio->io_size;
+
+		/* Unlink the delegate before handing it back. */
+		aio->io_delegate_list = dio->io_delegate_next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = aio->io_error;
+		zio_execute(dio);
+	}
+	ASSERT3U(consumed, ==, aio->io_size);
+
+	zio_buf_free(aio->io_data, aio->io_size);
+}
+
+/*
+ * True when 'nio' starts exactly where 'io' ends.
+ */
+#define	IS_ADJACENT(io, nio) \
+	((io)->io_offset + (io)->io_size == (nio)->io_offset)
+
+/*
+ * Choose the next i/o to issue from 'vq'.  The caller must hold vq_lock.
+ *
+ * Returns NULL when the pending tree already holds 'pending_limit' or
+ * more i/os, or when nothing is queued.  Otherwise the first zio in the
+ * deadline tree is taken and extended, within its own offset tree, with
+ * adjacent predecessors and successors for as long as the combined size
+ * stays within zfs_vdev_aggregation_limit.
+ *
+ * If more than one zio was collected, a new aggregate child i/o covering
+ * all of them is created (with its own buffer, pre-filled for writes),
+ * the collected zios are dequeued and bypassed, and the aggregate is
+ * returned.  Otherwise the single zio is dequeued and returned.  In both
+ * cases the returned zio has been added to the pending tree.
+ */
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
+{
+	zio_t *fio, *lio, *aio, *dio;
+	avl_tree_t *tree;
+	uint64_t size;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+	    avl_numnodes(&vq->vq_deadline_tree) == 0)
+		return (NULL);
+
+	/* fio/lio bracket the run of adjacent i/os, first and last. */
+	fio = lio = avl_first(&vq->vq_deadline_tree);
+
+	tree = fio->io_vdev_tree;
+	size = fio->io_size;
+
+	/* Grow the run towards lower offsets... */
+	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
+		dio->io_delegate_next = fio;
+		fio = dio;
+		size += dio->io_size;
+	}
+
+	/* ...and towards higher offsets. */
+	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
+		lio->io_delegate_next = dio;
+		lio = dio;
+		size += dio->io_size;
+	}
+
+	if (fio != lio) {
+		char *buf = zio_buf_alloc(size);
+		uint64_t offset = 0;
+		int nagg = 0;
+
+		ASSERT(size <= zfs_vdev_aggregation_limit);
+
+		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+		    fio->io_offset, buf, size, fio->io_type,
+		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+		    ZIO_FLAG_NOBOOKMARK,
+		    vdev_queue_agg_io_done, NULL);
+
+		aio->io_delegate_list = fio;
+
+		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+			ASSERT(dio->io_type == aio->io_type);
+			ASSERT(dio->io_vdev_tree == tree);
+			if (dio->io_type == ZIO_TYPE_WRITE)
+				bcopy(dio->io_data, buf + offset, dio->io_size);
+			offset += dio->io_size;
+			vdev_queue_io_remove(vq, dio);
+			zio_vdev_io_bypass(dio);
+			nagg++;
+		}
+
+		ASSERT(offset == size);
+
+		/*
+		 * The 64-bit values are cast so that each argument matches
+		 * its %llu/%llx conversion on every data model.
+		 */
+		dprintf("%5s T=%llu off=%8llx agg=%3d "
+		    "old=%5llx new=%5llx\n",
+		    zio_type_name[fio->io_type],
+		    (unsigned long long)fio->io_deadline,
+		    (unsigned long long)fio->io_offset, nagg,
+		    (unsigned long long)fio->io_size,
+		    (unsigned long long)size);
+
+		avl_add(&vq->vq_pending_tree, aio);
+
+		return (aio);
+	}
+
+	ASSERT(fio->io_vdev_tree == tree);
+	vdev_queue_io_remove(vq, fio);
+
+	avl_add(&vq->vq_pending_tree, fio);
+
+	return (fio);
+}
+
+/*
+ * Queue 'zio' on its vdev and pick the next i/o to issue, using
+ * zfs_vdev_min_pending as the pending limit.
+ *
+ * A zio already flagged ZIO_FLAG_DONT_QUEUE is returned untouched.
+ * Otherwise the return value is the zio the caller should issue, or NULL
+ * when nothing is to be issued by the caller (either nothing was
+ * selected, or the selection was an aggregate, which is started here).
+ */
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *next;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+		return (zio);
+
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+	zio->io_vdev_tree = (zio->io_type == ZIO_TYPE_READ) ?
+	    &vq->vq_read_tree : &vq->vq_write_tree;
+
+	mutex_enter(&vq->vq_lock);
+
+	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
+	    zio->io_priority;
+	vdev_queue_io_add(vq, zio);
+	next = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
+
+	mutex_exit(&vq->vq_lock);
+
+	/* Aggregates are started here rather than handed to the caller. */
+	if (next != NULL && next->io_done == vdev_queue_agg_io_done) {
+		zio_nowait(next);
+		next = NULL;
+	}
+
+	return (next);
+}
+
+/*
+ * Called when a pending i/o completes: remove it from the pending tree
+ * and issue up to zfs_vdev_ramp_rate further i/os, using
+ * zfs_vdev_max_pending as the pending limit.  vq_lock is dropped while
+ * each selected i/o is started and re-taken afterwards.
+ */
+void
+vdev_queue_io_done(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *next;
+	int issued = 0;
+
+	mutex_enter(&vq->vq_lock);
+
+	avl_remove(&vq->vq_pending_tree, zio);
+
+	while (issued < zfs_vdev_ramp_rate) {
+		next = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
+		if (next == NULL)
+			break;
+
+		mutex_exit(&vq->vq_lock);
+		if (next->io_done != vdev_queue_agg_io_done) {
+			zio_vdev_io_reissue(next);
+			zio_execute(next);
+		} else {
+			zio_nowait(next);
+		}
+		mutex_enter(&vq->vq_lock);
+
+		issued++;
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
diff --git a/zfs/lib/libzpool/vdev_raidz.c b/zfs/lib/libzpool/vdev_raidz.c
new file mode 100644
index 000000000..cb399f046
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_raidz.c
@@ -0,0 +1,1239 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_raidz.c 1.10 07/11/27 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports both single and double parity. For single parity, we
+ * use a simple XOR of all the data columns. For double parity, we use both
+ * the simple XOR as well as a technique described in "The mathematics of
+ * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
+ * over the integers expressible in a single byte. Briefly, the operations on
+ * the field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ * (A * 2)_7 = A_6
+ * (A * 2)_6 = A_5
+ * (A * 2)_5 = A_4
+ * (A * 2)_4 = A_3 + A_7
+ * (A * 2)_3 = A_2 + A_7
+ * (A * 2)_2 = A_1 + A_7
+ * (A * 2)_1 = A_0
+ * (A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is A^254.
+ *
+ * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
+ * can be expressed by field operations:
+ *
+ * P = D_0 + D_1 + ... + D_n-2 + D_n-1
+ * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
+ * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *
+ * See the reconstruction code below for how P and Q can be used individually
+ * or in concert to recover missing data columns.
+ */
+
+/*
+ * One column of a RAID-Z I/O: the slice of the parent I/O that is sent to
+ * a single child device, plus the outcome of that child I/O.
+ */
+typedef struct raidz_col {
+ uint64_t rc_devidx; /* child device index for I/O */
+ uint64_t rc_offset; /* device offset */
+ uint64_t rc_size; /* I/O size */
+ void *rc_data; /* I/O data */
+ int rc_error; /* I/O error for this device */
+ uint8_t rc_tried; /* Did we attempt this I/O column? */
+ uint8_t rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+/*
+ * Per-I/O layout map.  Columns [0, rm_firstdatacol) are parity and the
+ * remaining columns are data.  rm_col is declared with one element but the
+ * map is allocated with room for rm_cols entries (see the offsetof() sizing
+ * in vdev_raidz_map_alloc() and vdev_raidz_map_free()).
+ */
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Column count */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+/* Indices of the P and Q parity columns within raidz_map_t.rm_col[]. */
+#define VDEV_RAIDZ_P 0
+#define VDEV_RAIDZ_Q 1
+
+/* Largest parity count accepted by vdev_raidz_open(). */
+#define VDEV_RAIDZ_MAXPARITY 2
+
+/*
+ * Multiply a single byte by 2 in the field: shift left and, when the high
+ * bit was set, XOR in 0x1d.  The argument is evaluated twice and the result
+ * is not truncated to 8 bits by the macro itself.
+ */
+#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+
+/*
+ * These two tables represent powers and logs of 2 in the Galois field defined
+ * above. These values were computed by repeatedly multiplying by 2 as above.
+ *
+ * vdev_raidz_pow2[i] is 2^i in the field.  Entries 0 and 255 are both 0x01,
+ * which is what lets callers use an exponent of (255 - k) to divide by 2^k.
+ */
+static const uint8_t vdev_raidz_pow2[256] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+/*
+ * vdev_raidz_log2[a] is the exponent i such that 2^i == a in the field.
+ * Entries 0 and 1 both hold 0x00; zero has no logarithm, and
+ * vdev_raidz_exp2() returns early for a == 0 rather than consulting the
+ * table.
+ */
+static const uint8_t vdev_raidz_log2[256] = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
+
+/*
+ * Return a * 2^exp in the field, computed through the log/power tables.
+ * Zero times anything is zero; otherwise the logarithm of a is added to exp
+ * and the sum is folded back into table range by subtracting 255 once.
+ */
+static uint8_t
+vdev_raidz_exp2(uint_t a, int exp)
+{
+	int sum;
+
+	if (a == 0)
+		return (0);
+
+	ASSERT(exp >= 0);
+	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
+
+	sum = exp + vdev_raidz_log2[a];
+
+	return (vdev_raidz_pow2[sum > 255 ? sum - 255 : sum]);
+}
+
+/*
+ * Build the column map for a RAID-Z I/O and attach it to zio->io_vsd.
+ *
+ * zio - the I/O being mapped; io_offset, io_size and io_data are read
+ * unit_shift - shift used to convert byte offsets/sizes to allocation units
+ * dcols - total number of columns in a full stripe (parity included)
+ * nparity - number of parity columns
+ *
+ * Parity columns (indices below nparity) get freshly allocated buffers from
+ * zio_buf_alloc(); the data columns are consecutive slices of zio->io_data.
+ * Returns the newly allocated map.
+ */
+static raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+ uint64_t nparity)
+{
+ raidz_map_t *rm;
+ /* b: starting unit, s: size in units, f: first column, o: child offset */
+ uint64_t b = zio->io_offset >> unit_shift;
+ uint64_t s = zio->io_size >> unit_shift;
+ uint64_t f = b % dcols;
+ uint64_t o = (b / dcols) << unit_shift;
+ uint64_t q, r, c, bc, col, acols, coff, devidx;
+
+ /*
+ * q: units every data column receives; r: leftover units;
+ * bc: number of columns (parity included) that carry one extra unit.
+ */
+ q = s / (dcols - nparity);
+ r = s - q * (dcols - nparity);
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /* An I/O smaller than one full row only touches the "big" columns. */
+ acols = (q == 0 ? bc : dcols);
+
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+
+ rm->rm_cols = acols;
+ rm->rm_bigcols = bc;
+ rm->rm_asize = 0;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+
+ for (c = 0; c < acols; c++) {
+ col = f + c;
+ coff = o;
+ /* Wrapped past the last child: continue on the next row. */
+ if (col >= dcols) {
+ col -= dcols;
+ coff += 1ULL << unit_shift;
+ }
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
+ rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+ rm->rm_asize += rm->rm_col[c].rc_size;
+ }
+
+ rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+
+ /* Parity columns own their buffers (freed in vdev_raidz_map_free()). */
+ for (c = 0; c < rm->rm_firstdatacol; c++)
+ rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+
+ /* c now indexes the first data column, which starts at io_data. */
+ rm->rm_col[c].rc_data = zio->io_data;
+
+ /* Each later data column begins where the previous one ends. */
+ for (c = c + 1; c < acols; c++)
+ rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+ rm->rm_col[c - 1].rc_size;
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ */
+ ASSERT(rm->rm_cols >= 2);
+ ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+ /* Swap device/offset of columns 0 and 1; o is reused as scratch. */
+ if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+ }
+
+ zio->io_vsd = rm;
+ return (rm);
+}
+
+/*
+ * Release the column map hanging off zio->io_vsd: free each parity
+ * column's buffer, then the map itself, and clear the pointer.  Data
+ * column buffers are not freed here.
+ */
+static void
+vdev_raidz_map_free(zio_t *zio)
+{
+	raidz_map_t *map = zio->io_vsd;
+	raidz_col_t *parity;
+	int idx;
+
+	for (idx = 0; idx < map->rm_firstdatacol; idx++) {
+		parity = &map->rm_col[idx];
+		zio_buf_free(parity->rc_data, parity->rc_size);
+	}
+
+	kmem_free(map, offsetof(raidz_map_t, rm_col[map->rm_cols]));
+	zio->io_vsd = NULL;
+}
+
+/*
+ * Fill the P parity column with the XOR of all data columns, working on
+ * 64-bit words.  The first data column seeds P by plain copy; every later
+ * column is XORed over the leading words it covers.
+ */
+static void
+vdev_raidz_generate_parity_p(raidz_map_t *rm)
+{
+	uint64_t *parity, *data, pwords, nwords, w;
+	int c;
+
+	pwords = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (data[0]);
+
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		data = rm->rm_col[c].rc_data;
+		parity = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+		nwords = rm->rm_col[c].rc_size / sizeof (data[0]);
+
+		if (c != rm->rm_firstdatacol) {
+			ASSERT(nwords <= pwords);
+			for (w = 0; w < nwords; w++)
+				parity[w] ^= data[w];
+			continue;
+		}
+
+		/* First data column: initialize P. */
+		ASSERT(nwords == pwords);
+		for (w = 0; w < nwords; w++)
+			parity[w] = data[w];
+	}
+}
+
+/*
+ * Fill both parity columns from the data columns, 64 bits at a time.
+ * P is the XOR of the data columns.  Q is accumulated column by column as
+ * Q = (Q * 2) + D_c, with the field multiply-by-2 applied to all eight
+ * bytes of a word at once.  A first data column of size zero is treated as
+ * all zeros (vdev_raidz_reconstruct_pq() sets a column's rc_size to 0 to
+ * get exactly that).
+ */
+static void
+vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+{
+ uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ int c;
+
+ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccount == pcount || ccount == 0);
+ /* Seed P and Q with the first data column. */
+ for (i = 0; i < ccount; i++, p++, q++, src++) {
+ *q = *src;
+ *p = *src;
+ }
+ /* Only runs when ccount == 0; src is advanced but not read. */
+ for (; i < pcount; i++, p++, q++, src++) {
+ *q = 0;
+ *p = 0;
+ }
+ } else {
+ ASSERT(ccount <= pcount);
+
+ /*
+ * Rather than multiplying each byte individually (as
+ * described above), we are able to handle 8 at once
+ * by generating a mask based on the high bit in each
+ * byte and using that to conditionally XOR in 0x1d.
+ */
+ for (i = 0; i < ccount; i++, p++, q++, src++) {
+ mask = *q & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ *q ^= *src;
+ *p ^= *src;
+ }
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ */
+ for (; i < pcount; i++, q++) {
+ mask = *q & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ }
+ }
+ }
+}
+
+/*
+ * Rebuild data column x from P parity: start from a copy of P and XOR in
+ * every other data column, limited to the number of words column x holds.
+ */
+static void
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+{
+	uint64_t *target, *from, xwords, nwords, w;
+	int c;
+
+	xwords = rm->rm_col[x].rc_size / sizeof (from[0]);
+	ASSERT(xwords <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (from[0]));
+	ASSERT(xwords > 0);
+
+	target = rm->rm_col[x].rc_data;
+	from = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+	for (w = 0; w < xwords; w++)
+		target[w] = from[w];
+
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		if (c == x)
+			continue;
+
+		from = rm->rm_col[c].rc_data;
+		nwords = rm->rm_col[c].rc_size / sizeof (from[0]);
+		nwords = MIN(nwords, xwords);
+
+		for (w = 0; w < nwords; w++)
+			target[w] ^= from[w];
+	}
+}
+
+/*
+ * Rebuild data column x using only Q parity.
+ *
+ * The Q accumulation is first replayed into column x's buffer with column
+ * x itself treated as empty.  XORing that with the stored Q leaves column
+ * x's contribution, which is then multiplied byte by byte by
+ * 2^(255 - (rm_cols - 1 - x)) to undo the power of 2 it was scaled by.
+ */
+static void
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+{
+ uint64_t *dst, *src, xcount, ccount, count, mask, i;
+ uint8_t *b;
+ int c, j, exp;
+
+ xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
+ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ dst = rm->rm_col[x].rc_data;
+
+ /* Column x contributes nothing while we recompute. */
+ if (c == x)
+ ccount = 0;
+ else
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ count = MIN(ccount, xcount);
+
+ if (c == rm->rm_firstdatacol) {
+ for (i = 0; i < count; i++, dst++, src++) {
+ *dst = *src;
+ }
+ for (; i < xcount; i++, dst++) {
+ *dst = 0;
+ }
+
+ } else {
+ /*
+ * For an explanation of this, see the comment in
+ * vdev_raidz_generate_parity_pq() above.
+ */
+ for (i = 0; i < count; i++, dst++, src++) {
+ mask = *dst & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ *dst ^= *src;
+ }
+
+ /* Past the end of a short column: multiply by 2 only. */
+ for (; i < xcount; i++, dst++) {
+ mask = *dst & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ }
+ }
+ }
+
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ dst = rm->rm_col[x].rc_data;
+ exp = 255 - (rm->rm_cols - 1 - x);
+
+ /* XOR in the stored Q, then scale each of the 8 bytes individually. */
+ for (i = 0; i < xcount; i++, dst++, src++) {
+ *dst ^= *src;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, exp);
+ }
+ }
+}
+
+/*
+ * Rebuild two data columns, x and y (x < y), using both P and Q parity.
+ *
+ * The stored parity buffers are temporarily swapped for scratch buffers so
+ * that vdev_raidz_generate_parity_pq() can compute Pxy and Qxy -- parity
+ * with columns x and y treated as empty -- without overwriting the real
+ * parity.  The scratch buffers are freed and the original parity pointers
+ * restored before returning.
+ */
+static void
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+{
+ uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
+ void *pdata, *qdata;
+ uint64_t xsize, ysize, i;
+
+ ASSERT(x < y);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(y < rm->rm_cols);
+
+ ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+
+ /*
+ * Move the parity data aside -- we're going to compute parity as
+ * though columns x and y were full of zeros -- Pxy and Qxy. We want to
+ * reuse the parity generation mechanism without trashing the actual
+ * parity so we make those columns appear to be full of zeros by
+ * setting their lengths to zero.
+ */
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ xsize = rm->rm_col[x].rc_size;
+ ysize = rm->rm_col[y].rc_size;
+
+ rm->rm_col[VDEV_RAIDZ_P].rc_data =
+ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_data =
+ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ rm->rm_col[x].rc_size = 0;
+ rm->rm_col[y].rc_size = 0;
+
+ vdev_raidz_generate_parity_pq(rm);
+
+ /* Put the real column sizes back now that Pxy/Qxy are computed. */
+ rm->rm_col[x].rc_size = xsize;
+ rm->rm_col[y].rc_size = ysize;
+
+ p = pdata;
+ q = qdata;
+ pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ xd = rm->rm_col[x].rc_data;
+ yd = rm->rm_col[y].rc_data;
+
+ /*
+ * We now have:
+ * Pxy = P + D_x + D_y
+ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
+ *
+ * We can then solve for D_x:
+ * D_x = A * (P + Pxy) + B * (Q + Qxy)
+ * where
+ * A = 2^(x - y) * (2^(x - y) + 1)^-1
+ * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
+ *
+ * With D_x in hand, we can easily solve for D_y:
+ * D_y = P + Pxy + D_x
+ */
+
+ /* tmp is the exponent that multiplies by (a + 1)^-1. */
+ a = vdev_raidz_pow2[255 + x - y];
+ b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ tmp = 255 - vdev_raidz_log2[a ^ 1];
+
+ /* Keep the two coefficients as exponents for use with exp2(). */
+ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
+ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+
+ /* Byte-at-a-time solve; column y may be shorter than column x. */
+ for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
+ vdev_raidz_exp2(*q ^ *qxy, bexp);
+
+ if (i < ysize)
+ *yd = *p ^ *pxy ^ *xd;
+ }
+
+ zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
+ rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ /*
+ * Restore the saved parity data.
+ */
+ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+}
+
+
+/*
+ * Open every child of a RAID-Z vdev.
+ *
+ * Fails with EINVAL (aux state VDEV_AUX_BAD_LABEL) when the parity count
+ * exceeds VDEV_RAIDZ_MAXPARITY or there are fewer than nparity + 1
+ * children.  For each child that opens, *asize is lowered to the child's
+ * asize when that is smaller and *ashift raised to the child's ashift when
+ * that is larger; *asize is finally multiplied by the number of children.
+ * If more children failed to open than there are parity devices, the aux
+ * state is set to VDEV_AUX_NO_REPLICAS and the last child error is
+ * returned.  Returns 0 otherwise.
+ */
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	uint64_t nparity = vd->vdev_nparity;
+	vdev_t *child;
+	int idx, err;
+	int last_err = 0;
+	int failures = 0;
+
+	ASSERT(nparity > 0);
+
+	if (nparity > VDEV_RAIDZ_MAXPARITY ||
+	    vd->vdev_children < nparity + 1) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (idx = 0; idx < vd->vdev_children; idx++) {
+		child = vd->vdev_child[idx];
+
+		err = vdev_open(child);
+		if (err != 0) {
+			/* Remember the failure and keep opening the rest. */
+			last_err = err;
+			failures++;
+		} else {
+			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
+			*ashift = MAX(*ashift, child->vdev_ashift);
+		}
+	}
+
+	*asize *= vd->vdev_children;
+
+	if (failures > nparity) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (last_err);
+	}
+
+	return (0);
+}
+
+/*
+ * Close every child of the RAID-Z vdev, in index order.
+ */
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+	int idx = 0;
+
+	while (idx < vd->vdev_children) {
+		vdev_close(vd->vdev_child[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Convert a physical size into the space it occupies on a RAID-Z vdev:
+ * the data sectors, plus nparity sectors for every (possibly partial) row
+ * of data, rounded up to a multiple of nparity + 1 sectors.  Sector size is
+ * taken from the top-level vdev's ashift.
+ */
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t cols = vd->vdev_children;
+	uint64_t nparity = vd->vdev_nparity;
+	uint64_t sectors, rows;
+
+	sectors = ((psize - 1) >> ashift) + 1;
+	rows = (sectors + cols - nparity - 1) / (cols - nparity);
+	sectors += nparity * rows;
+
+	return (roundup(sectors, nparity + 1) << ashift);
+}
+
+/*
+ * Completion callback for a per-column child I/O: record the child's
+ * error in its raidz_col_t and mark the column as tried and not skipped.
+ */
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+	raidz_col_t *col = zio->io_private;
+
+	col->rc_tried = 1;
+	col->rc_skipped = 0;
+	col->rc_error = zio->io_error;
+}
+
+/*
+ * Completion callback for the repair zio created in vdev_raidz_io_done():
+ * its private pointer is the parent zio, whose column map is freed here.
+ */
+static void
+vdev_raidz_repair_done(zio_t *zio)
+{
+	zio_t *parent = zio->io_private;
+
+	ASSERT(parent == zio->io_parent);
+	vdev_raidz_map_free(parent);
+}
+
+/*
+ * Start a RAID-Z I/O: build the column map and issue one child I/O per
+ * column.
+ *
+ * Writes generate parity first and then write every column.  Reads always
+ * fetch the data columns; parity columns are fetched only when a data
+ * column has already been counted as missing or the zio is a scrub.
+ * Columns whose child is unreadable, or whose DTL contains the block's
+ * birth txg, are skipped and pre-marked with ENXIO or ESTALE.
+ *
+ * Returns the result of zio_wait_for_children_done().
+ */
+static int
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ blkptr_t *bp = zio->io_bp;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c;
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * Generate RAID parity in the first virtual columns.
+ */
+ if (rm->rm_firstdatacol == 1)
+ vdev_raidz_generate_parity_p(rm);
+ else
+ vdev_raidz_generate_parity_pq(rm);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ }
+
+ return (zio_wait_for_children_done(zio));
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity
+ * data.
+ */
+ for (c = rm->rm_cols - 1; c >= 0; c--) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ if (!vdev_readable(cvd)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = ENXIO;
+ rc->rc_tried = 1; /* don't even try */
+ rc->rc_skipped = 1;
+ continue;
+ }
+ /*
+ * Unlike the unreadable case, rc_tried stays clear here, so
+ * the re-read pass in vdev_raidz_io_done() may still try it.
+ */
+ if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = ESTALE;
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ (zio->io_flags & ZIO_FLAG_SCRUB)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ }
+ }
+
+ return (zio_wait_for_children_done(zio));
+}
+
+/*
+ * Charge a checksum error to the child device behind column rc.  A debug
+ * message is always emitted; for non-speculative I/O the child's checksum
+ * error counter is bumped under its stat lock and an ereport is posted.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+{
+	vdev_t *child = zio->io_vd->vdev_child[rc->rc_devidx];
+
+	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+	    vdev_description(child));
+
+	/* Speculative I/O is neither counted nor reported. */
+	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+		return;
+
+	mutex_enter(&child->vdev_stat_lock);
+	child->vdev_stat.vs_checksum_errors++;
+	mutex_exit(&child->vdev_stat_lock);
+
+	zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+	    zio->io_spa, child, zio, rc->rc_offset, rc->rc_size);
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number of such failures.
+ *
+ * Note that the parity buffers in rm are overwritten with the regenerated
+ * parity; a mismatching column is also marked with rc_error = ECKSUM.
+ */
+static int
+raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+{
+ /* orig[c] is only set (and only read) for parity read without error. */
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_col_t *rc;
+
+ /* Save a copy of each parity column that was read successfully. */
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ orig[c] = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig[c], rc->rc_size);
+ }
+
+ if (rm->rm_firstdatacol == 1)
+ vdev_raidz_generate_parity_p(rm);
+ else
+ vdev_raidz_generate_parity_pq(rm);
+
+ /* Compare what was read against what was just regenerated. */
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ ret++;
+ }
+ zio_buf_free(orig[c], rc->rc_size);
+ }
+
+ return (ret);
+}
+
+/*
+ * Counts of reads that vdev_raidz_io_done() repaired using P alone, Q alone,
+ * or P and Q together.  Incremented with atomic_inc_64().
+ */
+static uint64_t raidz_corrected_p;
+static uint64_t raidz_corrected_q;
+static uint64_t raidz_corrected_pq;
+
+/*
+ * Completion handler for a RAID-Z parent I/O.
+ *
+ * Tallies the per-column errors recorded in the column map (zio->io_vsd).
+ *
+ * Writes: the error is cleared when no more than rm_firstdatacol columns
+ * failed and the zio is not failfast; the map is freed.
+ *
+ * Reads: works through the phases described in the body -- reconstruct from
+ * what was read, re-read every untried column, then try each combination of
+ * possibly-bad columns.  When good data was produced and the pool is
+ * writable, columns with rc_error set are rewritten if there were
+ * unexpected errors or the zio is a resilver.
+ *
+ * Returns ZIO_PIPELINE_CONTINUE, or the result of
+ * zio_wait_for_children_done() when child I/Os (re-reads or repairs) were
+ * issued.
+ */
+static int
+vdev_raidz_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc, *rc1;
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+ int n, c, c1;
+
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
+ ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+
+ /*
+ * We preserve any EIOs because those may be worth retrying;
+ * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (rc->rc_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = rc->rc_error;
+
+ if (c < rm->rm_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ /* Errors on columns we chose to skip were anticipated. */
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+
+ zio->io_numerrors++;
+ } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * If this is not a failfast write, and we were able to
+ * write enough columns to reconstruct the data, good enough.
+ */
+ /* XXPOLICY */
+ if (zio->io_numerrors <= rm->rm_firstdatacol &&
+ !(zio->io_flags & ZIO_FLAG_FAILFAST))
+ zio->io_error = 0;
+
+ vdev_raidz_map_free(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ /*
+ * There are three potential phases for a read:
+ * 1. produce valid data from the columns read
+ * 2. read all disks and try again
+ * 3. perform combinatorial reconstruction
+ *
+ * Each phase is progressively both more expensive and less likely to
+ * occur. If we encounter more errors than we can repair or all phases
+ * fail, we have no choice but to return an error.
+ */
+
+ /*
+ * If the number of errors we saw was correctable -- less than or equal
+ * to the number of parity disks read -- attempt to produce data that
+ * has a valid checksum. Naturally, this case applies in the absence of
+ * any errors.
+ */
+ if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
+ switch (data_errors) {
+ case 0:
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+
+ /*
+ * If we read parity information (unnecessarily
+ * as it happens since no reconstruction was
+ * needed) regenerate and verify the parity.
+ * We also regenerate parity when resilvering
+ * so we can write it out to the failed device
+ * later.
+ */
+ if (parity_errors + parity_untried <
+ rm->rm_firstdatacol ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+ goto done;
+ }
+ break;
+
+ case 1:
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rm->rm_firstdatacol);
+
+ /*
+ * Find the column that reported the error.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ /* Prefer P; fall back to Q only when P itself had an error. */
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ vdev_raidz_reconstruct_p(rm, c);
+ } else {
+ ASSERT(rm->rm_firstdatacol > 1);
+ vdev_raidz_reconstruct_q(rm, c);
+ }
+
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
+ atomic_inc_64(&raidz_corrected_p);
+ else
+ atomic_inc_64(&raidz_corrected_q);
+
+ /*
+ * If there's more than one parity disk that
+ * was successfully read, confirm that the
+ * other parity disk produced the correct data.
+ * This routine is suboptimal in that it
+ * regenerates both the parity we wish to test
+ * as well as the parity we just used to
+ * perform the reconstruction, but this should
+ * be a relatively uncommon case, and can be
+ * optimized if it becomes a problem.
+ * We also regenerate parity when resilvering
+ * so we can write it out to the failed device
+ * later.
+ */
+ if (parity_errors < rm->rm_firstdatacol - 1 ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+
+ goto done;
+ }
+ break;
+
+ case 2:
+ /*
+ * Two data column errors require double parity.
+ */
+ ASSERT(rm->rm_firstdatacol == 2);
+
+ /*
+ * Find the two columns that reported errors.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ /* c1 keeps the first bad column; c resumes just past it. */
+ for (c1 = c++; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ vdev_raidz_reconstruct_pq(rm, c1, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_pq);
+
+ goto done;
+ }
+ break;
+
+ default:
+ ASSERT(rm->rm_firstdatacol <= 2);
+ ASSERT(0);
+ }
+ }
+
+ /*
+ * This isn't a typical situation -- either we got a read error or
+ * a child silently returned bad data. Read every block so we can
+ * try again with as much data and parity as we can track down. If
+ * we've already been through once before, all children will be marked
+ * as tried so we'll proceed to combinatorial reconstruction.
+ */
+ unexpected_errors = 1;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ if (rm->rm_col[c].rc_tried)
+ continue;
+
+ /*
+ * Found the first untried column: issue reads for it and for
+ * every later untried column, then return to wait for them.
+ */
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ do {
+ rc = &rm->rm_col[c];
+ if (rc->rc_tried)
+ continue;
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ } while (++c < rm->rm_cols);
+ dprintf("rereading\n");
+
+ return (zio_wait_for_children_done(zio));
+ }
+
+ /*
+ * At this point we've attempted to reconstruct the data given the
+ * errors we detected, and we've attempted to read all columns. There
+ * must, therefore, be one or more additional problems -- silent errors
+ * resulting in invalid data rather than explicit I/O errors resulting
+ * in absent data. Before we attempt combinatorial reconstruction make
+ * sure we have a chance of coming up with the right answer.
+ */
+ if (zio->io_numerrors >= rm->rm_firstdatacol) {
+ ASSERT(zio->io_error != 0);
+ goto done;
+ }
+
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from parity P.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ void *orig;
+ rc = &rm->rm_col[c];
+
+ /* Save the column so a failed guess can be undone. */
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+ vdev_raidz_reconstruct_p(rm, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_p);
+
+ /*
+ * If this child didn't know that it returned
+ * bad data, inform it.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ goto done;
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from parity Q.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ void *orig;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+ vdev_raidz_reconstruct_q(rm, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_q);
+
+ /*
+ * If this child didn't know that it returned
+ * bad data, inform it.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ goto done;
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ if (rm->rm_firstdatacol > 1 &&
+ rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
+ rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from both P and Q.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
+ void *orig, *orig1;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+
+ /* Pair column c with every later data column c1. */
+ for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
+ rc1 = &rm->rm_col[c1];
+
+ orig1 = zio_buf_alloc(rc1->rc_size);
+ bcopy(rc1->rc_data, orig1, rc1->rc_size);
+
+ vdev_raidz_reconstruct_pq(rm, c, c1);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio_buf_free(orig1, rc1->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_pq);
+
+ /*
+ * If these children didn't know they
+ * returned bad data, inform them.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ if (rc1->rc_tried && rc1->rc_error == 0)
+ raidz_checksum_error(zio, rc1);
+
+ rc->rc_error = ECKSUM;
+ rc1->rc_error = ECKSUM;
+
+ goto done;
+ }
+
+ bcopy(orig1, rc1->rc_data, rc1->rc_size);
+ zio_buf_free(orig1, rc1->rc_size);
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ /*
+ * All combinations failed to checksum. Generate checksum ereports for
+ * all children.
+ */
+ zio->io_error = ECKSUM;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
+ rc->rc_offset, rc->rc_size);
+ }
+ }
+
+done:
+ zio_checksum_verified(zio);
+
+ if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ zio_t *rio;
+
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ *
+ * We issue all repair I/Os as children of 'rio' to arrange
+ * that vdev_raidz_map_free(zio) will be invoked after all
+ * repairs complete, but before we advance to the next stage.
+ */
+ rio = zio_null(zio, zio->io_spa,
+ vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /* Only columns flagged with an error are rewritten. */
+ if (rc->rc_error == 0)
+ continue;
+
+ dprintf("%s resilvered %s @ 0x%llx error %d\n",
+ vdev_description(vd),
+ vdev_description(cvd),
+ zio->io_offset, rc->rc_error);
+
+ zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_CANFAIL, NULL, NULL));
+ }
+
+ zio_nowait(rio);
+
+ return (zio_wait_for_children_done(zio));
+ }
+
+ vdev_raidz_map_free(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Derive the RAID-Z vdev's state from its children: more faulted children
+ * than parity devices means it cannot be opened; any faulted or degraded
+ * child otherwise makes it degraded; with neither it is healthy.
+ */
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > vd->vdev_nparity) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_NO_REPLICAS);
+		return;
+	}
+
+	if (degraded + faulted != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+		return;
+	}
+
+	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/*
+ * Operations vector exported for RAID-Z vdevs.  The initializer is
+ * positional, so the meaning of each slot comes from the vdev_ops_t
+ * declaration (not shown in this file); the third slot is left NULL.
+ */
+vdev_ops_t vdev_raidz_ops = {
+ vdev_raidz_open,
+ vdev_raidz_close,
+ NULL,
+ vdev_raidz_asize,
+ vdev_raidz_io_start,
+ vdev_raidz_io_done,
+ vdev_raidz_state_change,
+ VDEV_TYPE_RAIDZ, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/zfs/lib/libzpool/vdev_root.c b/zfs/lib/libzpool/vdev_root.c
new file mode 100644
index 000000000..3bb1fd209
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_root.c
@@ -0,0 +1,130 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)vdev_root.c 1.5 07/10/24 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+/*
+ * Failure policy for the root vdev.
+ *
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata.  Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc.  Probably not a happy
+ * place to live.  When we get smarter, we can liberalize this policy,
+ * e.g. if we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine.  Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ *
+ * As implemented, the count is "too many" only when it equals the number
+ * of children, i.e. every child has failed.  Returns 1 in that case and
+ * 0 otherwise.
+ */
+static int
+too_many_errors(vdev_t *vd, int numerrors)
+{
+	ASSERT3U(numerrors, <=, vd->vdev_children);
+
+	if (numerrors == vd->vdev_children)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Open the root vdev by opening each of its children.
+ *
+ * Returns EINVAL (with vs_aux set to VDEV_AUX_BAD_LABEL) when there are
+ * no children.  If too_many_errors() says too many children failed to
+ * open, vs_aux is set to VDEV_AUX_NO_REPLICAS and the error of the last
+ * child that failed is returned.  If only some children failed, the vdev
+ * is marked faulted but the open still succeeds.  On success *asize and
+ * *ashift are both set to 0.
+ */
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	int i;
+	int failed = 0;
+	int last_err = 0;
+
+	if (vd->vdev_children == 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	/* Try every child, remembering how many failed and the last error. */
+	for (i = 0; i < vd->vdev_children; i++) {
+		int rc = vdev_open(vd->vdev_child[i]);
+
+		if (rc != 0) {
+			last_err = rc;
+			failed++;
+		}
+	}
+
+	if (failed > 0) {
+		if (too_many_errors(vd, failed)) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+			return (last_err);
+		}
+
+		/* XXX - should not be explicitly setting this state */
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+		    VDEV_AUX_NO_REPLICAS);
+	}
+
+	*asize = 0;
+	*ashift = 0;
+
+	return (0);
+}
+
+/*
+ * Close the root vdev by closing each of its children in index order.
+ */
+static void
+vdev_root_close(vdev_t *vd)
+{
+	int i;
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *child = vd->vdev_child[i];
+
+		vdev_close(child);
+	}
+}
+
+/*
+ * Set the root vdev's state from the counts of its faulted and degraded
+ * children.  With no faulted children the vdev is degraded if any child
+ * is degraded and healthy otherwise.  With faulted children it is either
+ * unopenable (too_many_errors()) or faulted.
+ */
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (!faulted && !degraded) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+		return;
+	}
+
+	if (!faulted) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+		return;
+	}
+
+	if (too_many_errors(vd, faulted)) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_NO_REPLICAS);
+	} else {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+		    VDEV_AUX_NO_REPLICAS);
+	}
+}
+
+/*
+ * Operations vector for the root vdev.  The initializer is positional, so
+ * the entries must stay in the member order of vdev_ops_t.  The root has
+ * no I/O entry points of its own, hence the NULL io_start/io_done slots.
+ */
+vdev_ops_t vdev_root_ops = {
+ vdev_root_open,
+ vdev_root_close,
+ NULL,
+ vdev_default_asize,
+ NULL, /* io_start - not applicable to the root */
+ NULL, /* io_done - not applicable to the root */
+ vdev_root_state_change,
+ VDEV_TYPE_ROOT, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/zfs/lib/libzpool/zap.c b/zfs/lib/libzpool/zap.c
new file mode 100644
index 000000000..f4f456ce8
--- /dev/null
+++ b/zfs/lib/libzpool/zap.c
@@ -0,0 +1,1085 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zap.c 1.13 07/11/19 SMI"
+
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power-of-2 number of pointers
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix equal to i.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+/* log2 of the default fat-zap block size (1 << 14 bytes). */
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+/* Forward declarations of helpers defined later in this file. */
+static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+
+/*
+ * Byteswap one fat-zap block of 'size' bytes in place.  The first 64-bit
+ * word is the block type: a leaf block (in either byte order) is handed to
+ * zap_leaf_byteswap(); anything else is swapped as a plain array of
+ * 64-bit words.
+ */
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+	uint64_t type = *(uint64_t *)vbuf;
+
+	if (type != ZBT_LEAF && type != BSWAP_64(ZBT_LEAF)) {
+		/* it's a ptrtbl block */
+		byteswap_uint64_array(vbuf, size);
+		return;
+	}
+
+	zap_leaf_byteswap(vbuf, size);
+}
+
+/*
+ * Give 'zap' a fresh fat-zap layout: the zap's own dbuf is zeroed and
+ * filled in as the header (with an embedded pointer table), and block 1
+ * is initialized as the single, empty leaf that every pointer-table entry
+ * refers to.
+ *
+ * The caller must hold zap_rwlock as writer (asserted).  The leaf block
+ * is dirtied in 'tx'.
+ */
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+ int i;
+ zap_phys_t *zp;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ zap->zap_ismicro = FALSE;
+
+ /*
+ * NOTE(review): presumably re-registers this zap as the dbuf's user
+ * with zap_f.zap_phys as the tracked data pointer -- confirm against
+ * dmu_buf_update_user().
+ */
+ (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
+ &zap->zap_f.zap_phys, zap_evict);
+
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+ zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
+
+ zp = zap->zap_f.zap_phys;
+ /*
+ * explicitly zero it since it might be coming from an
+ * initialized microzap
+ */
+ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ zp->zap_block_type = ZBT_HEADER;
+ zp->zap_magic = ZAP_MAGIC;
+
+ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+ zp->zap_freeblk = 2; /* block 1 will be the first leaf */
+ zp->zap_num_leafs = 1;
+ zp->zap_num_entries = 0;
+ /* salt and normalization flags are carried over from the in-core zap */
+ zp->zap_salt = zap->zap_salt;
+ zp->zap_normflags = zap->zap_normflags;
+
+ /* block 1 will be the first leaf */
+ for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+ /*
+ * set up block 1 - the first leaf
+ */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+
+ /*
+ * A throwaway zap_leaf_t, used only so zap_leaf_init() has a handle
+ * on the block; it is freed again right after.
+ */
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_dbuf = db;
+ l->l_phys = db->db_data;
+
+ zap_leaf_init(l, zp->zap_normflags != 0);
+
+ kmem_free(l, sizeof (zap_leaf_t));
+ dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * Make sure zap_rwlock is held as writer without dropping it.
+ *
+ * Returns 1 if the lock is (now) write-held: either it already was, or
+ * rw_tryupgrade() succeeded, in which case the zap's dbuf is also dirtied
+ * in 'tx'.  Returns 0 if the upgrade attempt failed.
+ */
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		return (1);
+
+	if (!rw_tryupgrade(&zap->zap_rwlock))
+		return (0);
+
+	dmu_buf_will_dirty(zap->zap_dbuf, tx);
+	return (1);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+/*
+ * Perform one step of doubling an on-disk table.
+ *
+ * On the first call of a grow, a region twice the current size is
+ * allocated and recorded in zt_nextblk.  Every call then copies one old
+ * block (number zt_blks_copied) into two new blocks using transfer_func.
+ * Once the last block has been copied, the old region is freed and the
+ * table header is switched over to the new region.
+ *
+ * The caller must hold zap_rwlock as writer (asserted).  Returns 0, or the
+ * error from holding the old block; in that case zt_nextblk stays set and
+ * a later call picks up where this one stopped.
+ */
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+    dmu_tx_t *tx)
+{
+	uint64_t b, newblk;
+	dmu_buf_t *db_old, *db_new;
+	int err;
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	int hepb = 1<<(bs-4);
+	/* hepb = half the number of entries in a block */
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+	ASSERT(tbl->zt_numblks > 0);
+
+	if (tbl->zt_nextblk != 0) {
+		/* a grow is already in progress; continue it */
+		newblk = tbl->zt_nextblk;
+	} else {
+		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+		tbl->zt_nextblk = newblk;
+		ASSERT3U(tbl->zt_blks_copied, ==, 0);
+		dmu_prefetch(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
+	}
+
+	/*
+	 * Copy the ptrtbl from the old to new location.
+	 */
+
+	b = tbl->zt_blks_copied;
+	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + b) << bs, FTAG, &db_old);
+	if (err)
+		return (err);
+
+	/* first half of entries in old[b] go to new[2*b+0] */
+	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+0) << bs, FTAG, &db_new));
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func(db_old->db_data, db_new->db_data, hepb);
+	dmu_buf_rele(db_new, FTAG);
+
+	/* second half of entries in old[b] go to new[2*b+1] */
+	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+1) << bs, FTAG, &db_new));
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func((uint64_t *)db_old->db_data + hepb,
+	    db_new->db_data, hepb);
+	dmu_buf_rele(db_new, FTAG);
+
+	dmu_buf_rele(db_old, FTAG);
+
+	tbl->zt_blks_copied++;
+
+	dprintf("copied block %llu of %llu\n",
+	    tbl->zt_blks_copied, tbl->zt_numblks);
+
+	if (tbl->zt_blks_copied == tbl->zt_numblks) {
+		/* everything has been copied; retire the old region */
+		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+		tbl->zt_blk = newblk;
+		tbl->zt_numblks *= 2;
+		tbl->zt_shift++;
+		tbl->zt_nextblk = 0;
+		tbl->zt_blks_copied = 0;
+
+		/*
+		 * The shift must be done in a 64-bit type: a plain "1 << n"
+		 * is an int and does not match the %llu conversion.
+		 */
+		dprintf("finished; numblocks now %llu (%lluk entries)\n",
+		    tbl->zt_numblks, 1ULL<<(tbl->zt_shift-10));
+	}
+
+	return (0);
+}
+
+/*
+ * Store 'val' at entry 'idx' of the table.
+ *
+ * If a grow is in progress (zt_nextblk != 0), 'val' is also written to
+ * entries 2*idx and 2*idx+1 of the new, doubled table so that both copies
+ * stay in step.  Both blocks are dirtied in 'tx'.
+ *
+ * Returns 0, or the error from dmu_buf_hold(); if the hold on the new
+ * table's block fails, nothing is stored in either table.
+ */
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+ dmu_tx_t *tx)
+{
+ int err;
+ uint64_t blk, off;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_buf_t *db;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+
+ dprintf("storing %llx at index %llx\n", val, idx);
+
+ /* entries are 8 bytes, so a block holds 1<<(bs-3) of them */
+ blk = idx >> (bs-3);
+ off = idx & ((1<<(bs-3))-1);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ dmu_buf_will_dirty(db, tx);
+
+ if (tbl->zt_nextblk != 0) {
+ /* mirror the store into the table that is being grown into */
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
+ }
+
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
+/*
+ * Read entry 'idx' of the table into *valp.
+ *
+ * While a grow is in progress (zt_nextblk != 0), the matching block of
+ * the new table is read as well, purely so that an I/O error there is
+ * reported here rather than by a later zap_table_store().
+ *
+ * Returns 0 on success or the error from dmu_buf_hold().  Note that *valp
+ * has already been filled in when only the second read fails.
+ */
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+	uint64_t blk, off;
+	int err;
+	dmu_buf_t *db;
+	int bs = FZAP_BLOCK_SHIFT(zap);
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	/* entries are 8 bytes, so a block holds 1<<(bs-3) of them */
+	blk = idx >> (bs-3);
+	off = idx & ((1<<(bs-3))-1);
+
+	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blk) << bs, FTAG, &db);
+	if (err)
+		return (err);
+	*valp = ((uint64_t *)db->db_data)[off];
+	dmu_buf_rele(db, FTAG);
+
+	if (tbl->zt_nextblk != 0) {
+		/*
+		 * read the nextblk for the sake of i/o error checking,
+		 * so that zap_table_load() will catch errors for
+		 * zap_table_store.
+		 */
+		blk = (idx*2) >> (bs-3);
+
+		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		    (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+		/*
+		 * Only release what was actually held: after a failed hold
+		 * 'db' does not refer to a buffer we own.
+		 */
+		if (err == 0)
+			dmu_buf_rele(db, FTAG);
+	}
+	return (err);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+/*
+ * Expand 'n' pointer-table entries from 'src' into 2*n entries at 'dst',
+ * writing each source entry twice in a row.  This is the transfer
+ * function used when the pointer table doubles in size.
+ */
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+	uint64_t *out = dst;
+	int k;
+
+	for (k = 0; k < n; k++) {
+		uint64_t ent = src[k];
+
+		*out++ = ent;
+		*out++ = ent;
+	}
+}
+
+/*
+ * Double the size of the pointer table.
+ *
+ * When the table is still the one embedded in the header block
+ * (zt_numblks == 0), it is copied in one go into a newly allocated block
+ * of its own.  Otherwise one incremental step of zap_table_grow() is
+ * performed.
+ *
+ * Returns 0, ENOSPC once zt_shift has reached ZAP_HASHBITS-2, or an error
+ * from dmu_buf_hold() / zap_table_grow().
+ */
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /* In case things go horribly wrong. */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+ return (ENOSPC);
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * We are outgrowing the "embedded" ptrtbl (the one
+ * stored in the header block). Give it its own entire
+ * block, which will double the size of the ptrtbl.
+ */
+ uint64_t newblk;
+ dmu_buf_t *db_new;
+ int err;
+
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+
+ newblk = zap_allocate_blocks(zap, 1);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ if (err)
+ return (err);
+ dmu_buf_will_dirty(db_new, tx);
+ /* every embedded entry becomes two adjacent entries in the block */
+ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ dmu_buf_rele(db_new, FTAG);
+
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+
+ /* the doubled table must fill its one block exactly */
+ ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ (FZAP_BLOCK_SHIFT(zap)-3));
+
+ return (0);
+ } else {
+ return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx));
+ }
+}
+
+/*
+ * Add 'delta' (which may be negative) to the zap's on-disk entry count.
+ * The header dbuf is dirtied in 'tx' first, and the update itself is made
+ * under zap_num_entries_mtx.
+ */
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+ /* a decrement must never take the count below zero */
+ ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
+ zap->zap_f.zap_phys->zap_num_entries += delta;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+/*
+ * Reserve 'nblocks' consecutive block numbers in the zap object by
+ * advancing zap_freeblk, and return the first of them.  The caller must
+ * hold zap_rwlock as writer (asserted).
+ */
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+	zap_phys_t *zp;
+	uint64_t first;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	zp = zap->zap_f.zap_phys;
+	first = zp->zap_freeblk;
+	zp->zap_freeblk += nblocks;
+
+	return (first);
+}
+
+/*
+ * Allocate a new block in the zap object and set it up as an empty leaf.
+ *
+ * The returned leaf has its l_rwlock held as writer and holds a reference
+ * on its dbuf, so the caller is expected to drop both (zap_put_leaf()
+ * does exactly that).  The leaf's dbuf is dirtied in 'tx' and
+ * zap_num_leafs is incremented.  The caller must hold zap_rwlock as
+ * writer (asserted).
+ */
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ void *winner;
+ zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1);
+ l->l_dbuf = NULL;
+ l->l_phys = NULL;
+
+ /* held with a NULL tag, matching the dmu_buf_rele() in zap_put_leaf() */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
+ /* the block is brand new, so nobody else can have attached a user */
+ winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+ ASSERT(winner == NULL);
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l, zap->zap_normflags != 0);
+
+ zap->zap_f.zap_phys->zap_num_leafs++;
+
+ return (l);
+}
+
+/*
+ * Store the number of entries in a fat zap into *count.  The zap must not
+ * be a microzap (asserted).  Always returns 0.
+ */
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap->zap_f.zap_phys->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+/*
+ * Release a leaf: drop its l_rwlock and then the hold on its dbuf (which
+ * was taken with a NULL tag).
+ */
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+/*
+ * Destroy an in-core zap_leaf_t: tear down its lock and free it.  This is
+ * the callback registered with dmu_buf_set_user() for leaf dbufs; it is
+ * also called directly by zap_open_leaf() with a NULL 'db'.  'db' is
+ * unused.
+ */
+_NOTE(ARGSUSED(0))
+static void
+zap_leaf_pageout(dmu_buf_t *db, void *vl)
+{
+ zap_leaf_t *l = vl;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+/*
+ * Create the in-core zap_leaf_t for leaf block 'blkid', whose dbuf 'db'
+ * the caller already holds, and attach it to the dbuf as its user.
+ *
+ * If another thread attached a leaf first, the one built here is thrown
+ * away and the winner is returned instead.  The returned leaf is not
+ * locked.
+ */
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ zap_leaf_t *l, *winner;
+
+ ASSERT(blkid != 0);
+
+ l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ /* held as writer only while the leaf is being attached to the dbuf */
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_bs = highbit(db->db_size)-1;
+ l->l_dbuf = db;
+ l->l_phys = NULL;
+
+ winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_pageout(NULL, l);
+ l = winner;
+ }
+
+ /*
+ * lh_pad1 was previously used for the next leaf in the leaf
+ * chain. There should be no chained leafs (as we have removed
+ * support for them).
+ */
+ ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
+
+ /*
+ * There should be more hash entries than there can be
+ * chunks to put in the hash table
+ */
+ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+ /* The chunks should begin at the end of the hash table */
+ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
+ &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+ /* The chunks should end at the end of the block */
+ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+ (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+
+ return (l);
+}
+
+/*
+ * Look up the leaf stored in block 'blkid' and return it in *lp with its
+ * l_rwlock held in mode 'lt' and its dbuf held (NULL tag).  The in-core
+ * zap_leaf_t is created on first use.  When 'lt' is RW_WRITER the dbuf is
+ * also dirtied in 'tx'.
+ *
+ * The caller must hold zap_rwlock.  Returns 0 or the error from
+ * dmu_buf_hold(), in which case *lp is left untouched.
+ */
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << bs, NULL, &db);
+ if (err)
+ return (err);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << bs);
+ ASSERT3U(db->db_size, ==, 1 << bs);
+ ASSERT(blkid != 0);
+
+ /* reuse the zap_leaf_t already attached to the dbuf, if any */
+ l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise l->l_phys could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
+ ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ *lp = l;
+ return (0);
+}
+
+/*
+ * Translate pointer-table index 'idx' into the leaf block number stored
+ * there, returned in *valp.  The embedded table in the header is read
+ * directly (zt_numblks == 0); otherwise the entry is fetched with
+ * zap_table_load(), whose error, if any, is returned.
+ */
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+	zap_table_phys_t *tbl = &zap->zap_f.zap_phys->zap_ptrtbl;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	if (tbl->zt_numblks != 0)
+		return (zap_table_load(zap, tbl, idx, valp));
+
+	ASSERT3U(idx, <, (1ULL << tbl->zt_shift));
+	*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+
+	return (0);
+}
+
+/*
+ * Point pointer-table index 'idx' at leaf block 'blk'.  The embedded
+ * table in the header is written directly (zt_blk == 0); otherwise the
+ * entry is stored with zap_table_store(), whose error, if any, is
+ * returned.  Requires a transaction and zap_rwlock held as writer (both
+ * asserted).
+ */
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+	zap_table_phys_t *tbl = &zap->zap_f.zap_phys->zap_ptrtbl;
+
+	ASSERT(tx != NULL);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	if (tbl->zt_blk != 0)
+		return (zap_table_store(zap, tbl, idx, blk, tx));
+
+	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+
+	return (0);
+}
+
+/*
+ * Find the leaf responsible for hash value 'h': take the top zt_shift
+ * bits of 'h' as the pointer-table index, read the block number stored
+ * there and fetch that leaf.  On success *lp is the leaf, locked in mode
+ * 'lt' (see zap_get_leaf_byblk()).  Returns 0 or an I/O error from the
+ * table or leaf lookup.
+ */
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+ uint64_t idx, blk;
+ int err;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
+ ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
+ idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+ /* the leaf we were pointed at must cover this hash's prefix */
+ ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
+ (*lp)->l_phys->l_hdr.lh_prefix);
+ return (err);
+}
+
+/*
+ * Split the full leaf 'l' (the one covering zn->zn_hash) in two, growing
+ * the pointer table first if the leaf already uses every bit of it.
+ *
+ * On success *lp is whichever of the two resulting leaves now covers the
+ * hash, still locked; the other one is released.  If the zap lock had to
+ * be dropped and re-taken and the leaf was found to have been split in the
+ * meantime, the freshly looked-up leaf is returned without splitting
+ * again.
+ *
+ * zn->zn_zap may be replaced by this function (it is re-locked through
+ * zap_lockdir()), so callers must reload it afterwards.
+ *
+ * NOTE(review): on the error returns inside the re-lock branch the
+ * original leaf has already been released by zap_put_leaf(l) and *lp has
+ * not been updated.  The callers in this file (fzap_add_cd(),
+ * fzap_update()) hand their leaf pointer to
+ * zap_put_leaf_maybe_grow_ptrtbl() whenever zn->zn_zap is non-NULL --
+ * confirm that this cannot release the same leaf twice.
+ */
+static int
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t hash = zn->zn_hash;
+ zap_leaf_t *nl;
+ int prefix_diff, i, err;
+ uint64_t sibling;
+ int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+
+ ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ l->l_phys->l_hdr.lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0 ||
+ old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ /* We failed to upgrade, or need to grow the pointer table */
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ /* drop everything and come back in holding the zap as writer */
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, object, tx, RW_WRITER,
+ FALSE, FALSE, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err)
+ return (err);
+ ASSERT(!zap->zap_ismicro);
+
+ /* grow until the table has a spare bit for the split */
+ while (old_prefix_len ==
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ err = zap_grow_ptrtbl(zap, tx);
+ if (err)
+ return (err);
+ }
+
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err)
+ return (err);
+
+ if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+ /* it split while our locks were down */
+ *lp = l;
+ return (0);
+ }
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ l->l_phys->l_hdr.lh_prefix);
+
+ /*
+ * The sibling takes over the pointer-table entries whose next prefix
+ * bit is 1: a run of 1<<prefix_diff entries starting at 'sibling'.
+ */
+ prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+ (old_prefix_len + 1);
+ sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling+i, &blk);
+ if (err)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ nl = zap_create_leaf(zap, tx);
+ zap_leaf_split(l, nl, zap->zap_normflags != 0);
+
+ /* set sibling pointers */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
+ }
+
+ /* the hash bit just below the new, longer prefix picks the leaf */
+ if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ *lp = nl;
+ } else {
+ zap_put_leaf(nl);
+ *lp = l;
+ }
+
+ return (0);
+}
+
+/*
+ * Release leaf 'l' and, as a best effort, take one step of growing the
+ * pointer table when that looks worthwhile: either a grow is already in
+ * progress (zt_nextblk != 0), or this leaf uses every bit of the table
+ * and has fewer than ZAP_LEAF_LOW_WATER free chunks.
+ *
+ * Errors are deliberately ignored.  zn->zn_zap may be replaced if the zap
+ * has to be re-locked as writer.
+ */
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ /* sampled now, before any lock can be dropped below */
+ int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
+ l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+ zap_put_leaf(l);
+
+ if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+ int err;
+
+ /*
+ * We are in the middle of growing the pointer table, or
+ * this leaf will soon make us grow it.
+ */
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, FALSE, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err)
+ return;
+ }
+
+ /* could have finished growing while our locks were down */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+ (void) zap_grow_ptrtbl(zap, tx);
+ }
+}
+
+
+/*
+ * Validate the name and value dimensions of an attribute.
+ *
+ * Returns E2BIG if 'name' (when given) is longer than ZAP_MAXNAMELEN or
+ * the value would exceed ZAP_MAXVALUELEN bytes, EINVAL if integer_size is
+ * not 1, 2, 4 or 8, and 0 otherwise.
+ */
+static int
+fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+{
+	if (name && strlen(name) > ZAP_MAXNAMELEN)
+		return (E2BIG);
+
+	/* Only integer sizes supported by C */
+	switch (integer_size) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	/*
+	 * Compare by division rather than multiplication, so that a huge
+	 * num_integers cannot wrap the 64-bit product and slip under the
+	 * limit.  integer_size is 1, 2, 4 or 8 here, so it is never zero.
+	 */
+	if (num_integers > ZAP_MAXVALUELEN / integer_size)
+		return (E2BIG);
+
+	return (0);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+/*
+ * Look up the attribute named by 'zn' in a fat zap.
+ *
+ * On success the value is read into 'buf' as num_integers integers of
+ * integer_size bytes, the stored name is read into 'realname' (rn_len
+ * bytes), and, if 'ncp' is not NULL, *ncp receives the result of
+ * zap_entry_normalization_conflict().
+ *
+ * Returns 0, an error from fzap_checksize(), an error from locating the
+ * leaf or the entry, or the error from zap_entry_read().  A failure to
+ * read the name is ignored.
+ */
+int
+fzap_lookup(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ char *realname, int rn_len, boolean_t *ncp)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ /* read-only lookup: no transaction, leaf locked as reader */
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+ (void) zap_entry_read_name(&zeh, rn_len, realname);
+ if (ncp) {
+ *ncp = zap_entry_normalization_conflict(&zeh,
+ zn, NULL, zn->zn_zap);
+ }
+ }
+
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Add a new attribute to a fat zap, passing 'cd' through to
+ * zap_entry_create().
+ *
+ * Returns EEXIST if an entry with this name is already present.  When
+ * the leaf has no room (EAGAIN from zap_entry_create()), the leaf is
+ * expanded and the insertion retried.  The caller is expected to have
+ * validated the sizes already (asserted via fzap_checksize()).
+ *
+ * On the way out the leaf is released through
+ * zap_put_leaf_maybe_grow_ptrtbl(), which may also grow the pointer
+ * table.
+ */
+int
+fzap_add_cd(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_checksize(zn->zn_name_orij,
+ integer_size, num_integers) == 0);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ /* the name must not exist yet: only ENOENT lets us continue */
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ if (err != ENOENT)
+ goto out;
+
+ err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0)
+ goto retry;
+ }
+
+out:
+ /* zap is NULL only if re-locking inside zap_expand_leaf() failed */
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+ return (err);
+}
+
+/*
+ * Add a new attribute to a fat zap.  The sizes are validated with
+ * fzap_checksize() first; the entry is then created through
+ * fzap_add_cd() with ZAP_MAXCD as the collision differentiator.
+ * Returns 0 or the error from either step.
+ */
+int
+fzap_add(zap_name_t *zn,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	int err;
+
+	err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+	if (err == 0) {
+		err = fzap_add_cd(zn, integer_size, num_integers,
+		    val, ZAP_MAXCD, tx);
+	}
+
+	return (err);
+}
+
+/*
+ * Set the value of an attribute in a fat zap, creating the entry if it
+ * does not exist yet (with ZAP_MAXCD as the collision differentiator).
+ *
+ * When the leaf has no room (EAGAIN), it is expanded and the operation
+ * retried.  Returns 0, an error from fzap_checksize(), or an error from
+ * locating, creating, updating or expanding.  On the way out the leaf is
+ * released through zap_put_leaf_maybe_grow_ptrtbl().
+ */
+int
+fzap_update(zap_name_t *zn,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err, create;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ /* a missing entry means create; any other outcome means update */
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ if (create) {
+ err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
+ ZAP_MAXCD, integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0)
+ goto retry;
+ }
+
+ /* zap is NULL only if re-locking inside zap_expand_leaf() failed */
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+ return (err);
+}
+
+/*
+ * Report the shape of an attribute's value: the size of one integer and
+ * the number of integers.  Either output pointer may be NULL if the
+ * caller is not interested.  Returns 0, or the error from locating the
+ * leaf or the entry (the outputs are then left untouched).
+ */
+int
+fzap_length(zap_name_t *zn,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_leaf_t *leaf;
+	zap_entry_handle_t handle;
+	int rc;
+
+	rc = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &leaf);
+	if (rc != 0)
+		return (rc);
+
+	rc = zap_leaf_lookup(leaf, zn, &handle);
+	if (rc == 0) {
+		if (integer_size)
+			*integer_size = handle.zeh_integer_size;
+		if (num_integers)
+			*num_integers = handle.zeh_num_integers;
+	}
+
+	zap_put_leaf(leaf);
+	return (rc);
+}
+
+/*
+ * Remove the attribute named by 'zn' from a fat zap and decrement the
+ * zap's entry count.  Returns 0, or the error from locating the leaf or
+ * the entry (nothing is changed in that case).
+ */
+int
+fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
+{
+	zap_leaf_t *leaf;
+	zap_entry_handle_t handle;
+	int rc;
+
+	rc = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &leaf);
+	if (rc != 0)
+		return (rc);
+
+	rc = zap_leaf_lookup(leaf, zn, &handle);
+	if (rc != 0) {
+		zap_put_leaf(leaf);
+		return (rc);
+	}
+
+	zap_entry_remove(&handle);
+	zap_increment_num_entries(zn->zn_zap, -1, tx);
+	zap_put_leaf(leaf);
+
+	return (0);
+}
+
+/*
+ * Walk the zap object 'zapobj' and find the first attribute whose first
+ * integer equals 'value' under 'mask'; a mask of 0 means compare all
+ * bits.  On a match the attribute's name is copied into 'name'.
+ *
+ * Returns 0 if a match was found, otherwise the error with which
+ * zap_cursor_retrieve() ended the walk.
+ *
+ * NOTE(review): the name is copied with an unbounded strcpy() and the
+ * interface carries no buffer length, so 'name' must be at least as large
+ * as za_name -- confirm at the call sites.
+ */
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
+ char *name)
+{
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ if (mask == 0)
+ mask = -1ULL;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((za->za_first_integer & mask) == (value & mask)) {
+ (void) strcpy(name, za->za_name);
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * Fetch the entry at or after the cursor position (zc_hash, zc_cd) into
+ * 'za', and move the cursor onto that entry.
+ *
+ * The cursor keeps its current leaf in zc_leaf between calls: on return
+ * the leaf's lock has been dropped, but the leaf itself is not released
+ * here.  Returns 0 on success, ENOENT when there are no more entries (the
+ * cursor hash is then set to -1ULL), or an error from looking up a leaf
+ * or reading the entry.
+ */
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+ /*
+ * If the cached leaf no longer covers the cursor's hash, let go of
+ * it.  zap_put_leaf() drops the lock as well as the hold, hence the
+ * rw_enter() first.
+ */
+ if (zc->zc_leaf &&
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
+ zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+
+again:
+ if (zc->zc_leaf == NULL) {
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ }
+ l = zc->zc_leaf;
+
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ /*
+ * Nothing left in this leaf: advance the cursor to the first
+ * hash value past the range this leaf covers.
+ */
+ uint64_t nocare =
+ (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+ /* a single all-covering leaf, or wrapped past the last hash */
+ if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ goto again;
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(&zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+
+ za->za_normalization_conflict =
+ zap_entry_normalization_conflict(&zeh,
+ NULL, za->za_name, zap);
+ }
+ /* drop only the lock; the cursor keeps the leaf for the next call */
+ rw_exit(&zc->zc_leaf->l_rwlock);
+ return (err);
+}
+
+
+/*
+ * Accumulate leaf statistics into 'zs' for the 'len' pointer-table
+ * entries in 'tbl'.  Runs of consecutive entries naming the same block
+ * are visited once.  Leaves that cannot be read are skipped silently.
+ */
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ int i, err;
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it'll be accounted for more than once, since
+ * we won't have lastblk.
+ */
+ for (i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_leaf_stats(zap, l, zs);
+ zap_put_leaf(l);
+ }
+ }
+}
+
+/*
+ * Fill in 'zs' for a fat zap: the block size, a copy of the header and
+ * pointer-table fields, and per-leaf statistics gathered by walking the
+ * whole pointer table (embedded or external).  Pointer-table blocks that
+ * cannot be read are skipped silently.
+ */
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ zs->zs_blocksize = 1ULL << bs;
+
+ /*
+ * Set zap_phys_t fields
+ */
+ zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
+ zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
+ zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
+ zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
+ zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
+ zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+
+ /*
+ * Set zap_ptrtbl fields
+ */
+ zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_blks_copied =
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+ } else {
+ int b;
+
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+
+ /* walk the external table one block (1<<(bs-3) entries) at a time */
+ for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ }
+}
diff --git a/zfs/lib/libzpool/zap_leaf.c b/zfs/lib/libzpool/zap_leaf.c
new file mode 100644
index 000000000..132b7af62
--- /dev/null
+++ b/zfs/lib/libzpool/zap_leaf.c
@@ -0,0 +1,853 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zap_leaf.c 1.9 07/11/16 SMI"
+
+/*
+ * The 512-byte leaf is broken into 32 16-byte chunks.
+ * chunk number n means l_chunk[n], even though the header precedes it.
+ * the names are stored null-terminated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+
+/* Forward declaration: used by zap_entry_create() before its definition. */
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+/* half the (current) minimum block size */
+#define MAX_ARRAY_BYTES (8<<10)
+
+/*
+ * Index of the l_hash[] bucket for 64-bit hash h in leaf l. The hash is
+ * shifted right so that its top (ZAP_LEAF_HASH_SHIFT(l) + lh_prefix_len)
+ * bits remain, and the result is masked with
+ * (ZAP_LEAF_HASH_NUMENTRIES(l) - 1); i.e. the bits just below the leaf's
+ * prefix select the bucket (this reading assumes NUMENTRIES is
+ * 1 << ZAP_LEAF_HASH_SHIFT(l) -- TODO confirm in zap_leaf.h).
+ */
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+
+/* Address of the l_hash[] slot (the chain head) that hash h maps to. */
+#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+
+
+/*
+ * Set each of the first n bytes at a to the value c (converted to char).
+ * Nothing is written when n is 0.
+ */
+static void
+zap_memset(void *a, int c, size_t n)
+{
+	char *dst = a;
+	size_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = c;
+}
+
+/*
+ * Store 'value' at addr as an unsigned integer that is len bytes wide
+ * (1, 2, 4 or 8); the assignment truncates value to that width. For any
+ * other len nothing is stored and the ASSERT fires.
+ */
+static void
+stv(int len, void *addr, uint64_t value)
+{
+	if (len == 1) {
+		*(uint8_t *)addr = value;
+	} else if (len == 2) {
+		*(uint16_t *)addr = value;
+	} else if (len == 4) {
+		*(uint32_t *)addr = value;
+	} else if (len == 8) {
+		*(uint64_t *)addr = value;
+	} else {
+		ASSERT(!"bad int len");
+	}
+}
+
+/*
+ * Load the unsigned integer of len bytes (1, 2, 4 or 8) stored at addr
+ * and return it widened to 64 bits. For any other len the ASSERT fires
+ * and the recognisable junk value 0xFEEDFACEDEADBEEF is returned.
+ */
+static uint64_t
+ldv(int len, const void *addr)
+{
+	if (len == 1)
+		return (*(uint8_t *)addr);
+	if (len == 2)
+		return (*(uint16_t *)addr);
+	if (len == 4)
+		return (*(uint32_t *)addr);
+	if (len == 8)
+		return (*(uint64_t *)addr);
+
+	ASSERT(!"bad int len");
+	return (0xFEEDFACEDEADBEEFULL);
+}
+
+/*
+ * Byteswap, in place, a zap leaf block of 'size' bytes: first the header
+ * fields, then every l_hash[] slot, then each chunk according to the
+ * chunk type found in it. Array payload bytes (la_array) are left alone.
+ */
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+ int i;
+ zap_leaf_t l;
+ /*
+ * Wrap buf in a throwaway zap_leaf_t so that the ZAP_LEAF_* macros,
+ * which take a leaf pointer, can be applied to it below.
+ */
+ l.l_bs = highbit(size)-1;
+ l.l_phys = buf;
+
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+ struct zap_leaf_entry *le;
+
+ /* the type is tested as stored, before this chunk is swapped */
+ switch (lc->l_free.lf_type) {
+ case ZAP_CHUNK_ENTRY:
+ le = &lc->l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_CHUNK_FREE:
+ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+ break;
+ case ZAP_CHUNK_ARRAY:
+ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ /* unknown chunk type: the chunk is left unswapped */
+ ASSERT(!"bad leaf type");
+ }
+ }
+}
+
+/*
+ * Format the leaf's buffer as an empty leaf block: derive l_bs from the
+ * dbuf size, zero the header, mark every hash bucket empty, thread all
+ * chunks onto the free list in index order (the zeroed lh_freelist makes
+ * chunk 0 the head), and stamp the block type and magic. If 'sort' is
+ * set the leaf is flagged ZLF_ENTRIES_CDSORTED.
+ */
+void
+zap_leaf_init(zap_leaf_t *l, boolean_t sort)
+{
+	struct zap_leaf_header *hdr;
+	int c;
+
+	/* l_bs is established before any ZAP_LEAF_* macro is applied to l */
+	l->l_bs = highbit(l->l_dbuf->db_size)-1;
+
+	hdr = &l->l_phys->l_hdr;
+	zap_memset(hdr, 0, sizeof (struct zap_leaf_header));
+	/* every byte becomes 0xff, so each 2-byte slot reads CHAIN_END */
+	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+	for (c = 0; c < ZAP_LEAF_NUMCHUNKS(l); c++) {
+		struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, c).l_free;
+
+		zlf->lf_type = ZAP_CHUNK_FREE;
+		zlf->lf_next = c+1;
+	}
+	/* terminate the list at the last chunk */
+	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
+
+	hdr->lh_block_type = ZBT_LEAF;
+	hdr->lh_magic = ZAP_LEAF_MAGIC;
+	hdr->lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+	if (sort)
+		hdr->lh_flags |= ZLF_ENTRIES_CDSORTED;
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+/*
+ * Take the chunk at the head of the leaf's free list and return its
+ * number. The chunk's contents are not touched; the caller sets its
+ * type. There is no failure return: an empty free list only trips the
+ * ASSERT, so callers must check lh_nfree first.
+ */
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+	struct zap_leaf_header *hdr = &l->l_phys->l_hdr;
+	int head;
+
+	ASSERT(hdr->lh_nfree > 0);
+
+	head = hdr->lh_freelist;
+	ASSERT3U(head, <, ZAP_LEAF_NUMCHUNKS(l));
+	ASSERT3U(ZAP_LEAF_CHUNK(l, head).l_free.lf_type, ==, ZAP_CHUNK_FREE);
+
+	/* unlink the head of the free list */
+	hdr->lh_freelist = ZAP_LEAF_CHUNK(l, head).l_free.lf_next;
+	hdr->lh_nfree--;
+
+	return (head);
+}
+
+/*
+ * Return 'chunk' to the leaf's free list: it is retyped ZAP_CHUNK_FREE,
+ * its padding is zeroed, and it becomes the new head of the list.
+ */
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+	struct zap_leaf_header *hdr = &l->l_phys->l_hdr;
+	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
+
+	ASSERT3U(hdr->lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
+
+	zlf->lf_type = ZAP_CHUNK_FREE;
+	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+
+	/* push onto the front of the free list */
+	zlf->lf_next = hdr->lh_freelist;
+	hdr->lh_freelist = chunk;
+	hdr->lh_nfree++;
+}
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+/*
+ * Copy num_integers integers, each integer_size bytes wide, from buf into
+ * a newly allocated chain of array chunks and return the number of the
+ * first chunk (CHAIN_END if num_integers is not positive).
+ *
+ * Each integer is loaded with ldv() and emitted most-significant byte
+ * first into consecutive la_array[] bytes; integers may straddle chunk
+ * boundaries.
+ *
+ * Chunks come from zap_leaf_chunk_alloc(), which has no failure return,
+ * so the caller must already know that enough free chunks exist.
+ */
+static uint16_t
+zap_leaf_array_create(zap_leaf_t *l, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ /* chunkp always points at the link that the next chunk must fill in */
+ uint16_t *chunkp = &chunk_head;
+ /* byten: how many bytes of the current integer have been emitted */
+ int byten = 0;
+ uint64_t value;
+ /* shift that brings the integer's top byte down to bits 0-7 */
+ int shift = (integer_size-1)*8;
+ int len = num_integers;
+
+ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int i;
+
+ la->la_type = ZAP_CHUNK_ARRAY;
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ /* starting a new integer: fetch it from the caller's buffer */
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ la->la_array[i] = value >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
+
+/*
+ * Free every chunk of the array chain whose head is stored at *chunkp,
+ * and reset *chunkp to CHAIN_END.
+ */
+static void
+zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
+{
+	uint16_t cur, next;
+
+	cur = *chunkp;
+	*chunkp = CHAIN_END;
+
+	for (; cur != CHAIN_END; cur = next) {
+		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, cur).l_array;
+
+		/* save the link first: zap_leaf_chunk_free() rewrites the chunk */
+		next = la->la_next;
+		ASSERT3U(la->la_type, ==, ZAP_CHUNK_ARRAY);
+		zap_leaf_chunk_free(l, cur);
+	}
+}
+
+/*
+ * Read the array chain that starts at 'chunk' into buf.
+ *
+ * The stored array holds array_len integers of array_int_len bytes each,
+ * most-significant byte first. buf has room for buf_len integers of
+ * buf_int_len bytes each. MIN(array_len, buf_len) integers are converted
+ * and written with stv(); array_int_len must not exceed buf_int_len.
+ *
+ * array_len and buf_len are in integers, not bytes.
+ */
+static void
+zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ char *buf)
+{
+ int len = MIN(array_len, buf_len);
+ /* byten: bytes of the current integer accumulated in 'value' so far */
+ int byten = 0;
+ uint64_t value = 0;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ /* Fast path for one 8-byte integer */
+ if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ uint8_t *ip = la->la_array;
+ uint64_t *buf64 = (uint64_t *)buf;
+
+ /* reassemble the big-endian bytes held in the first chunk */
+ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+ (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+ return;
+ }
+
+ /* Fast path for an array of 1-byte integers (eg. the entry name) */
+ /*
+ * Whole chunks are copied until the end of the chain, without looking
+ * at array_len, so the last copy may run past the array's end. It is
+ * only taken when buf has more than a full chunk of slack.
+ */
+ if (array_int_len == 1 && buf_int_len == 1 &&
+ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
+ buf += ZAP_LEAF_ARRAY_BYTES;
+ chunk = la->la_next;
+ }
+ return;
+ }
+
+ /* General path: rebuild each integer a byte at a time. */
+ while (len > 0) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int i;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ /*
+ * 'value' is not reset between integers; stv() keeps
+ * only the low buf_int_len bytes.
+ */
+ stv(buf_int_len, buf, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ buf += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
+
+/*
+ * Decide whether the name stored in the array chain starting at 'chunk'
+ * matches zn. Only to be used on 8-bit arrays.
+ *
+ * array_len is the actual length in bytes (not an encoded
+ * le_value_length).
+ *
+ * For MT_FIRST the stored name is copied out and handed to zap_match().
+ * Otherwise the stored bytes are compared directly against
+ * zn->zn_name_orij, one chunk at a time.
+ */
+static boolean_t
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len)
+{
+	int matched;
+
+	if (zn->zn_matchtype == MT_FIRST) {
+		boolean_t same;
+		char *stored = kmem_alloc(array_len, KM_SLEEP);
+
+		zap_leaf_array_read(l, chunk, 1, array_len, 1,
+		    array_len, stored);
+		same = zap_match(zn, stored);
+		kmem_free(stored, array_len);
+		return (same);
+	}
+
+	/* Fast path for exact matching */
+	for (matched = 0; matched < array_len; ) {
+		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+		int span = MIN(array_len - matched, ZAP_LEAF_ARRAY_BYTES);
+
+		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+		if (bcmp(la->la_array, zn->zn_name_orij + matched, span) != 0)
+			break;
+		matched += span;
+		chunk = la->la_next;
+	}
+
+	/* a mismatch leaves 'matched' short of array_len */
+	return (matched == array_len);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+/*
+ * Look up the name described by zn in leaf l.
+ *
+ * On a match the handle *zeh is filled in (value length and integer
+ * size, cd, hash, the link that points at the entry, and the leaf) and 0
+ * is returned; otherwise ENOENT.
+ *
+ * Side effect: if zn->zn_matchtype is MT_BEST and the first pass finds
+ * nothing, zn->zn_matchtype is changed to MT_FIRST and the chain is
+ * searched again; it is not restored afterwards.
+ */
+int
+zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
+{
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+again:
+ /*
+ * chunkp is the link leading to the current entry: first the hash
+ * bucket slot, then each predecessor's le_next.
+ */
+ for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ /* different full hash sharing this bucket */
+ if (le->le_hash != zn->zn_hash)
+ continue;
+
+ /*
+ * NB: the entry chain is always sorted by cd on
+ * normalized zap objects, so this will find the
+ * lowest-cd match for MT_FIRST.
+ */
+ ASSERT(zn->zn_matchtype == MT_EXACT ||
+ (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+ if (zap_leaf_array_match(l, zn, le->le_name_chunk,
+ le->le_name_length)) {
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_leaf = l;
+ return (0);
+ }
+ }
+
+ /*
+ * NB: we could of course do this in one pass, but that would be
+ * a pain. We'll see if MT_BEST is even used much.
+ */
+ if (zn->zn_matchtype == MT_BEST) {
+ zn->zn_matchtype = MT_FIRST;
+ goto again;
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Return (h1,cd1 >= h2,cd2), comparing hash first and cd second.
+ * NB: the arguments are not parenthesized in the expansion and are
+ * evaluated more than once, so pass only simple expressions.
+ */
+#define HCD_GTEQ(h1, cd1, h2, cd2) \
+ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
+
+/*
+ * Find the entry in leaf l with the smallest (hash, cd) pair that is
+ * greater than or equal to (h, cd), and describe it in *zeh.
+ *
+ * Returns 0 if such an entry was found, ENOENT otherwise. Because the
+ * entry's predecessor link is not tracked here, zeh_chunkp is pointed at
+ * zeh_fakechunk, which holds the entry's chunk number.
+ */
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ /* best candidate so far; starts at the largest possible (hash, cd) */
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = ZAP_MAXCD;
+ /* last bucket worth scanning; lowered when a candidate is found */
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
+ uint16_t lh;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ /* start at h's own bucket and move to higher-numbered buckets */
+ for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (chunk = l->l_phys->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ /* at or after the target, and no worse than the best so far */
+ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_leaf = l;
+ }
+ }
+ }
+
+ /* bestcd is still the ZAP_MAXCD sentinel if nothing qualified */
+ return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+}
+
+/*
+ * Copy the value of the entry behind zeh into buf, which has room for
+ * num_integers integers of integer_size bytes each.
+ *
+ * Returns EINVAL (and copies nothing) if the stored integers are wider
+ * than integer_size, EOVERFLOW if the value has more integers than
+ * num_integers (buf then holds the first num_integers of them), and 0
+ * otherwise.
+ */
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+    uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_leaf_t *l = zeh->zeh_leaf;
+	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+	if (le->le_int_size > integer_size)
+		return (EINVAL);
+
+	zap_leaf_array_read(l, le->le_value_chunk, le->le_int_size,
+	    le->le_value_length, integer_size, num_integers, buf);
+
+	return (zeh->zeh_num_integers > num_integers ? EOVERFLOW : 0);
+}
+
+/*
+ * Copy the name of the entry behind zeh into buf (buflen bytes).
+ * Returns EOVERFLOW if the stored name, whose length is le_name_length,
+ * is longer than buflen (buf then holds the first buflen bytes), else 0.
+ */
+int
+zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+{
+	zap_leaf_t *l = zeh->zeh_leaf;
+	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+	zap_leaf_array_read(l, le->le_name_chunk, 1,
+	    le->le_name_length, 1, buflen, buf);
+
+	return (le->le_name_length > buflen ? EOVERFLOW : 0);
+}
+
+/*
+ * Replace the value of the entry behind zeh with num_integers integers
+ * of integer_size bytes each, taken from buf.
+ *
+ * Returns EAGAIN, leaving the entry untouched, if the leaf does not have
+ * enough free chunks for the growth in size; otherwise 0. The zeh
+ * handle's own cached size fields are not updated here.
+ */
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+    uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+	zap_leaf_t *l = zeh->zeh_leaf;
+	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+	int delta_chunks;
+
+	/* extra chunks the new value needs beyond what the old one holds */
+	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
+
+	if (delta_chunks > (int)l->l_phys->l_hdr.lh_nfree)
+		return (EAGAIN);
+
+	/*
+	 * We should search other chained leaves (via
+	 * zap_entry_remove,create?) otherwise returning EAGAIN will
+	 * just send us into an infinite loop if we have to chain
+	 * another leaf block, rather than being able to split this
+	 * block.
+	 */
+
+	/* drop the old value first so that its chunks can be reused */
+	zap_leaf_array_free(l, &le->le_value_chunk);
+	le->le_value_chunk =
+	    zap_leaf_array_create(l, buf, integer_size, num_integers);
+	le->le_int_size = integer_size;
+	le->le_value_length = num_integers;
+
+	return (0);
+}
+
+/*
+ * Delete the entry behind zeh from its leaf: free its name and value
+ * arrays, unlink it from its hash chain, free its entry chunk, and
+ * decrement lh_nentries.
+ *
+ * The handle must carry a real chain link in zeh_chunkp; one whose
+ * zeh_chunkp points at its own zeh_fakechunk is rejected by the ASSERT.
+ */
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+	zap_leaf_t *l = zeh->zeh_leaf;
+	struct zap_leaf_entry *le;
+	uint16_t victim;
+
+	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+	victim = *zeh->zeh_chunkp;
+	le = ZAP_LEAF_ENTRY(l, victim);
+	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+	zap_leaf_array_free(l, &le->le_name_chunk);
+	zap_leaf_array_free(l, &le->le_value_chunk);
+
+	/* unlink before freeing: zap_leaf_chunk_free() rewrites the chunk */
+	*zeh->zeh_chunkp = le->le_next;
+	zap_leaf_chunk_free(l, victim);
+
+	l->l_phys->l_hdr.lh_nentries--;
+}
+
+/*
+ * Add a new entry to leaf l.
+ *
+ * name is the null-terminated entry name (stored with its terminator),
+ * h its hash, and buf holds num_integers integers of integer_size bytes.
+ * If cd is ZAP_MAXCD the lowest collision differentiator not yet used by
+ * an entry with hash h is chosen; otherwise cd is used as given.
+ *
+ * Returns E2BIG if the entry could never fit in a leaf of this size,
+ * EAGAIN if the leaf currently lacks enough free chunks, and 0 on
+ * success, in which case *zeh describes the new entry.
+ */
+int
+zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+ uint64_t namelen, valuelen;
+ int numchunks;
+
+ valuelen = integer_size * num_integers;
+ namelen = strlen(name) + 1;
+ ASSERT(namelen >= 2);
+
+ /* one entry chunk plus the array chunks for the name and the value */
+ numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
+ ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
+ return (E2BIG);
+
+ if (cd == ZAP_MAXCD) {
+ /* find the lowest unused cd */
+ if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+ cd = 0;
+
+ /* chain is cd-sorted: one pass, stopping at the first gap */
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_cd > cd)
+ break;
+ if (le->le_hash == h) {
+ ASSERT3U(cd, ==, le->le_cd);
+ cd++;
+ }
+ }
+ } else {
+ /* old unsorted format; do it the O(n^2) way */
+ for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ }
+ /*
+ * we would run out of space in a block before we could
+ * have ZAP_MAXCD entries
+ */
+ ASSERT3U(cd, <, ZAP_MAXCD);
+ }
+
+ /* checked before allocating: zap_leaf_chunk_alloc() cannot fail */
+ if (l->l_phys->l_hdr.lh_nfree < numchunks)
+ return (EAGAIN);
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
+ le->le_name_length = namelen;
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_length = num_integers;
+ le->le_int_size = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ /* XXX if we did the search above, we could just use that */
+ chunkp = zap_leaf_rehash_entry(l, chunk);
+
+ l->l_phys->l_hdr.lh_nentries++;
+
+ zeh->zeh_leaf = l;
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
+
+/*
+ * Determine if there is another entry with the same normalized form.
+ *
+ * Either zn or name must be provided (the other can be NULL). When zn
+ * is NULL, a zap_name_t is built from name with MT_FIRST, but only once
+ * a candidate with the same hash and a different cd actually shows up,
+ * and it is freed again before returning. Usually there are no hash
+ * conflicts, so the normalized form is rarely needed; all callers have
+ * one of the two on hand anyway. A cleaner but slower interface would
+ * accept neither argument and compute the normalized name as needed
+ * (using zap_name_alloc(zap_entry_read_name(zeh))).
+ *
+ * Always B_FALSE when the zap has no normalization flags.
+ */
+boolean_t
+zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
+    const char *name, zap_t *zap)
+{
+	zap_leaf_t *l = zeh->zeh_leaf;
+	struct zap_leaf_entry *le;
+	boolean_t conflict = B_FALSE;
+	boolean_t allocdzn = B_FALSE;
+	uint64_t chunk;
+
+	if (zap->zap_normflags == 0)
+		return (B_FALSE);
+
+	for (chunk = *LEAF_HASH_ENTPTR(l, zeh->zeh_hash);
+	    chunk != CHAIN_END; chunk = le->le_next) {
+		le = ZAP_LEAF_ENTRY(l, chunk);
+
+		/* skip other hashes, and skip the entry zeh itself describes */
+		if (le->le_hash != zeh->zeh_hash || le->le_cd == zeh->zeh_cd)
+			continue;
+
+		if (zn == NULL) {
+			zn = zap_name_alloc(zap, name, MT_FIRST);
+			allocdzn = B_TRUE;
+		}
+		if (zap_leaf_array_match(l, zn,
+		    le->le_name_chunk, le->le_name_length)) {
+			conflict = B_TRUE;
+			break;
+		}
+	}
+
+	if (allocdzn)
+		zap_name_free(zn);
+	return (conflict);
+}
+
+/*
+ * Routines for transferring entries between leafs.
+ */
+
+/*
+ * Link entry chunk 'entry' into the hash chain selected by its le_hash,
+ * placing it in front of the first chained entry whose cd is larger, and
+ * return the address of the link that now points at it.
+ */
+static uint16_t *
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+	uint16_t *chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
+
+	/*
+	 * keep the entry chain sorted by cd
+	 * NB: this will not cause problems for unsorted leafs, though
+	 * it is unnecessary there.
+	 */
+	while (*chunkp != CHAIN_END) {
+		struct zap_leaf_entry *cur = ZAP_LEAF_ENTRY(l, *chunkp);
+
+		if (cur->le_cd > le->le_cd)
+			break;
+		chunkp = &cur->le_next;
+	}
+
+	/* splice in at *chunkp */
+	le->le_next = *chunkp;
+	*chunkp = entry;
+	return (chunkp);
+}
+
+/*
+ * Move the array chain that starts at 'chunk' in leaf l over to leaf nl.
+ *
+ * For each source chunk a chunk is allocated in nl, the contents are
+ * copied, and the source chunk is freed back to l. Returns the number
+ * of the first chunk of the new chain in nl, or CHAIN_END if the source
+ * chain was empty. nl must have enough free chunks, because
+ * zap_leaf_chunk_alloc() has no failure return.
+ */
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+	uint16_t new_chunk;
+	/* nchunkp always points at the link that the next new chunk fills in */
+	uint16_t *nchunkp = &new_chunk;
+
+	while (chunk != CHAIN_END) {
+		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+		struct zap_leaf_array *nla =
+		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
+		struct zap_leaf_array *la =
+		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
+		/* saved now: zap_leaf_chunk_free() below rewrites the chunk */
+		int nextchunk = la->la_next;
+
+		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+		/* nchunk was allocated from nl, so it is bounded by nl's size */
+		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(nl));
+
+		*nla = *la; /* structure assignment */
+
+		zap_leaf_chunk_free(l, chunk);
+		chunk = nextchunk;
+		*nchunkp = nchunk;
+		nchunkp = &nla->la_next;
+	}
+	*nchunkp = CHAIN_END;
+	return (new_chunk);
+}
+
+/*
+ * Move the entry held in chunk 'entry' of leaf l, together with its name
+ * and value arrays, into leaf nl, and adjust both leaves' lh_nentries.
+ * The entry is linked into nl's hash chain; it is not unlinked from any
+ * chain in l, so the caller is responsible for l's hash table.
+ */
+static void
+zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+{
+ struct zap_leaf_entry *le, *nle;
+ uint16_t chunk;
+
+ le = ZAP_LEAF_ENTRY(l, entry);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ chunk = zap_leaf_chunk_alloc(nl);
+ nle = ZAP_LEAF_ENTRY(nl, chunk);
+ *nle = *le; /* structure assignment */
+
+ /* the copied le_next is replaced when the entry joins nl's chain */
+ (void) zap_leaf_rehash_entry(nl, chunk);
+
+ /* the copied array heads referred to l; replace them with nl's */
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ l->l_phys->l_hdr.lh_nentries--;
+ nl->l_phys->l_hdr.lh_nentries++;
+}
+
+/*
+ * Split leaf l in two, using nl as the new leaf.
+ *
+ * l's prefix grows by one bit (a 0 is appended) and nl takes the same
+ * prefix with a 1 appended. Entries whose hash has that newly
+ * significant bit set are transferred to nl; the rest are re-linked into
+ * l's freshly emptied hash table. If 'sort' is set, l is flagged
+ * ZLF_ENTRIES_CDSORTED. nl's own flags are not touched here.
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
+{
+ int i;
+ /* index of the first hash bit below l's current prefix */
+ int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ l->l_phys->l_hdr.lh_prefix <<= 1;
+ l->l_phys->l_hdr.lh_prefix_len++;
+ nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
+ nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ if (sort)
+ l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+ *
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ /* free and array chunks are skipped; arrays move with their entry */
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ (void) zap_leaf_rehash_entry(l, i);
+ }
+}
+
+/*
+ * Add leaf l's contribution to the histograms in *zs. Every histogram
+ * index is clamped to ZAP_HISTOGRAM_SIZE-1.
+ */
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+	int bucket, n;
+
+	/* pointer-table shift minus this leaf's prefix length */
+	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+	    l->l_phys->l_hdr.lh_prefix_len;
+	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+	zs->zs_leafs_with_2n_pointers[n]++;
+
+	/* number of entries, in units of five */
+	n = l->l_phys->l_hdr.lh_nentries/5;
+	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+	zs->zs_blocks_with_n5_entries[n]++;
+
+	/* tenths of the block not accounted for by free chunks */
+	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+	    (1<<FZAP_BLOCK_SHIFT(zap));
+	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+	zs->zs_blocks_n_tenths_full[n]++;
+
+	for (bucket = 0; bucket < ZAP_LEAF_HASH_NUMENTRIES(l); bucket++) {
+		int chainlen = 0;
+		int chunk;
+
+		for (chunk = l->l_phys->l_hash[bucket]; chunk != CHAIN_END;
+		    chainlen++) {
+			struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk);
+
+			/* chunks used: the entry itself, its name, its value */
+			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
+			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
+			    le->le_int_size);
+			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+			zs->zs_entries_using_n_chunks[n]++;
+
+			chunk = le->le_next;
+		}
+
+		n = chainlen;
+		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+		zs->zs_buckets_with_n_entries[n]++;
+	}
+}
diff --git a/zfs/lib/libzpool/zap_micro.c b/zfs/lib/libzpool/zap_micro.c
new file mode 100644
index 000000000..7aea76b31
--- /dev/null
+++ b/zfs/lib/libzpool/zap_micro.c
@@ -0,0 +1,1069 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zap_micro.c 1.12 08/04/27 SMI"
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+
+#ifdef _KERNEL
+#include <sys/sunddi.h>
+#endif
+
+static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
+
+
+/*
+ * Hash the null-terminated name normname for this zap: a table-driven
+ * CRC over the name's bytes, seeded with the zap's salt, with only the
+ * top ZAP_HASHBITS bits kept.
+ *
+ * NB: name must already be normalized, if necessary.
+ */
+static uint64_t
+zap_hash(zap_t *zap, const char *normname)
+{
+	const uint8_t *cp = (const uint8_t *)normname;
+	uint64_t crc = zap->zap_salt;
+	uint8_t c;
+
+	ASSERT(crc != 0);
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+	while ((c = *cp++) != '\0')
+		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+
+	/*
+	 * Only use 28 bits, since we need 4 bits in the cookie for the
+	 * collision differentiator. We MUST use the high bits, since
+	 * those are the ones that we first pay attention to when
+	 * choosing the bucket.
+	 */
+	return (crc & ~((1ULL << (64 - ZAP_HASHBITS)) - 1));
+}
+
+/*
+ * Normalize 'name' (terminator included) into namenorm using the zap's
+ * normalization flags. namenorm must have room for ZAP_MAXNAMELEN
+ * bytes. Returns the error code reported by u8_textprep_str(), which
+ * stays 0 if that call reports none.
+ */
+static int
+zap_normalize(zap_t *zap, const char *name, char *namenorm)
+{
+	size_t inlen = strlen(name) + 1;
+	size_t outlen = ZAP_MAXNAMELEN;
+	int err = 0;
+
+	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+	    zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
+	    &err);
+
+	return (err);
+}
+
+/*
+ * Does matchname match zn?
+ *
+ * For MT_FIRST, matchname is normalized and compared with zn's
+ * normalized name; if normalization fails the answer is B_FALSE. For
+ * MT_BEST and MT_EXACT, matchname is compared byte for byte with the
+ * original name.
+ */
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+	char norm[ZAP_MAXNAMELEN];
+
+	if (zn->zn_matchtype != MT_FIRST) {
+		/* MT_BEST or MT_EXACT */
+		return (strcmp(zn->zn_name_orij, matchname) == 0);
+	}
+
+	if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
+		return (B_FALSE);
+
+	return (strcmp(zn->zn_name_norm, norm) == 0);
+}
+
+/*
+ * Release a zap_name_t obtained from zap_name_alloc().
+ */
+void
+zap_name_free(zap_name_t *zn)
+{
+	kmem_free(zn, sizeof (*zn));
+}
+
+/*
+ * Build a zap_name_t for looking up 'name' in 'zap' with match type mt.
+ *
+ * The name is not copied: zn_name_orij points at the caller's string,
+ * which must therefore outlive the zap_name_t. If the zap has
+ * normalization flags, the normalized form is placed in zn_normbuf;
+ * otherwise the original name doubles as the normalized name and only
+ * MT_EXACT is accepted.
+ *
+ * Returns NULL if normalization fails, or if a match type other than
+ * MT_EXACT is requested on a zap without normalization. Free the result
+ * with zap_name_free().
+ *
+ * XXX combine this with zap_lockdir()?
+ */
+zap_name_t *
+zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
+{
+	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+	boolean_t usable;
+
+	zn->zn_zap = zap;
+	zn->zn_name_orij = name;
+	zn->zn_matchtype = mt;
+
+	if (zap->zap_normflags == 0) {
+		usable = (mt == MT_EXACT);
+		if (usable)
+			zn->zn_name_norm = zn->zn_name_orij;
+	} else {
+		usable = (zap_normalize(zap, name, zn->zn_normbuf) == 0);
+		if (usable)
+			zn->zn_name_norm = zn->zn_normbuf;
+	}
+
+	if (!usable) {
+		zap_name_free(zn);
+		return (NULL);
+	}
+
+	zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
+	return (zn);
+}
+
+/*
+ * Byteswap, in place, a microzap block of 'size' bytes: the three header
+ * words, then mze_value and mze_cd of each of the (size / MZAP_ENT_LEN)
+ * - 1 entries. Entry names are left untouched.
+ */
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+	int i, max;
+
+	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+	buf->mz_salt = BSWAP_64(buf->mz_salt);
+	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
+
+	max = (size / MZAP_ENT_LEN) - 1;
+	for (i = 0; i < max; i++) {
+		mzap_ent_phys_t *mze = &buf->mz_chunk[i];
+
+		mze->mze_value = BSWAP_64(mze->mze_value);
+		mze->mze_cd = BSWAP_32(mze->mze_cd);
+	}
+}
+
+/*
+ * Byteswap a zap block of either flavour. The first 64-bit word is the
+ * block type: if it equals ZBT_MICRO in either byte order the block is
+ * handed to mzap_byteswap(), otherwise to fzap_byteswap().
+ */
+void
+zap_byteswap(void *buf, size_t size)
+{
+	uint64_t block_type = *(uint64_t *)buf;
+
+	if (block_type != ZBT_MICRO && block_type != BSWAP_64(ZBT_MICRO)) {
+		fzap_byteswap(buf, size);
+		return;
+	}
+
+	mzap_byteswap(buf, size);
+}
+
+/*
+ * AVL comparison function for mzap_ent_t nodes: order by mze_hash, then
+ * by mze_phys.mze_cd. Returns -1, 0 or +1.
+ */
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+	const mzap_ent_t *a = arg1;
+	const mzap_ent_t *b = arg2;
+
+	if (a->mze_hash != b->mze_hash)
+		return (a->mze_hash > b->mze_hash ? +1 : -1);
+
+	if (a->mze_phys.mze_cd != b->mze_phys.mze_cd)
+		return (a->mze_phys.mze_cd > b->mze_phys.mze_cd ? +1 : -1);
+
+	return (0);
+}
+
+/*
+ * Add an in-core node for microzap chunk 'chunkid' to the zap's AVL
+ * tree. The node records the chunk id, the given hash, and a copy of
+ * the physical entry *mzep. The zap must be a microzap and its rwlock
+ * must be held as writer.
+ */
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+{
+	mzap_ent_t *node;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(mzep->mze_cd < ZAP_MAXCD);
+
+	node = kmem_alloc(sizeof (*node), KM_SLEEP);
+	node->mze_phys = *mzep;
+	node->mze_hash = hash;
+	node->mze_chunkid = chunkid;
+
+	avl_add(&zap->zap_m.zap_avl, node);
+}
+
+/*
+ * Find the microzap entry that matches zn, or return NULL.
+ *
+ * The search starts at the first AVL node at or after (zn_hash, cd 0)
+ * and tries zap_match() on every node with that hash. A name too long
+ * to fit in mze_name cannot be present, so it is rejected up front.
+ *
+ * Side effect: for MT_BEST, if the first pass finds nothing,
+ * zn->zn_matchtype is changed to MT_FIRST and the search is repeated.
+ */
+static mzap_ent_t *
+mze_find(zap_name_t *zn)
+{
+	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
+	mzap_ent_t mze_tofind;
+	mzap_ent_t *mze;
+	avl_index_t idx;
+
+	ASSERT(zn->zn_zap->zap_ismicro);
+	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
+
+	if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
+		return (NULL);
+
+	/* search key: only the hash and cd are filled in */
+	mze_tofind.mze_hash = zn->zn_hash;
+	mze_tofind.mze_phys.mze_cd = 0;
+
+	for (;;) {
+		mze = avl_find(avl, &mze_tofind, &idx);
+		if (mze == NULL)
+			mze = avl_nearest(avl, idx, AVL_AFTER);
+
+		while (mze != NULL && mze->mze_hash == zn->zn_hash) {
+			if (zap_match(zn, mze->mze_phys.mze_name))
+				return (mze);
+			mze = AVL_NEXT(avl, mze);
+		}
+
+		if (zn->zn_matchtype != MT_BEST)
+			return (NULL);
+
+		/* nothing matched exactly: retry with MT_FIRST */
+		zn->zn_matchtype = MT_FIRST;
+	}
+}
+
+/*
+ * Return the lowest collision differentiator not used by any microzap
+ * entry with the given hash. The walk starts at the node (hash, cd 0)
+ * and counts upward while consecutive nodes carry consecutive cds; if
+ * there is no such node the answer is 0.
+ */
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+	avl_tree_t *avl = &zap->zap_m.zap_avl;
+	mzap_ent_t mze_tofind;
+	mzap_ent_t *mze;
+	avl_index_t idx;
+	uint32_t cd = 0;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	/* search key: only the hash and cd are filled in */
+	mze_tofind.mze_hash = hash;
+	mze_tofind.mze_phys.mze_cd = 0;
+
+	mze = avl_find(avl, &mze_tofind, &idx);
+	while (mze != NULL && mze->mze_hash == hash &&
+	    mze->mze_phys.mze_cd == cd) {
+		cd++;
+		mze = AVL_NEXT(avl, mze);
+	}
+
+	return (cd);
+}
+
+/*
+ * Take node mze out of the microzap's AVL tree and free it. The zap's
+ * rwlock must be held as writer.
+ */
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+	avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	avl_remove(avl, mze);
+	kmem_free(mze, sizeof (*mze));
+}
+
+/*
+ * Free every node of the microzap's AVL tree, then destroy the tree.
+ */
+static void
+mze_destroy(zap_t *zap)
+{
+	avl_tree_t *avl = &zap->zap_m.zap_avl;
+	void *avlcookie = NULL;
+	mzap_ent_t *mze;
+
+	/* avl_destroy_nodes() hands back one node per call, then NULL */
+	while ((mze = avl_destroy_nodes(avl, &avlcookie)) != NULL)
+		kmem_free(mze, sizeof (mzap_ent_t));
+
+	avl_destroy(avl);
+}
+
+/*
+ * Create the in-core zap_t for object 'obj' of objset 'os', whose first
+ * block is held in db, and attach it to db as the buffer's user data.
+ *
+ * If dmu_buf_set_user() reports that another zap_t is already attached,
+ * the one built here is torn down and the existing one is returned.
+ * Otherwise the new zap_t is filled in from the block (for a microzap
+ * this includes building the AVL tree of entries) and returned. Either
+ * way the returned zap_t is not locked on behalf of the caller.
+ */
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ zap_t *zap;
+ int i;
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, 0, 0, 0);
+ /*
+ * Hold the lock as writer while the zap_t is still being set up; it
+ * becomes visible to others at dmu_buf_set_user() below.
+ */
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ /* the first 64-bit word of the block says whether it is a microzap */
+ if (*(uint64_t *)db->db_data != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+ zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+
+ if (winner != NULL) {
+ /* lost the race: discard ours and use the one already attached */
+ rw_exit(&zap->zap_rwlock);
+ rw_destroy(&zap->zap_rwlock);
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+ }
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
+ zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ /* index every chunk that is in use (non-empty name) */
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap_name_t *zn;
+
+ zap->zap_m.zap_num_entries++;
+ /*
+ * NOTE(review): zap_name_alloc() returns NULL when
+ * normalization fails, and zn is dereferenced
+ * unchecked on the next line -- confirm that this
+ * cannot happen for names already stored on disk.
+ */
+ zn = zap_name_alloc(zap, mze->mze_name,
+ MT_EXACT);
+ mze_insert(zap, i, zn->zn_hash, mze);
+ zap_name_free(zn);
+ }
+ }
+ } else {
+ zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
+ zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap->zap_f.zap_phys->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
+ (uintptr_t)zap->zap_f.zap_phys, ==,
+ zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+}
+
+/*
+ * Hold the first block of zap object 'obj', find or create its in-core
+ * zap_t, lock it, and return it through *zapp.
+ *
+ *   lti       - lock type requested by the caller.
+ *   fatreader - if set and the zap is a fat zap, RW_READER is taken
+ *               regardless of lti.
+ *   adding    - the caller intends to add an entry. With a tx, a
+ *               microzap that is already full is grown by
+ *               SPA_MINBLOCKSIZE, or handed to mzap_upgrade() once it
+ *               would exceed MZAP_MAX_BLKSZ.
+ *
+ * Returns 0 on success, the error from dmu_buf_hold() (with *zapp left
+ * NULL), or whatever mzap_upgrade() returns. The lock and buffer hold
+ * taken here are the ones zap_unlockdir() drops.
+ */
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
+{
+	zap_t *zap;
+	dmu_buf_t *db;
+	krw_t lt;
+	int err;
+
+	*zapp = NULL;
+
+	err = dmu_buf_hold(os, obj, 0, NULL, &db);
+	if (err)
+		return (err);
+
+#ifdef ZFS_DEBUG
+	{
+		dmu_object_info_t doi;
+		dmu_object_info_from_db(db, &doi);
+		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+	}
+#endif
+
+	zap = dmu_buf_get_user(db);
+	if (zap == NULL)
+		zap = mzap_open(os, obj, db);
+
+	/*
+	 * We're checking zap_ismicro without the lock held, in order to
+	 * tell what type of lock we want. Once we have some sort of
+	 * lock, see if it really is the right type. In practice this
+	 * can only be different if it was upgraded from micro to fat,
+	 * and micro wanted WRITER but fat only needs READER.
+	 */
+	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+	rw_enter(&zap->zap_rwlock, lt);
+	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+		/* it was upgraded, now we only need reader */
+		ASSERT(lt == RW_WRITER);
+		/*
+		 * The conditional expression must be parenthesized: "=="
+		 * binds tighter than "?:", so without the parentheses this
+		 * would compare RW_READER against the boolean condition.
+		 */
+		ASSERT(RW_READER ==
+		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+		rw_downgrade(&zap->zap_rwlock);
+		lt = RW_READER;
+	}
+
+	zap->zap_objset = os;
+
+	if (lt == RW_WRITER)
+		dmu_buf_will_dirty(db, tx);
+
+	ASSERT3P(zap->zap_dbuf, ==, db);
+
+	ASSERT(!zap->zap_ismicro ||
+	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+	if (zap->zap_ismicro && tx && adding &&
+	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+		/* the microzap is full: grow the block, or go fat */
+		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+		if (newsz > MZAP_MAX_BLKSZ) {
+			dprintf("upgrading obj %llu: num_entries=%u\n",
+			    obj, zap->zap_m.zap_num_entries);
+			*zapp = zap;
+			return (mzap_upgrade(zapp, tx));
+		}
+		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
+		ASSERT3U(err, ==, 0);
+		/* recompute from db_size after the block-size change */
+		zap->zap_m.zap_num_chunks =
+		    db->db_size / MZAP_ENT_LEN - 1;
+	}
+
+	*zapp = zap;
+	return (0);
+}
+
+/*
+ * Counterpart of zap_lockdir(): drop the zap's rwlock, then release the
+ * hold on its dbuf (taken with a NULL tag).
+ */
+void
+zap_unlockdir(zap_t *zap)
+{
+	dmu_buf_t *db = zap->zap_dbuf;
+
+	rw_exit(&zap->zap_rwlock);
+	dmu_buf_rele(db, NULL);
+}
+
+/*
+ * Convert the microzap *zapp to the non-micro format.
+ *
+ * The caller must hold the zap's rwlock as writer.  The micro block is
+ * copied aside, the object's block size is set to
+ * 1 << fzap_default_block_shift, fzap_upgrade() is run, and every in-use
+ * chunk of the copy is re-added with fzap_add_cd().
+ *
+ * Returns 0 on success, the error from dmu_object_set_blocksize() (in
+ * which case *zapp is untouched), or the first error from fzap_add_cd().
+ * *zapp is rewritten on the way out because fzap_add_cd() may change
+ * the zap.
+ */
+static int
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
+{
+	mzap_phys_t *mzp;
+	int i, sz, nchunks, err;
+	zap_t *zap = *zapp;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	/* Keep a private copy of the micro block to re-add entries from. */
+	sz = zap->zap_dbuf->db_size;
+	mzp = kmem_alloc(sz, KM_SLEEP);
+	bcopy(zap->zap_dbuf->db_data, mzp, sz);
+	nchunks = zap->zap_m.zap_num_chunks;
+
+	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+	    1ULL << fzap_default_block_shift, 0, tx);
+	if (err) {
+		kmem_free(mzp, sz);
+		return (err);
+	}
+
+	dprintf("upgrading obj=%llu with %u chunks\n",
+	    zap->zap_object, nchunks);
+	/* XXX destroy the avl later, so we can use the stored hash value */
+	mze_destroy(zap);
+
+	fzap_upgrade(zap, tx);
+
+	for (i = 0; i < nchunks; i++) {
+		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+		zap_name_t *zn;
+
+		if (mze->mze_name[0] == 0)
+			continue;
+		dprintf("adding %s=%llu\n",
+		    mze->mze_name, mze->mze_value);
+		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
+		/*
+		 * Store the result in the function-level 'err'.  A second
+		 * 'err' declared inside this loop would shadow it and the
+		 * failure would never reach the caller.
+		 */
+		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
+		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
+		zap_name_free(zn);
+		if (err)
+			break;
+	}
+	kmem_free(mzp, sz);
+	*zapp = zap;
+	return (err);
+}
+
+/*
+ * Initialize the first block of object 'obj' as an empty microzap:
+ * dirty it in 'tx' and fill in the block type, salt and normalization
+ * flags.
+ */
+static void
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	mzap_phys_t *mzp;
+	uint64_t salt;
+
+	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+
+#ifdef ZFS_DEBUG
+	{
+		dmu_object_info_t doi;
+
+		dmu_object_info_from_db(db, &doi);
+		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+	}
+#endif
+
+	dmu_buf_will_dirty(db, tx);
+
+	/* Mix two pointers with the object number; the low bit is forced on. */
+	salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+
+	mzp = db->db_data;
+	mzp->mz_block_type = ZBT_MICRO;
+	mzp->mz_salt = salt;
+	mzp->mz_normflags = normflags;
+
+	dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * Claim object number 'obj' as a new ZAP object with normalization
+ * flags of 0.  Returns whatever zap_create_claim_norm() returns.
+ */
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	const int normflags = 0;
+
+	return (zap_create_claim_norm(os, obj, normflags, ot, bonustype,
+	    bonuslen, tx));
+}
+
+/*
+ * Claim object number 'obj' with dmu_object_claim() and, if that
+ * succeeds, initialize it as a microzap with the given normalization
+ * flags.  Returns 0 or the dmu_object_claim() error.
+ */
+int
+zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
+    dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+
+	if (err == 0)
+		mzap_create_impl(os, obj, normflags, tx);
+
+	return (err);
+}
+
+/*
+ * Allocate a new ZAP object with normalization flags of 0 and return
+ * its object number (see zap_create_norm()).
+ */
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	const int normflags = 0;
+
+	return (zap_create_norm(os, normflags, ot, bonustype, bonuslen, tx));
+}
+
+/*
+ * Allocate a new object with dmu_object_alloc(), initialize it as a
+ * microzap with the given normalization flags, and return its object
+ * number.
+ */
+uint64_t
+zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	uint64_t obj;
+
+	obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+	mzap_create_impl(os, obj, normflags, tx);
+
+	return (obj);
+}
+
+/*
+ * Destroy ZAP object 'zapobj'.  Returns the dmu_object_free() result.
+ */
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+	int err;
+
+	/*
+	 * dmu_object_free will free the object number and free the
+	 * data.  Freeing the data will cause our pageout function to be
+	 * called, which will destroy our data (zap_leaf_t's and zap_t).
+	 */
+	err = dmu_object_free(os, zapobj, tx);
+
+	return (err);
+}
+
+/*
+ * Eviction callback handed to dmu_buf_set_user() for a zap_t: destroy
+ * the rwlock and the format-specific state, then free the zap_t itself.
+ * The dbuf argument is unused.
+ */
+_NOTE(ARGSUSED(0))
+void
+zap_evict(dmu_buf_t *db, void *vzap)
+{
+	zap_t *zap = vzap;
+
+	rw_destroy(&zap->zap_rwlock);
+
+	if (!zap->zap_ismicro) {
+		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+	} else {
+		/* microzap: tear down the in-core entry tree */
+		mze_destroy(zap);
+	}
+
+	kmem_free(zap, sizeof (zap_t));
+}
+
+/*
+ * Store the number of entries of ZAP object 'zapobj' in *count.
+ * Returns 0, or the error from zap_lockdir() / fzap_count().
+ */
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+
+	if (zap->zap_ismicro) {
+		/* the microzap keeps its entry count in core */
+		*count = zap->zap_m.zap_num_entries;
+	} else {
+		err = fzap_count(zap, count);
+	}
+
+	zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Report whether another microzap entry with the same hash as 'mze'
+ * matches its name under zap_match().  Always B_FALSE when the zap has
+ * no normalization flags.
+ *
+ * zn may be NULL; if not specified, it will be computed (from mze's
+ * name, with MT_FIRST) if needed and freed before returning.
+ * See also the comment above zap_entry_normalization_conflict().
+ */
+static boolean_t
+mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
+{
+	const int directions[2] = { AVL_BEFORE, AVL_AFTER };
+	boolean_t allocdzn = B_FALSE;
+	boolean_t conflict = B_FALSE;
+	mzap_ent_t *other;
+	int d;
+
+	if (zap->zap_normflags == 0)
+		return (B_FALSE);
+
+	/* Scan equal-hash neighbours before mze first, then those after. */
+	for (d = 0; d < 2 && !conflict; d++) {
+		other = avl_walk(&zap->zap_m.zap_avl, mze, directions[d]);
+		while (other != NULL && other->mze_hash == mze->mze_hash) {
+			if (zn == NULL) {
+				zn = zap_name_alloc(zap,
+				    mze->mze_phys.mze_name, MT_FIRST);
+				allocdzn = B_TRUE;
+			}
+			if (zap_match(zn, other->mze_phys.mze_name)) {
+				conflict = B_TRUE;
+				break;
+			}
+			other = avl_walk(&zap->zap_m.zap_avl, other,
+			    directions[d]);
+		}
+	}
+
+	if (allocdzn)
+		zap_name_free(zn);
+	return (conflict);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+/*
+ * Exact-match lookup of 'name' in 'zapobj': zap_lookup_norm() with
+ * MT_EXACT, no real-name buffer and no conflict flag.
+ */
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	char *realname = NULL;
+	boolean_t *ncp = NULL;
+
+	return (zap_lookup_norm(os, zapobj, name, integer_size,
+	    num_integers, buf, MT_EXACT, realname, 0, ncp));
+}
+
+/*
+ * Look up 'name' in ZAP object 'zapobj' using match type 'mt' and copy
+ * the value into 'buf'.
+ *
+ * For a microzap the value is a single 8-byte integer: EOVERFLOW is
+ * returned if num_integers < 1 and EINVAL if integer_size != 8.  When
+ * 'realname' is non-NULL the stored name is copied into it (at most
+ * rn_len bytes, via strlcpy); when 'ncp' is non-NULL it receives the
+ * result of mzap_normalization_conflict().  Other zaps are handled by
+ * fzap_lookup().
+ *
+ * Returns 0, the zap_lockdir() error, ENOTSUP if zap_name_alloc()
+ * fails, ENOENT if the name is not present in a microzap, or one of
+ * the errors above.
+ */
+int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+	int err;
+	mzap_ent_t *mze;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc(zap, name, mt);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+
+	if (!zap->zap_ismicro) {
+		err = fzap_lookup(zn, integer_size, num_integers, buf,
+		    realname, rn_len, ncp);
+	} else {
+		mze = mze_find(zn);
+		if (mze == NULL) {
+			err = ENOENT;
+		} else {
+			if (num_integers < 1) {
+				err = EOVERFLOW;
+			} else if (integer_size != 8) {
+				err = EINVAL;
+			} else {
+				*(uint64_t *)buf = mze->mze_phys.mze_value;
+				/*
+				 * zap_lookup() passes realname == NULL; do
+				 * not hand a NULL destination to strlcpy().
+				 */
+				if (realname != NULL) {
+					(void) strlcpy(realname,
+					    mze->mze_phys.mze_name, rn_len);
+				}
+				if (ncp) {
+					*ncp = mzap_normalization_conflict(zap,
+					    zn, mze);
+				}
+			}
+		}
+	}
+	zap_name_free(zn);
+	zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Report the integer size and count of the value stored under 'name'.
+ * Either output pointer may be NULL in the microzap path, where every
+ * value is one 8-byte integer; other zaps are handled by fzap_length().
+ *
+ * Returns 0, the zap_lockdir() error, ENOTSUP if zap_name_alloc()
+ * fails, or ENOENT if the name is not in a microzap.
+ */
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_t *zap;
+	zap_name_t *zn;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+
+	zn = zap_name_alloc(zap, name, MT_EXACT);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+
+	if (zap->zap_ismicro) {
+		if (mze_find(zn) == NULL) {
+			err = ENOENT;
+		} else {
+			if (integer_size != NULL)
+				*integer_size = 8;
+			if (num_integers != NULL)
+				*num_integers = 1;
+		}
+	} else {
+		err = fzap_length(zn, integer_size, num_integers);
+	}
+
+	zap_name_free(zn);
+	zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Store zn's original name with 'value' in the first free chunk of the
+ * microzap, searching from zap_alloc_next and wrapping around to chunk
+ * 0 once.  The caller must hold the zap's rwlock as writer.  If no
+ * chunk is free the only effect is a failed ASSERT.
+ */
+static void
+mzap_addent(zap_name_t *zn, uint64_t value)
+{
+	int i;
+	zap_t *zap = zn->zn_zap;
+	int start = zap->zap_m.zap_alloc_next;
+	uint32_t cd;
+
+	dprintf("obj=%llu %s=%llu\n", zap->zap_object,
+	    zn->zn_name_orij, value);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+	/* the name must not already be stored in any chunk */
+	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
+	}
+#endif
+
+	cd = mze_find_unused_cd(zap, zn->zn_hash);
+	/* given the limited size of the microzap, this can't happen */
+	ASSERT(cd != ZAP_MAXCD);
+
+	for (;;) {
+		for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
+			mzap_ent_phys_t *mze =
+			    &zap->zap_m.zap_phys->mz_chunk[i];
+
+			if (mze->mze_name[0] != 0)
+				continue;	/* chunk in use */
+
+			mze->mze_value = value;
+			mze->mze_cd = cd;
+			(void) strcpy(mze->mze_name, zn->zn_name_orij);
+			zap->zap_m.zap_num_entries++;
+			zap->zap_m.zap_alloc_next = i+1;
+			if (zap->zap_m.zap_alloc_next ==
+			    zap->zap_m.zap_num_chunks)
+				zap->zap_m.zap_alloc_next = 0;
+			mze_insert(zap, i, zn->zn_hash, mze);
+			return;
+		}
+		if (start == 0)
+			break;
+		/* nothing free at or after the hint; rescan from the top */
+		start = 0;
+	}
+	ASSERT(!"out of entries!");
+}
+
+/*
+ * Add 'name' with the given value to ZAP object 'zapobj'.
+ *
+ * A microzap only holds single 8-byte integers under names shorter than
+ * MZAP_NAME_LEN; anything else upgrades the object with mzap_upgrade()
+ * and adds through fzap_add().  Returns 0, the zap_lockdir() error,
+ * ENOTSUP if zap_name_alloc() fails, EEXIST if the name is already in
+ * the microzap, or the mzap_upgrade() / fzap_add() error.
+ */
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	zap_name_t *zn;
+	const uint64_t *intval = val;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	if (err)
+		return (err);
+
+	zn = zap_name_alloc(zap, name, MT_EXACT);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+
+	if (zap->zap_ismicro && integer_size == 8 && num_integers == 1 &&
+	    strlen(name) < MZAP_NAME_LEN) {
+		/* the entry fits the microzap format */
+		if (mze_find(zn) != NULL)
+			err = EEXIST;
+		else
+			mzap_addent(zn, *intval);
+	} else {
+		if (zap->zap_ismicro) {
+			dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+			    zapobj, integer_size, num_integers, name);
+			err = mzap_upgrade(&zn->zn_zap, tx);
+		}
+		if (err == 0)
+			err = fzap_add(zn, integer_size, num_integers, val, tx);
+		zap = zn->zn_zap;	/* fzap_add() may change zap */
+	}
+
+	ASSERT(zap == zn->zn_zap);
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_add() failed */
+		zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Set 'name' to the given value in ZAP object 'zapobj', adding the
+ * entry if it does not exist.
+ *
+ * A microzap only holds single 8-byte integers under names shorter than
+ * MZAP_NAME_LEN; anything else upgrades the object with mzap_upgrade()
+ * and goes through fzap_update().  Returns 0, the zap_lockdir() error,
+ * ENOTSUP if zap_name_alloc() fails, or the mzap_upgrade() /
+ * fzap_update() error.
+ */
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	zap_name_t *zn;
+	mzap_ent_t *mze;
+	const uint64_t *intval = val;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	if (err)
+		return (err);
+
+	zn = zap_name_alloc(zap, name, MT_EXACT);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+
+	if (zap->zap_ismicro && integer_size == 8 && num_integers == 1 &&
+	    strlen(name) < MZAP_NAME_LEN) {
+		/* the entry fits the microzap format */
+		mze = mze_find(zn);
+		if (mze == NULL) {
+			mzap_addent(zn, *intval);
+		} else {
+			/* update the in-core copy and the on-disk chunk */
+			mze->mze_phys.mze_value = *intval;
+			zap->zap_m.zap_phys->mz_chunk
+			    [mze->mze_chunkid].mze_value = *intval;
+		}
+	} else {
+		if (zap->zap_ismicro) {
+			dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+			    zapobj, integer_size, num_integers, name);
+			err = mzap_upgrade(&zn->zn_zap, tx);
+		}
+		if (err == 0) {
+			err = fzap_update(zn, integer_size, num_integers,
+			    val, tx);
+		}
+		zap = zn->zn_zap;	/* fzap_update() may change zap */
+	}
+
+	ASSERT(zap == zn->zn_zap);
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
+		zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Remove the entry exactly matching 'name': zap_remove_norm() with
+ * MT_EXACT.
+ */
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+	int err;
+
+	err = zap_remove_norm(os, zapobj, name, MT_EXACT, tx);
+	return (err);
+}
+
+/*
+ * Remove the entry matching 'name' under match type 'mt' from ZAP
+ * object 'zapobj'.  In a microzap the on-disk chunk is zeroed and the
+ * in-core entry removed; other zaps are handled by fzap_remove().
+ *
+ * Returns 0, the zap_lockdir() error, ENOTSUP if zap_name_alloc()
+ * fails, ENOENT if the name is not in a microzap, or the fzap_remove()
+ * result.
+ */
+int
+zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+    matchtype_t mt, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	zap_name_t *zn;
+	mzap_ent_t *mze;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+
+	zn = zap_name_alloc(zap, name, mt);
+	if (zn == NULL) {
+		zap_unlockdir(zap);
+		return (ENOTSUP);
+	}
+
+	if (zap->zap_ismicro) {
+		mze = mze_find(zn);
+		if (mze != NULL) {
+			zap->zap_m.zap_num_entries--;
+			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+			    sizeof (mzap_ent_phys_t));
+			mze_remove(zap, mze);
+		} else {
+			err = ENOENT;
+		}
+	} else {
+		err = fzap_remove(zn, tx);
+	}
+
+	zap_name_free(zn);
+	zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this.  So use a small hash value so
+ * we can fit 4 bits of cd into the 32-bit cursor.
+ *
+ * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
+ *
+ * Initialize 'zc' from a value produced by zap_cursor_serialize().
+ * -1ULL yields a cursor whose hash is -1ULL; a collision differentiator
+ * of ZAP_MAXCD or more is treated as corrupt and reset to 0.
+ */
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized)
+{
+	zc->zc_objset = os;
+	zc->zc_zapobj = zapobj;
+	zc->zc_zap = NULL;
+	zc->zc_leaf = NULL;
+
+	if (serialized == -1ULL) {
+		zc->zc_hash = -1ULL;
+		zc->zc_cd = 0;
+		return;
+	}
+
+	zc->zc_hash = serialized << (64-ZAP_HASHBITS);
+	zc->zc_cd = serialized >> ZAP_HASHBITS;
+	if (zc->zc_cd >= ZAP_MAXCD)	/* corrupt serialized */
+		zc->zc_cd = 0;
+}
+
+/*
+ * Initialize 'zc' from a serialized value of 0, i.e. hash 0 and
+ * collision differentiator 0.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+	const uint64_t serialized = 0;
+
+	zap_cursor_init_serialized(zc, os, zapobj, serialized);
+}
+
+/*
+ * Release whatever the cursor still references.  The zap and the leaf
+ * are each re-locked as reader first, because zap_unlockdir() and
+ * zap_put_leaf() are the calls used to release them.
+ */
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+	if (zc->zc_zap != NULL) {
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+		zap_unlockdir(zc->zc_zap);
+		zc->zc_zap = NULL;
+	}
+
+	if (zc->zc_leaf != NULL) {
+		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+		zap_put_leaf(zc->zc_leaf);
+		zc->zc_leaf = NULL;
+	}
+
+	zc->zc_objset = NULL;
+}
+
+/*
+ * Pack the cursor position into one 64-bit value: the top ZAP_HASHBITS
+ * bits of the hash in the low bits, the collision differentiator above
+ * them.  A cursor whose hash is -1ULL serializes to -1ULL.
+ */
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+	uint64_t hashpart, cdpart;
+
+	if (zc->zc_hash == -1ULL)
+		return (-1ULL);
+
+	ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
+	ASSERT(zc->zc_cd < ZAP_MAXCD);
+
+	hashpart = zc->zc_hash >> (64-ZAP_HASHBITS);
+	cdpart = (uint64_t)zc->zc_cd << ZAP_HASHBITS;
+
+	return (hashpart | cdpart);
+}
+
+/*
+ * Fill 'za' with the entry at, or the first entry after, the cursor
+ * position, and move the cursor onto that entry.
+ *
+ * The zap is locked with zap_lockdir() on first use and re-locked as
+ * reader on later calls; the rwlock is dropped again before returning.
+ * Returns 0 on success, ENOENT when the cursor hash is -1ULL or a
+ * microzap has no further entry (the hash is then set to -1ULL), the
+ * zap_lockdir() error, or the fzap_cursor_retrieve() result.
+ */
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+	int err;
+	avl_index_t where;
+	mzap_ent_t key;
+	mzap_ent_t *mze;
+
+	if (zc->zc_hash == -1ULL)
+		return (ENOENT);
+
+	if (zc->zc_zap != NULL) {
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+	} else {
+		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+		    RW_READER, TRUE, FALSE, &zc->zc_zap);
+		if (err)
+			return (err);
+	}
+
+	if (zc->zc_zap->zap_ismicro) {
+		/* search the in-core tree for (hash, cd) or its successor */
+		key.mze_hash = zc->zc_hash;
+		key.mze_phys.mze_cd = zc->zc_cd;
+
+		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &key, &where);
+		if (mze == NULL) {
+			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+			    where, AVL_AFTER);
+		}
+
+		if (mze == NULL) {
+			zc->zc_hash = -1ULL;
+			err = ENOENT;
+		} else {
+			ASSERT(0 == bcmp(&mze->mze_phys,
+			    &zc->zc_zap->zap_m.zap_phys->mz_chunk
+			    [mze->mze_chunkid], sizeof (mze->mze_phys)));
+
+			za->za_normalization_conflict =
+			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
+			za->za_integer_length = 8;
+			za->za_num_integers = 1;
+			za->za_first_integer = mze->mze_phys.mze_value;
+			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
+			zc->zc_hash = mze->mze_hash;
+			zc->zc_cd = mze->mze_phys.mze_cd;
+			err = 0;
+		}
+	} else {
+		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+	}
+
+	rw_exit(&zc->zc_zap->zap_rwlock);
+	return (err);
+}
+
+/*
+ * Step the cursor to the next (hash, cd) position.  The collision
+ * differentiator is bumped first; when it reaches ZAP_MAXCD it wraps to
+ * 0 and the hash advances by one unit of its top ZAP_HASHBITS bits.  A
+ * hash that wraps to 0 becomes -1ULL (EOF), after which this function
+ * does nothing.
+ */
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+	if (zc->zc_hash == -1ULL)
+		return;
+
+	zc->zc_cd++;
+	if (zc->zc_cd < ZAP_MAXCD)
+		return;
+
+	zc->zc_cd = 0;
+	zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
+	if (zc->zc_hash == 0)	/* EOF */
+		zc->zc_hash = -1ULL;
+}
+
+/*
+ * Zero *zs and fill it with statistics for ZAP object 'zapobj'.  A
+ * microzap reports its block size, entry count and one block; other
+ * zaps are filled in by fzap_get_stats().  Returns 0 or the
+ * zap_lockdir() error.
+ */
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+
+	bzero(zs, sizeof (zap_stats_t));
+
+	if (!zap->zap_ismicro) {
+		fzap_get_stats(zap, zs);
+	} else {
+		zs->zs_blocksize = zap->zap_dbuf->db_size;
+		zs->zs_num_entries = zap->zap_m.zap_num_entries;
+		zs->zs_num_blocks = 1;
+	}
+
+	zap_unlockdir(zap);
+	return (0);
+}
diff --git a/zfs/lib/libzpool/zfs_byteswap.c b/zfs/lib/libzpool/zfs_byteswap.c
new file mode 100644
index 000000000..2e98f84bb
--- /dev/null
+++ b/zfs/lib/libzpool/zfs_byteswap.c
@@ -0,0 +1,175 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zfs_byteswap.c 1.3 07/10/25 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+
+/*
+ * Byte-swap, in place, the four fields of each of the 'ace_cnt' ace_t
+ * entries starting at 'ace'.
+ */
+void
+zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
+{
+	int i = 0;
+
+	while (i != ace_cnt) {
+		ace->a_type = BSWAP_16(ace->a_type);
+		ace->a_flags = BSWAP_16(ace->a_flags);
+		ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+		ace->a_who = BSWAP_32(ace->a_who);
+		ace++;
+		i++;
+	}
+}
+
+/*
+ * swap ace_t and ace_object_t
+ *
+ * Walks the 'size' bytes at 'buf' one entry at a time and byte-swaps
+ * each entry in place. zfs_layout selects the entry format: zfs_ace_t
+ * when true, ace_t otherwise.
+ *
+ * The entry type is taken from the already-swapped flags
+ * (flags & ACE_TYPE_FLAGS). ACE_OWNER, ACE_EVERYONE and
+ * (ACE_IDENTIFIER_GROUP | ACE_GROUP) entries advance by the header size
+ * (zfs_ace_hdr_t, or ace_t in the old layout). For every other entry
+ * type z_fuid is swapped as well (zfs layout only) and the swapped ACE
+ * type decides between the object-ACE size and the plain ACE size.
+ */
+void
+zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
+{
+ caddr_t end;
+ caddr_t ptr;
+ zfs_ace_t *zacep;
+ ace_t *acep;
+ uint16_t entry_type;
+ size_t entry_size;
+ int ace_type;
+
+ end = (caddr_t)buf + size;
+ ptr = buf;
+
+ while (ptr < end) {
+ if (zfs_layout) {
+ zacep = (zfs_ace_t *)ptr;
+ zacep->z_hdr.z_access_mask =
+ BSWAP_32(zacep->z_hdr.z_access_mask);
+ zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
+ ace_type = zacep->z_hdr.z_type =
+ BSWAP_16(zacep->z_hdr.z_type);
+ entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ } else {
+ acep = (ace_t *)ptr;
+ acep->a_access_mask = BSWAP_32(acep->a_access_mask);
+ acep->a_flags = BSWAP_16(acep->a_flags);
+ ace_type = acep->a_type = BSWAP_16(acep->a_type);
+ acep->a_who = BSWAP_32(acep->a_who);
+ entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+ }
+ switch (entry_type) {
+ case ACE_OWNER:
+ case ACE_EVERYONE:
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ entry_size = zfs_layout ?
+ sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ default:
+ if (zfs_layout) {
+ zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+ }
+ switch (ace_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ entry_size = zfs_layout ?
+ sizeof (zfs_object_ace_t) :
+ sizeof (ace_object_t);
+ break;
+ default:
+ entry_size = zfs_layout ? sizeof (zfs_ace_t) :
+ sizeof (ace_t);
+ break;
+ }
+ }
+ ptr = ptr + entry_size;
+ }
+}
+
+/*
+ * Byte-swap a buffer of old-format ACEs.
+ */
+/* ARGSUSED */
+void
+zfs_oldacl_byteswap(void *buf, size_t size)
+{
+	/*
+	 * Arggh, since we don't know how many ACEs are in
+	 * the array, we have to swap the entire block
+	 */
+	int cnt = (int)(size / sizeof (ace_t));
+
+	zfs_oldace_byteswap((ace_t *)buf, cnt);
+}
+
+/*
+ * Byte-swap 'size' bytes of ACEs at 'buf' using the zfs_ace_t layout.
+ */
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+	const boolean_t zfs_layout = B_TRUE;
+
+	zfs_ace_byteswap(buf, size, zfs_layout);
+}
+
+/*
+ * Byte-swap a znode_phys_t in place: the four timestamps, the scalar
+ * fields, the first three pad words, the ACL header, and finally the
+ * embedded ACE data.  The ACE data is swapped in the format selected by
+ * the already-swapped ACL version.
+ */
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+	znode_phys_t *zp = buf;
+	int i;
+
+	ASSERT(size >= sizeof (znode_phys_t));
+
+	/* each timestamp is a pair of 64-bit words */
+	for (i = 0; i < 2; i++) {
+		zp->zp_crtime[i] = BSWAP_64(zp->zp_crtime[i]);
+		zp->zp_atime[i] = BSWAP_64(zp->zp_atime[i]);
+		zp->zp_mtime[i] = BSWAP_64(zp->zp_mtime[i]);
+		zp->zp_ctime[i] = BSWAP_64(zp->zp_ctime[i]);
+	}
+
+	zp->zp_gen = BSWAP_64(zp->zp_gen);
+	zp->zp_mode = BSWAP_64(zp->zp_mode);
+	zp->zp_size = BSWAP_64(zp->zp_size);
+	zp->zp_parent = BSWAP_64(zp->zp_parent);
+	zp->zp_links = BSWAP_64(zp->zp_links);
+	zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+	zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+	zp->zp_flags = BSWAP_64(zp->zp_flags);
+	zp->zp_uid = BSWAP_64(zp->zp_uid);
+	zp->zp_gid = BSWAP_64(zp->zp_gid);
+	zp->zp_zap = BSWAP_64(zp->zp_zap);
+
+	for (i = 0; i < 3; i++)
+		zp->zp_pad[i] = BSWAP_64(zp->zp_pad[i]);
+
+	zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+	zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size);
+	zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+	zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count);
+
+	if (zp->zp_acl.z_acl_version != ZFS_ACL_VERSION) {
+		zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
+		    ACE_SLOT_CNT);
+	} else {
+		zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
+		    ZFS_ACE_SPACE);
+	}
+}
diff --git a/zfs/lib/libzpool/zfs_fm.c b/zfs/lib/libzpool/zfs_fm.c
new file mode 100644
index 000000000..4be77103f
--- /dev/null
+++ b/zfs/lib/libzpool/zfs_fm.c
@@ -0,0 +1,355 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zfs_fm.c 1.6 08/04/01 SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/sysevent.h>
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA.
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same terminology as used in the rest
+ * of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * blockpointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/O always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that when it is first set subsequent failures
+ * will use the same ENA. If a physical I/O is issued (by passing the
+ * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
+ * unique ENA will be generated. For an aggregate I/O, this pointer is set to
+ * NULL, and no ereport will be generated (since it doesn't actually correspond
+ * to any particular device or piece of data).
+ */
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+ /* The whole body is compiled only under _KERNEL; otherwise a no-op. */
+#ifdef _KERNEL
+ nvlist_t *ereport, *detector;
+ uint64_t ena;
+ char class[64];
+
+ /*
+ * If we are doing a spa_tryimport(), ignore errors.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return;
+
+ /*
+ * Ignore any errors from I/Os that we are going to retry anyway - we
+ * only generate errors from the final failure. Checksum errors are
+ * generated after the pipeline stage responsible for retrying the I/O
+ * (VDEV_IO_ASSESS), so this only applies to standard I/O errors.
+ */
+ if (zio && zio_should_retry(zio) && zio->io_error != ECKSUM)
+ return;
+
+ /*
+ * If this is not a read or write zio, ignore the error. This can occur
+ * if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio && zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return;
+
+ /*
+ * Allocate the ereport and detector nvlists; if either allocation
+ * fails, free what was allocated and return without posting.
+ */
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return;
+ }
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE) {
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+ ena = zio->io_logical->io_ena;
+ } else {
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ }
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
+
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ *
+ * The direct reference to spa_name is used rather than spa_name()
+ * because of the asynchronous nature of the zio pipeline. spa_name()
+ * asserts that the config lock is held in some form. This is always
+ * the case in I/O context, but because the check for RW_WRITER compares
+ * against 'curthread', we may be in an asynchronous context and blow
+ * this assert. Rather than loosen this assert, we acknowledge that all
+ * contexts in which this function is called (pool open, I/O) are safe,
+ * and dereference the name directly.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
+ DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ spa->spa_load_state, NULL);
+
+ /* Payload describing the vdev and, when present, its parent. */
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
+ if (vd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (vd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
+
+ if (pvd != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
+ if (pvd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
+ if (pvd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
+ }
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, zio->io_offset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, zio->io_size, NULL);
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zio->io_logical != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT64,
+ zio->io_logical->io_bookmark.zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_blkid, NULL);
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
+ }
+ mutex_exit(&spa->spa_errlist_lock);
+
+ /* Post after dropping spa_errlist_lock, then free both nvlists. */
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+#endif /* _KERNEL */
+}
+
+/*
+ * Build and post a "<FM_RSRC_RESOURCE>.<ZFS_ERROR_CLASS>.<name>"
+ * resource event carrying the pool guid and, when 'vd' is non-NULL, the
+ * vdev guid.  Nothing is posted if the nvlist cannot be allocated, and
+ * the body is compiled only under _KERNEL.
+ */
+static void
+zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
+{
+#ifdef _KERNEL
+	char class[64];
+	nvlist_t *resource = fm_nvlist_create(NULL);
+
+	if (resource == NULL)
+		return;
+
+	(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
+	    ZFS_ERROR_CLASS, name);
+
+	VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
+	VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
+	VERIFY(nvlist_add_uint64(resource,
+	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
+	if (vd != NULL) {
+		VERIFY(nvlist_add_uint64(resource,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
+	}
+
+	fm_ereport_post(resource, EVCH_SLEEP);
+	fm_nvlist_destroy(resource, FM_NVA_FREE);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.ok' event is an internal signal that the associated
+ * resource (pool or disk) has been identified by ZFS as healthy. This will
+ * then trigger the DE to close the associated case, if any.
+ *
+ * Posted through zfs_post_common() with the FM_RESOURCE_OK name. 'vd' is
+ * passed through unchanged; zfs_post_common() omits the vdev guid when
+ * it is NULL.
+ */
+void
+zfs_post_ok(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_OK);
+}
+
+/*
+ * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
+ * has been removed from the system. This will cause the DE to ignore any
+ * recent I/O errors, inferring that they are due to the asynchronous device
+ * removal.
+ *
+ * Posted through zfs_post_common() with the FM_RESOURCE_REMOVED name.
+ */
+void
+zfs_post_remove(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
+}
+
+/*
+ * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
+ * has the 'autoreplace' property set, and therefore any broken vdevs will be
+ * handled by higher level logic, and no vdev fault should be generated.
+ *
+ * Posted through zfs_post_common() with the FM_RESOURCE_AUTOREPLACE name.
+ */
+void
+zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
+}
diff --git a/zfs/lib/libzpool/zfs_znode.c b/zfs/lib/libzpool/zfs_znode.c
new file mode 100644
index 000000000..18ab8350f
--- /dev/null
+++ b/zfs/lib/libzpool/zfs_znode.c
@@ -0,0 +1,1390 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+
+#pragma ident "@(#)zfs_znode.c 1.34 08/04/27 SMI"
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/mkdev.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/mode.h>
+#include <sys/atomic.h>
+#include <vm/pvn.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/fs/zfs.h>
+#include <sys/kidmap.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+
+#include "zfs_prop.h"
+
+/*
+ * Functions needed for userland (ie: libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+/*
+ * kmem cache from which every znode_t is allocated. Created by
+ * zfs_znode_init() and destroyed (and reset to NULL) by zfs_znode_fini().
+ */
+struct kmem_cache *znode_cache = NULL;
+
+/*
+ * Eviction callback installed on a znode's bonus dbuf by
+ * zfs_znode_dmu_init(). It is never expected to run: reaching it means the
+ * dbuf was evicted while a znode was still registered on it, so we panic.
+ */
+/*ARGSUSED*/
+static void
+znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
+{
+ /*
+ * We should never drop all dbuf refs without first clearing
+ * the eviction callback.
+ */
+ panic("evicting znode %p\n", user_ptr);
+}
+
+/*
+ * Constructor for znode_cache objects: allocates the companion vnode,
+ * cross-links it with the znode, and initializes the znode's locks and
+ * its range-lock AVL tree.  Always returns 0.
+ */
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	znode_t *zp = buf;
+	vnode_t *vp = vn_alloc(KM_SLEEP);
+
+	/* Tie the vnode and the znode together. */
+	vp->v_data = (caddr_t)zp;
+	zp->z_vnode = vp;
+
+	/* Mutexes. */
+	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Reader/writer locks. */
+	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
+
+	/* Tree of outstanding range locks, ordered by zfs_range_compare. */
+	avl_create(&zp->z_range_avl, zfs_range_compare,
+	    sizeof (rl_t), offsetof(rl_t, r_node));
+
+	/* A freshly constructed znode has no dbuf and no directory locks. */
+	zp->z_dirlocks = 0;
+	zp->z_dbuf = NULL;
+
+	return (0);
+}
+
+/*
+ * Destructor for znode_cache objects: undoes everything done by
+ * zfs_znode_cache_constructor().  The znode must be idle, i.e. hold no
+ * directory locks, have no dbuf attached, and have a vnode hold count
+ * of zero.
+ */
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *cdarg)
+{
+	znode_t *zp = buf;
+	vnode_t *vp = ZTOV(zp);
+
+	/* Sanity-check that nothing still references this znode. */
+	ASSERT(zp->z_dirlocks == 0);
+	ASSERT(zp->z_dbuf == NULL);
+	ASSERT(vp->v_count == 0);
+
+	mutex_destroy(&zp->z_lock);
+	rw_destroy(&zp->z_map_lock);
+	rw_destroy(&zp->z_parent_lock);
+	rw_destroy(&zp->z_name_lock);
+	mutex_destroy(&zp->z_acl_lock);
+
+	avl_destroy(&zp->z_range_avl);
+	mutex_destroy(&zp->z_range_lock);
+
+	vn_free(vp);
+}
+
+/*
+ * Create the kmem cache that backs all znode_t allocations, wiring in the
+ * constructor and destructor defined above. Asserts that the cache has not
+ * already been created.
+ */
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Tear down what zfs_znode_init() and zfs_create_op_tables() set up:
+ * release the vfs/vnode operation tables, then destroy the znode cache
+ * if it exists.
+ */
+void
+zfs_znode_fini(void)
+{
+	/* Operation tables first ... */
+	zfs_remove_op_tables();
+
+	/* ... then the znode cache itself. */
+	if (znode_cache != NULL) {
+		kmem_cache_destroy(znode_cache);
+		znode_cache = NULL;
+	}
+}
+
+/*
+ * vnode operation vectors, built from the matching *_template arrays by
+ * zfs_create_op_tables() and released by zfs_remove_op_tables().
+ * zfs_znode_alloc() selects one per vnode according to its type.
+ */
+struct vnodeops *zfs_dvnodeops; /* directories */
+struct vnodeops *zfs_fvnodeops; /* regular files, devices, fifos, sockets, doors */
+struct vnodeops *zfs_symvnodeops; /* symbolic links */
+struct vnodeops *zfs_xdvnodeops; /* extended attribute directories */
+struct vnodeops *zfs_evnodeops; /* any other vnode type */
+
+/*
+ * Release the vfs operations registered for the ZFS file system type and
+ * every vnode operation vector that has been created, leaving all of the
+ * vector pointers (and zfsfstype) cleared.
+ */
+void
+zfs_remove_op_tables()
+{
+	struct vnodeops **tables[] = {
+		&zfs_dvnodeops,
+		&zfs_fvnodeops,
+		&zfs_symvnodeops,
+		&zfs_xdvnodeops,
+		&zfs_evnodeops
+	};
+	size_t t;
+
+	/*
+	 * Remove vfs ops
+	 */
+	ASSERT(zfsfstype);
+	(void) vfs_freevfsops_by_type(zfsfstype);
+	zfsfstype = 0;
+
+	/*
+	 * Remove vnode ops: free whichever vectors exist and reset
+	 * every pointer.
+	 */
+	for (t = 0; t < sizeof (tables) / sizeof (tables[0]); t++) {
+		if (*tables[t] != NULL)
+			vn_freevnodeops(*tables[t]);
+		*tables[t] = NULL;
+	}
+}
+
+/*
+ * Operation templates defined elsewhere; zfs_create_op_tables() passes each
+ * one to vn_make_ops() to build the corresponding zfs_*vnodeops vector.
+ */
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+
+/*
+ * Build the five ZFS vnode operation vectors from their templates.
+ * Returns 0 on success, or the first vn_make_ops() error; vectors built
+ * before a failure are left in place.
+ */
+int
+zfs_create_op_tables()
+{
+	int error;
+
+	/*
+	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+	 * In this case we just return as the ops vectors are already set up.
+	 */
+	if (zfs_dvnodeops != NULL)
+		return (0);
+
+	if ((error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
+	    &zfs_dvnodeops)) != 0)
+		return (error);
+
+	if ((error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
+	    &zfs_fvnodeops)) != 0)
+		return (error);
+
+	if ((error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
+	    &zfs_symvnodeops)) != 0)
+		return (error);
+
+	if ((error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
+	    &zfs_xdvnodeops)) != 0)
+		return (error);
+
+	/* Last table: its result is the result of the whole operation. */
+	return (vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
+	    &zfs_evnodeops));
+}
+
+/*
+ * zfs_init_fs - Initialize the zfsvfs struct and the file system
+ *	incore "master" object.  Verify version compatibility.
+ *
+ *	IN:	zfsvfs	- file system state to fill in; z_os and z_vfs
+ *			  are read here and must already be set.
+ *		cr	- credentials used if the root file system has
+ *			  to be created on first mount.
+ *
+ *	OUT:	zpp	- held root znode on success, NULL on failure.
+ *
+ *	RETURN:	0 if success
+ *		ENOTSUP if the on-disk ZPL version is newer than ours
+ *		other error code if a property or object lookup fails
+ *
+ * On success the z_hold_mtx[] mutexes are initialized; on every failure
+ * path they are either never initialized or destroyed again before
+ * returning.
+ */
+int
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+{
+	extern int zfsfstype;
+
+	objset_t *os = zfsvfs->z_os;
+	int i, error;
+	dmu_object_info_t doi;
+	uint64_t fsid_guid;
+	uint64_t zval;
+
+	*zpp = NULL;
+
+	/*
+	 * XXX - hack to auto-create the pool root filesystem at
+	 * the first attempted mount.
+	 */
+	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
+		dmu_tx_t *tx = dmu_tx_create(os);
+		uint64_t zpl_version;
+		nvlist_t *zprops;
+
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		ASSERT3U(error, ==, 0);
+
+		/* Pools that predate FUID support get the last pre-FUID ZPL. */
+		if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+			zpl_version = ZPL_VERSION;
+		else
+			zpl_version = ZPL_VERSION_FUID - 1;
+
+		VERIFY(nvlist_alloc(&zprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		VERIFY(nvlist_add_uint64(zprops,
+		    zfs_prop_to_name(ZFS_PROP_VERSION), zpl_version) == 0);
+		zfs_create_fs(os, cr, zprops, tx);
+		nvlist_free(zprops);
+		dmu_tx_commit(tx);
+	}
+
+	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+	if (error) {
+		return (error);
+	} else if (zfsvfs->z_version > ZPL_VERSION) {
+		(void) printf("Mismatched versions:  File system "
+		    "is version %llu on-disk format, which is "
+		    "incompatible with this software version %lld!",
+		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+		return (ENOTSUP);
+	}
+
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
+		return (error);
+	zfsvfs->z_norm = (int)zval;
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
+		return (error);
+	zfsvfs->z_utf8 = (zval != 0);
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
+		return (error);
+	zfsvfs->z_case = (uint_t)zval;
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+	 */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+	    zfsvfs->z_case == ZFS_CASE_MIXED)
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+	/*
+	 * The fsid is 64 bits, composed of an 8-bit fs type, which
+	 * separates our fsid from any other filesystem types, and a
+	 * 56-bit objset unique ID.  The objset unique ID is unique to
+	 * all objsets open on this system, provided by unique_create().
+	 * The 8-bit fs type must be put in the low bits of fsid[1]
+	 * because that's where other Solaris filesystems put it.
+	 */
+	fsid_guid = dmu_objset_fsid_guid(os);
+	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
+	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+	    (zfsfstype & 0xFF);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+	    &zfsvfs->z_root);
+	if (error)
+		return (error);
+	ASSERT(zfsvfs->z_root != 0);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+	    &zfsvfs->z_unlinkedobj);
+	if (error)
+		return (error);
+
+	/*
+	 * Initialize zget mutexes
+	 */
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
+	if (error) {
+		/*
+		 * On error, we destroy the mutexes here since it's not
+		 * possible for the caller to determine if the mutexes were
+		 * initialized properly.
+		 */
+		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+		return (error);
+	}
+	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
+
+	/*
+	 * The FUID table is optional, so a missing entry is not an error.
+	 * Any other lookup failure is reported to the caller instead of
+	 * being silently discarded.
+	 */
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+	    &zfsvfs->z_fuid_obj);
+	if (error == ENOENT)
+		error = 0;
+	if (error) {
+		/*
+		 * Undo the work done so far, as for a zfs_zget() failure:
+		 * drop the hold on the root znode first (releasing it
+		 * needs the hold mutexes), then destroy the mutexes.
+		 */
+		VN_RELE(ZTOV(*zpp));
+		*zpp = NULL;
+		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Layout of the 64-bit expanded device number that zfs_expldev() produces
+ * and zfs_cmpldev() consumes: the major number sits above NBITSMINOR64
+ * bits of minor number. These are defined here (unless the system headers
+ * already provide them) so that they are available in both 64 and 32 bit
+ * environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Expand a native dev_t into the 64-bit device number format.
+ *
+ * The standard expldev() cannot be used because it takes a dev32_t in
+ * LP64 and expands it to a long dev_t; what is needed here is an
+ * interface that takes a dev32_t in ILP32 and expands it to a long
+ * dev_t.  In an LP64 build the value is returned as is.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+#ifdef _LP64
+	return (dev);
+#else
+	/* Pull the 32-bit layout apart, then repack it in 64-bit layout. */
+	major_t mjr = ((major_t)dev >> NBITSMINOR32) & MAXMAJ32;
+	minor_t mnr = (minor_t)dev & MAXMIN32;
+
+	return (((uint64_t)mjr << NBITSMINOR64) | mnr);
+#endif
+}
+
+/*
+ * Compress a 64-bit device number into a native dev_t.
+ *
+ * The standard cmpldev() cannot be used because it takes a long dev_t
+ * and compresses it to a dev32_t in LP64; what is needed here is a
+ * compaction of a long dev_t to a dev32_t in ILP32.  In an LP64 build
+ * the value is returned as is.  In ILP32, NODEV32 is returned when the
+ * major or minor number does not fit the 32-bit layout.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+#ifdef _LP64
+	return (dev);
+#else
+	major_t mjr = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
+	minor_t mnr = (minor_t)dev & MAXMIN64;
+
+	if (mjr <= MAXMAJ32 && mnr <= MAXMIN32)
+		return (((dev32_t)mjr << NBITSMINOR32) | mnr);
+
+	return (NODEV32);
+#endif
+}
+
+/*
+ * Attach a znode to its bonus dbuf: record the dbuf in zp->z_dbuf and
+ * register the znode as the dbuf's user, with znode_evict_error as the
+ * eviction callback and &zp->z_phys as the user data pointer slot passed to
+ * dmu_buf_set_user_ie(). Panics if the dbuf already has a user.
+ *
+ * The caller must hold the object's ZFS_OBJ_MUTEX; z_lock is taken here.
+ */
+static void
+zfs_znode_dmu_init(znode_t *zp, dmu_buf_t *db)
+{
+ znode_t *nzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp)));
+
+ mutex_enter(&zp->z_lock);
+
+ ASSERT(zp->z_dbuf == NULL);
+ zp->z_dbuf = db;
+ nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
+
+ /*
+ * there should be no
+ * concurrent zgets on this object.
+ */
+ if (nzp != NULL)
+ panic("existing znode %p for dbuf %p", nzp, db);
+
+ /*
+ * Slap on VROOT if we are the root znode
+ */
+ if (zp->z_id == zfsvfs->z_root)
+ ZTOV(zp)->v_flag |= VROOT;
+
+ mutex_exit(&zp->z_lock);
+ vn_exists(ZTOV(zp));
+}
+
+/*
+ * Detach a znode from its bonus dbuf: clear zp->z_dbuf, remove the znode as
+ * the dbuf's user, and drop the dbuf hold.
+ *
+ * The caller must hold the object's ZFS_OBJ_MUTEX, or the znode must be
+ * unlinked, or z_teardown_inactive_lock must be held as writer.
+ */
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ dmu_buf_t *db = zp->z_dbuf;
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp)) || zp->z_unlinked ||
+ RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
+ ASSERT(zp->z_dbuf != NULL);
+ zp->z_dbuf = NULL;
+ /* The dbuf's registered user must still be this znode. */
+ VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
+ dmu_buf_rele(db, NULL);
+}
+
+/*
+ * Construct a new znode/vnode pair for an object and initialize it.
+ *
+ * IN: zfsvfs - file system the object belongs to
+ * db - held bonus dbuf of the object; the znode is attached to
+ * it via zfs_znode_dmu_init()
+ * blksz - data block size to record in the znode
+ *
+ * RETURN: the new znode, already inserted on zfsvfs->z_all_znodes and
+ * with a hold taken on zfsvfs->z_vfs (dropped in zfs_znode_free()).
+ *
+ * The caller must hold the object's ZFS_OBJ_MUTEX, as required by
+ * zfs_znode_dmu_init().
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
+{
+ znode_t *zp;
+ vnode_t *vp;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT(zp->z_dbuf == NULL);
+
+ zp->z_phys = NULL;
+ zp->z_zfsvfs = zfsvfs;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_mapcnt = 0;
+ zp->z_last_itx = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ vp = ZTOV(zp);
+ vn_reinit(vp);
+
+ /* After this call z_phys points at the object's znode_phys_t. */
+ zfs_znode_dmu_init(zp, db);
+
+ zp->z_gen = zp->z_phys->zp_gen;
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ vp->v_vfsp = zfsvfs->z_parent->z_vfs;
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+
+ /* Pick the vnode operation vector that matches the file type. */
+ switch (vp->v_type) {
+ case VDIR:
+ if (zp->z_phys->zp_flags & ZFS_XATTR) {
+ vn_setops(vp, zfs_xdvnodeops);
+ vp->v_flag |= V_XATTRDIR;
+ } else {
+ vn_setops(vp, zfs_dvnodeops);
+ }
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+ case VBLK:
+ case VCHR:
+ vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
+ /*FALLTHROUGH*/
+ case VFIFO:
+ case VSOCK:
+ case VDOOR:
+ vn_setops(vp, zfs_fvnodeops);
+ break;
+ case VREG:
+ vp->v_flag |= VMODSORT;
+ vn_setops(vp, zfs_fvnodeops);
+ break;
+ case VLNK:
+ vn_setops(vp, zfs_symvnodeops);
+ break;
+ default:
+ vn_setops(vp, zfs_evnodeops);
+ break;
+ }
+
+ VFS_HOLD(zfsvfs->z_vfs);
+ return (zp);
+}
+
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ *	IN:	dzp	- parent directory for new znode
+ *		vap	- file attributes for new znode
+ *		tx	- dmu transaction id for zap operations
+ *		cr	- credentials of caller
+ *		flag	- flags:
+ *			  IS_ROOT_NODE	- new object will be root
+ *			  IS_XATTR	- new object is an attribute
+ *			  IS_REPLAY	- intent log replay
+ *		bonuslen - length of bonus buffer
+ *		setaclp	- File/Dir initial ACL
+ *		fuidp	- Tracks fuid allocation.
+ *
+ *	OUT:	zpp	- allocated znode
+ *
+ * When IS_ROOT_NODE is set, dzp is the (half-initialized) root znode
+ * itself: its z_dbuf, z_phys and z_id are filled in here and it is
+ * returned through zpp instead of a newly allocated znode.
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+	uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
+	zfs_fuid_info_t **fuidp)
+{
+	dmu_buf_t	*db;
+	znode_phys_t	*pzp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	timestruc_t	now;
+	uint64_t	gen, obj;
+	int		err;
+
+	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
+		obj = vap->va_nodeid;
+		flag |= IS_REPLAY;
+		now = vap->va_ctime;		/* see zfs_replay_create() */
+		gen = vap->va_nblocks;		/* ditto */
+	} else {
+		obj = 0;
+		gethrestime(&now);
+		gen = dmu_tx_get_txg(tx);
+	}
+
+	/*
+	 * Create a new DMU object.
+	 */
+	/*
+	 * There's currently no mechanism for pre-reading the blocks that will
+	 * be needed to allocate a new object, so we accept the small chance
+	 * that there will be an i/o error and we will fail one of the
+	 * assertions below.
+	 */
+	if (vap->va_type == VDIR) {
+		if (flag & IS_REPLAY) {
+			err = zap_create_claim_norm(zfsvfs->z_os, obj,
+			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+			ASSERT3U(err, ==, 0);
+		} else {
+			obj = zap_create_norm(zfsvfs->z_os,
+			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+		}
+	} else {
+		if (flag & IS_REPLAY) {
+			err = dmu_object_claim(zfsvfs->z_os, obj,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+			ASSERT3U(err, ==, 0);
+		} else {
+			obj = dmu_object_alloc(zfsvfs->z_os,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+		}
+	}
+	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
+	dmu_buf_will_dirty(db, tx);
+
+	/*
+	 * Initialize the znode physical data to zero.
+	 */
+	ASSERT(db->db_size >= sizeof (znode_phys_t));
+	bzero(db->db_data, db->db_size);
+	pzp = db->db_data;
+
+	/*
+	 * If this is the root, fix up the half-initialized parent pointer
+	 * to reference the just-allocated physical data area.
+	 */
+	if (flag & IS_ROOT_NODE) {
+		dzp->z_dbuf = db;
+		dzp->z_phys = pzp;
+		dzp->z_id = obj;
+	}
+
+	/*
+	 * If parent is an xattr, so am I.
+	 */
+	if (dzp->z_phys->zp_flags & ZFS_XATTR)
+		flag |= IS_XATTR;
+
+	if (vap->va_type == VBLK || vap->va_type == VCHR) {
+		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
+	}
+
+	if (zfsvfs->z_use_fuids)
+		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+
+	if (vap->va_type == VDIR) {
+		pzp->zp_size = 2;		/* contents ("." and "..") */
+		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+	}
+
+	pzp->zp_parent = dzp->z_id;
+	if (flag & IS_XATTR)
+		pzp->zp_flags |= ZFS_XATTR;
+
+	pzp->zp_gen = gen;
+
+	/* Creation and change times are always "now". */
+	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
+	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+
+	/* Access and modify times honor values supplied by the caller. */
+	if (vap->va_mask & AT_ATIME) {
+		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+	} else {
+		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+	}
+
+	if (vap->va_mask & AT_MTIME) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+	} else {
+		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+	}
+
+	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	if (!(flag & IS_ROOT_NODE)) {
+		/* zfs_znode_alloc() requires the object's hold mutex. */
+		ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+	} else {
+		/*
+		 * If we are creating the root node, the "parent" we
+		 * passed in is the znode for the root.
+		 */
+		*zpp = dzp;
+	}
+	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
+}
+
+/*
+ * Apply the optional attributes requested in xvap to the znode.
+ *
+ * For every attribute marked as requested (XVA_ISSET_REQ), the value from
+ * the xoptattr_t is stored in the znode's physical data and the attribute
+ * is marked as returned (XVA_SET_RTN). The create time is written to
+ * zp_crtime, the flag-style attributes go through ZFS_ATTR_SET, and the
+ * anti-virus scanstamp is copied into the bonus buffer.
+ *
+ * xvap must carry an optional-attribute section (asserted).
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
+{
+ xoptattr_t *xoap;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ /*
+ * The scanstamp is stored directly behind the znode_phys_t
+ * (z_phys + 1), and ZFS_BONUS_SCANSTAMP records its presence.
+ * NOTE(review): this assumes the bonus buffer has room for the
+ * scanstamp after the znode_phys_t - confirm with the callers.
+ */
+ (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp));
+ zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+}
+
+/*
+ * Look up (or instantiate) the znode for an object number.
+ *
+ *	IN:	zfsvfs	- file system to look in
+ *		obj_num	- object number of the znode
+ *
+ *	OUT:	zpp	- held znode on success, NULL on failure
+ *
+ *	RETURN:	0 if success
+ *		EINVAL if the object's bonus buffer is not a znode
+ *		ENOENT if the znode exists in core but is unlinked
+ *		error from dmu_bonus_hold() otherwise
+ */
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_object_info_t info;
+	dmu_buf_t *bonus;
+	znode_t *zp;
+	int err;
+
+	*zpp = NULL;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &bonus);
+	if (err != 0)
+		goto out;
+
+	/* Only objects whose bonus buffer holds a znode_phys_t qualify. */
+	dmu_object_info_from_db(bonus, &info);
+	if (info.doi_bonus_type != DMU_OT_ZNODE ||
+	    info.doi_bonus_size < sizeof (znode_phys_t)) {
+		dmu_buf_rele(bonus, NULL);
+		err = EINVAL;
+		goto out;
+	}
+
+	zp = dmu_buf_get_user(bonus);
+	if (zp == NULL) {
+		/*
+		 * Not found create new znode/vnode.  The bonus hold
+		 * taken above is kept by the new znode.
+		 */
+		*zpp = zfs_znode_alloc(zfsvfs, bonus,
+		    info.doi_data_block_size);
+		goto out;
+	}
+
+	mutex_enter(&zp->z_lock);
+
+	/*
+	 * Since we do immediate eviction of the z_dbuf, we
+	 * should never find a dbuf with a znode that doesn't
+	 * know about the dbuf.
+	 */
+	ASSERT3P(zp->z_dbuf, ==, bonus);
+	ASSERT3U(zp->z_id, ==, obj_num);
+	if (zp->z_unlinked) {
+		err = ENOENT;
+	} else {
+		VN_HOLD(ZTOV(zp));
+		*zpp = zp;
+	}
+	/* The existing znode has its own hold on the dbuf; drop ours. */
+	dmu_buf_rele(bonus, NULL);
+	mutex_exit(&zp->z_lock);
+
+out:
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+	return (err);
+}
+
+/*
+ * Re-attach an existing in-core znode to its on-disk object.
+ *
+ * The object's bonus dbuf is held again and handed to
+ * zfs_znode_dmu_init(), and the cached unlinked state and block size
+ * are refreshed from disk.
+ *
+ *	RETURN:	0 if success
+ *		EINVAL if the object's bonus buffer is not a znode
+ *		EIO if the on-disk generation differs from zp->z_gen
+ *		error from dmu_bonus_hold() otherwise
+ */
+int
+zfs_rezget(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t obj_num = zp->z_id;
+	dmu_object_info_t info;
+	dmu_buf_t *bonus;
+	int err;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &bonus);
+	if (err != 0)
+		goto out;
+
+	dmu_object_info_from_db(bonus, &info);
+	if (info.doi_bonus_type != DMU_OT_ZNODE ||
+	    info.doi_bonus_size < sizeof (znode_phys_t)) {
+		dmu_buf_rele(bonus, NULL);
+		err = EINVAL;
+		goto out;
+	}
+
+	/* A different generation means the object is no longer this file. */
+	if (((znode_phys_t *)bonus->db_data)->zp_gen != zp->z_gen) {
+		dmu_buf_rele(bonus, NULL);
+		err = EIO;
+		goto out;
+	}
+
+	zfs_znode_dmu_init(zp, bonus);
+	zp->z_unlinked = (zp->z_phys->zp_links == 0);
+	zp->z_blksz = info.doi_data_block_size;
+
+out:
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+	return (err);
+}
+
+/*
+ * Free a znode's on-disk object (plus its external ACL object, if it
+ * has one) within transaction tx, then detach and free the in-core
+ * znode.
+ */
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zfsvfs->z_os;
+	uint64_t obj = zp->z_id;
+	uint64_t acl_obj;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+
+	/* An ACL stored outside the znode lives in its own object. */
+	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+	if (acl_obj != 0)
+		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+
+	VERIFY(0 == dmu_object_free(os, obj, tx));
+	zfs_znode_dmu_fini(zp);
+
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+
+	zfs_znode_free(zp);
+}
+
+/*
+ * Drop one vnode hold on a znode and, if that was the last one and no
+ * cached pages remain, dispose of the znode: an unlinked znode is handed to
+ * zfs_rmnode(); otherwise it is detached from its dbuf and freed.
+ *
+ * The znode must still be attached to its dbuf (asserted).
+ */
+void
+zfs_zinactive(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_dbuf && zp->z_phys);
+
+ /*
+ * Don't allow a zfs_zget() while we're trying to release this znode
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&vp->v_lock);
+ vp->v_count--;
+ if (vp->v_count > 0 || vn_has_cached_data(vp)) {
+ /*
+ * If the hold count is greater than zero, somebody has
+ * obtained a new reference on this znode while we were
+ * processing it here, so we are done. If we still have
+ * mapped pages then we are also done, since we don't
+ * want to inactivate the znode until the pages get pushed.
+ *
+ * XXX - if vn_has_cached_data(vp) is true, but count == 0,
+ * this seems like it would leave the znode hanging with
+ * no chance to go inactive...
+ */
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ return;
+ }
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * If this was the last reference to a file with no links,
+ * remove the file from the file system.
+ */
+ if (zp->z_unlinked) {
+ /* Both locks are dropped before calling zfs_rmnode(). */
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_rmnode(zp);
+ return;
+ }
+ mutex_exit(&zp->z_lock);
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_znode_free(zp);
+}
+
+/*
+ * Release an in-core znode: invalidate its vnode, remove it from the file
+ * system's list of znodes, return it to znode_cache, and drop the VFS hold
+ * that zfs_znode_alloc() took.
+ */
+void
+zfs_znode_free(znode_t *zp)
+{
+ /* Saved up front: zp must not be touched after kmem_cache_free(). */
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ vn_invalid(ZTOV(zp));
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ kmem_cache_free(znode_cache, zp);
+
+ VFS_RELE(zfsvfs->z_vfs);
+}
+
+/*
+ * Stamp the timestamps selected by flag (AT_ATIME, AT_MTIME, AT_CTIME)
+ * with the current time.  The caller must hold zp->z_lock.
+ *
+ * With a transaction, the znode's dbuf is dirtied in it and z_seq is
+ * bumped; without one, the znode is only marked z_atime_dirty.  On file
+ * systems that use FUIDs, a modify-time update also sets the ARCHIVE and
+ * AV_MODIFIED flags and a change-time update sets ARCHIVE.
+ */
+void
+zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+	timestruc_t now;
+	znode_phys_t *pzp;
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+
+	gethrestime(&now);
+
+	if (tx == NULL) {
+		zp->z_atime_dirty = 1;
+	} else {
+		dmu_buf_will_dirty(zp->z_dbuf, tx);
+		zp->z_atime_dirty = 0;
+		zp->z_seq++;
+	}
+
+	pzp = zp->z_phys;
+
+	if (flag & AT_ATIME) {
+		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+	}
+
+	if (flag & AT_MTIME) {
+		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+		if (zp->z_zfsvfs->z_use_fuids)
+			pzp->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
+	}
+
+	if (flag & AT_CTIME) {
+		ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+		if (zp->z_zfsvfs->z_use_fuids)
+			pzp->zp_flags |= ZFS_ARCHIVE;
+	}
+}
+
+/*
+ * Update the requested znode timestamps with the current time.
+ * If we are in a transaction, then go ahead and mark the znode
+ * dirty in the transaction so the timestamps will go to disk.
+ * Otherwise, we will get pushed next time the znode is updated
+ * in a transaction, or when this znode eventually goes inactive.
+ *
+ * Why is this OK?
+ * 1 - Only the ACCESS time is ever updated outside of a transaction.
+ * 2 - Multiple consecutive updates will be collapsed into a single
+ * znode update by the transaction grouping semantics of the DMU.
+ *
+ * This is the variant for callers that do not already hold zp->z_lock:
+ * it takes the lock around zfs_time_stamper_locked().
+ *
+ * IN: zp - znode to stamp
+ * flag - mask of AT_ATIME, AT_MTIME, AT_CTIME
+ * tx - open transaction, or NULL when outside a transaction
+ */
+void
+zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ mutex_enter(&zp->z_lock);
+ zfs_time_stamper_locked(zp, flag, tx);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Grow the block size for a file.
+ *
+ *	IN:	zp	- znode of file whose block size is to grow.
+ *		size	- requested block size
+ *		tx	- open transaction.
+ *
+ * Requests that do not enlarge the block size are ignored, as is ENOTSUP
+ * from the DMU.  On success zp->z_blksz is refreshed with the block size
+ * the object actually ended up with.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+	u_longlong_t nblocks;	/* filled in by the DMU, not used here */
+	int error;
+
+	/* Never shrink, and nothing to do if the size is unchanged. */
+	if (size <= zp->z_blksz)
+		return;
+
+	/*
+	 * If the file size is already greater than the current blocksize,
+	 * we will not grow.  If there is more than one block in a file,
+	 * the blocksize cannot change.
+	 */
+	if (zp->z_blksz != 0 && zp->z_phys->zp_size > zp->z_blksz)
+		return;
+
+	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+	    size, 0, tx);
+	if (error != ENOTSUP) {
+		ASSERT3U(error, ==, 0);
+
+		/* What blocksize did we actually get? */
+		dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &nblocks);
+	}
+}
+
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage(). E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ *
+ * It is never expected to be called: debug builds assert, non-debug
+ * builds simply report success.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+ int flags, cred_t *cr)
+{
+ ASSERT(0);
+ return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free (0 => to EOF).
+ * flag - current file open mode flags.
+ * log - if set, update the file's timestamps and record the
+ * operation in the intent log.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Besides freeing, this also extends the file when off + len lies beyond
+ * the current end of file, and truncates it to off when len is 0.
+ * EPERM is returned for immutable or read-only files, and for append-only
+ * files when off is below the current size. An error from dmu_tx_assign()
+ * is returned to the caller after the transaction has been aborted.
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ rl_t *rl;
+ uint64_t end = off + len;
+ uint64_t size, new_blksz;
+ uint64_t pflags = zp->z_phys->zp_flags;
+ int error;
+
+ if ((pflags & (ZFS_IMMUTABLE|ZFS_READONLY)) ||
+ off < zp->z_phys->zp_size && (pflags & ZFS_APPENDONLY))
+ return (EPERM);
+
+ if (ZTOV(zp)->v_type == VFIFO)
+ return (0);
+
+ /*
+ * If we will change zp_size then lock the whole file,
+ * otherwise just lock the range being freed.
+ */
+ if (len == 0 || off + len > zp->z_phys->zp_size) {
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ } else {
+ rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ /* recheck, in case zp_size changed */
+ if (off + len > zp->z_phys->zp_size) {
+ /* lost race: file size changed, lock whole file */
+ zfs_range_unlock(rl);
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ }
+ }
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ size = zp->z_phys->zp_size;
+ if (len == 0 && size == off && off != 0) {
+ zfs_range_unlock(rl);
+ return (0);
+ }
+
+ /*
+ * Check for any locks in the region to be freed.
+ */
+ if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
+ uint64_t start = off;
+ uint64_t extent = len;
+
+ /*
+ * When extending, check from the current end of file; when
+ * truncating, check from off to the current end of file.
+ */
+ if (off > size) {
+ start = size;
+ extent += off - size;
+ } else if (len == 0) {
+ extent = size - off;
+ }
+ if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
+ zfs_range_unlock(rl);
+ return (error);
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ new_blksz = 0;
+ if (end > size &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
+ } else if (off < size) {
+ /*
+ * If len == 0, we are truncating the file.
+ */
+ dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ /*
+ * For ERESTART under TXG_NOWAIT, wait before aborting; the
+ * error is still returned rather than retried here.
+ */
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zfs_range_unlock(rl);
+ return (error);
+ }
+
+ if (new_blksz)
+ zfs_grow_blocksize(zp, new_blksz, tx);
+
+ /* New size: "end" when extending, "off" (== end) when truncating. */
+ if (end > size || len == 0)
+ zp->z_phys->zp_size = end;
+
+ if (off < size) {
+ objset_t *os = zfsvfs->z_os;
+ uint64_t rlen = len;
+
+ /* -1 asks for everything up to the end of the object. */
+ if (len == 0)
+ rlen = -1;
+ else if (end > size)
+ rlen = size - off;
+ VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
+ }
+
+ if (log) {
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+ }
+
+ zfs_range_unlock(rl);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ rw_enter(&zp->z_map_lock, RW_WRITER);
+ if (off < size && vn_has_cached_data(vp)) {
+ page_t *pp;
+ uint64_t start = off & PAGEMASK;
+ int poff = off & PAGEOFFSET;
+
+ if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
+ /*
+ * We need to zero a partial page.
+ */
+ pagezero(pp, poff, PAGESIZE - poff);
+ start += PAGESIZE;
+ page_unlock(pp);
+ }
+ /* zfs_no_putpage: the invalidated pages are not written. */
+ error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
+ B_INVAL | B_TRUNC, NULL);
+ ASSERT(error == 0);
+ }
+ rw_exit(&zp->z_map_lock);
+
+ return (0);
+}
+
+/*
+ * Lay down the initial objects of a new ZPL file system in an objset:
+ * the master node (with the supplied properties), the unlinked set
+ * ("delete queue"), and the root directory.
+ *
+ * IN: os - objset to populate
+ * cr - credentials supplying the root directory's owner
+ * zplprops - uint64 properties to store in the master node; must
+ * include the ZPL version (asserted)
+ * tx - assigned transaction in which all objects are created
+ *
+ * A temporary zfsvfs on the stack and a temporary root znode are used only
+ * so that zfs_mknode() can be called; neither outlives this function.
+ */
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ zfsvfs_t zfsvfs;
+ uint64_t moid, doid;
+ uint64_t version = 0;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int error;
+ znode_t *rootzp = NULL;
+ vnode_t *vp;
+ vattr_t vattr;
+ znode_t *zp;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ /* The version is stored under ZPL_VERSION_STR, not its name. */
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ version = val;
+ error = zap_update(os, moid, ZPL_VERSION_STR,
+ 8, 1, &version, tx);
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ /* Remember the values needed to set up z_norm below. */
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+
+ /*
+ * Create a delete queue.
+ */
+ doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ rootzp->z_zfsvfs = &zfsvfs;
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+
+ vp = ZTOV(rootzp);
+ vn_reinit(vp);
+ vp->v_type = VDIR;
+
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
+ zfsvfs.z_os = os;
+ zfsvfs.z_assign = TXG_NOWAIT;
+ zfsvfs.z_parent = &zfsvfs;
+ zfsvfs.z_version = version;
+ zfsvfs.z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs.z_norm = norm;
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
+
+ /*
+ * NOTE(review): z_znodes_lock and z_all_znodes are initialized on
+ * the stack zfsvfs here, but no matching mutex_destroy() or
+ * list_destroy() is visible in this function - confirm that this
+ * is intentional.
+ */
+ mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ /* With IS_ROOT_NODE, zfs_mknode() fills in and returns rootzp. */
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Put the temporary root znode back into the state the cache
+ * destructor asserts (no holds, no dbuf) before freeing it.
+ */
+ ZTOV(rootzp)->v_count = 0;
+ dmu_buf_rele(rootzp->z_dbuf, NULL);
+ rootzp->z_dbuf = NULL;
+ kmem_cache_free(znode_cache, rootzp);
+}
+
+#endif /* _KERNEL */
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ *
+ * Returns 0 on success, EINVAL if the object's bonus buffer does not
+ * hold a znode, or the error from dmu_bonus_hold().  The outputs are
+ * only written on success.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+{
+	dmu_object_info_t info;
+	dmu_buf_t *bonus;
+	znode_phys_t *pzp;
+	int error;
+
+	error = dmu_bonus_hold(osp, obj, FTAG, &bonus);
+	if (error != 0)
+		return (error);
+
+	dmu_object_info_from_db(bonus, &info);
+	if (info.doi_bonus_type == DMU_OT_ZNODE &&
+	    info.doi_bonus_size >= sizeof (znode_phys_t)) {
+		pzp = bonus->db_data;
+		*pobjp = pzp->zp_parent;
+		*is_xattrdir = ((pzp->zp_flags & ZFS_XATTR) != 0) &&
+		    S_ISDIR(pzp->zp_mode);
+	} else {
+		error = EINVAL;
+	}
+
+	dmu_buf_rele(bonus, FTAG);
+	return (error);
+}
+
+/*
+ * Translate an object number into a path name within its objset.
+ *
+ * The path is assembled right to left at the end of buf by following
+ * parent pointers from obj up to the object that is its own parent,
+ * and is then moved to the start of buf.  Extended attribute
+ * directories show up as the component "<xattrdir>".
+ *
+ *	IN:	osp	- objset containing the object
+ *		obj	- object number to translate
+ *		len	- size of buf in bytes
+ *
+ *	OUT:	buf	- NUL-terminated path on success; contents are
+ *			  unspecified on failure
+ *
+ *	RETURN:	0 if success
+ *		EINVAL if len leaves no room for even the terminator
+ *		ENAMETOOLONG if the path does not fit in buf
+ *		error from the parent or name lookup otherwise
+ */
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+	char *path;
+	int error;
+
+	if (len < 1)
+		return (EINVAL);
+
+	path = buf + len - 1;
+	*path = '\0';
+
+	for (;;) {
+		uint64_t pobj;
+		char component[MAXNAMELEN + 2];
+		size_t complen;
+		int is_xattrdir;
+
+		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
+		    &is_xattrdir)) != 0)
+			break;
+
+		/* An object that is its own parent is the root: done. */
+		if (pobj == obj) {
+			if (path[0] != '/') {
+				if (path == buf) {
+					error = ENAMETOOLONG;
+					break;
+				}
+				*--path = '/';
+			}
+			break;
+		}
+
+		component[0] = '/';
+		if (is_xattrdir) {
+			(void) sprintf(component + 1, "<xattrdir>");
+		} else {
+			error = zap_value_search(osp, pobj, obj,
+			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
+			if (error != 0)
+				break;
+		}
+
+		/*
+		 * Check for room before moving the pointer, so that a path
+		 * longer than the buffer fails cleanly instead of writing
+		 * in front of buf.
+		 */
+		complen = strlen(component);
+		if (complen > (size_t)(path - buf)) {
+			error = ENAMETOOLONG;
+			break;
+		}
+		path -= complen;
+		bcopy(component, path, complen);
+		obj = pobj;
+	}
+
+	if (error == 0)
+		(void) memmove(buf, path, buf + len - path);
+	return (error);
+}
diff --git a/zfs/lib/libzpool/zil.c b/zfs/lib/libzpool/zil.c
new file mode 100644
index 000000000..4f9325dbb
--- /dev/null
+++ b/zfs/lib/libzpool/zil.c
@@ -0,0 +1,1618 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zil.c 1.34 08/02/22 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * The zfs intent log (ZIL) saves transaction records of system calls
+ * that change the file system in memory with enough information
+ * to be able to replay them. These are stored in memory until
+ * either the DMU transaction group (txg) commits them to the stable pool
+ * and they can be discarded, or they are flushed to the stable log
+ * (also in the pool) due to a fsync, O_DSYNC or other synchronous
+ * requirement. In the event of a panic or power failure, those log
+ * records (transactions) are replayed.
+ *
+ * There is one ZIL per file system. Its on-disk (pool) format consists
+ * of 3 parts:
+ *
+ * - ZIL header
+ * - ZIL blocks
+ * - ZIL records
+ *
+ * A log record holds a system call transaction. Log blocks can
+ * hold many log records and the blocks are chained together.
+ * Each ZIL block contains a block pointer (blkptr_t) to the next
+ * ZIL block in the chain. The ZIL header points to the first
+ * block in the chain. Note there is not a fixed place in the pool
+ * to hold blocks. They are dynamically allocated and freed as
+ * needed from the blocks available. Figure X shows the ZIL structure:
+ */
+
+/*
+ * This global ZIL switch affects all pools
+ */
+int zil_disable = 0; /* disable intent logging */
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting
+ * zfs_nocacheflush will cause corruption on power loss if a volatile
+ * out-of-order write cache is enabled.
+ *
+ * When set, zil_add_block() records no vdevs, so zil_flush_vdevs()
+ * finds an empty tree and issues no cache flushes.
+ */
+boolean_t zfs_nocacheflush = B_FALSE;
+
+/*
+ * kmem cache for log write buffers (struct lwb); created by zil_init()
+ * and destroyed by zil_fini().
+ */
+static kmem_cache_t *zil_lwb_cache;
+
+/*
+ * AVL comparator for dva_t keys: order by vdev first, then by offset.
+ * Returns -1, 0 or 1.
+ */
+static int
+zil_dva_compare(const void *x1, const void *x2)
+{
+	const dva_t *lhs = x1;
+	const dva_t *rhs = x2;
+
+	if (DVA_GET_VDEV(lhs) != DVA_GET_VDEV(rhs))
+		return (DVA_GET_VDEV(lhs) < DVA_GET_VDEV(rhs) ? -1 : 1);
+
+	if (DVA_GET_OFFSET(lhs) != DVA_GET_OFFSET(rhs))
+		return (DVA_GET_OFFSET(lhs) < DVA_GET_OFFSET(rhs) ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * Initialize 't' as an AVL tree of zil_dva_node_t ordered by
+ * zil_dva_compare().
+ */
+static void
+zil_dva_tree_init(avl_tree_t *t)
+{
+	const size_t node_size = sizeof (zil_dva_node_t);
+	const size_t link_offset = offsetof(zil_dva_node_t, zn_node);
+
+	avl_create(t, zil_dva_compare, node_size, link_offset);
+}
+
+/*
+ * Free every node in the DVA tree 't' and then destroy the tree itself.
+ */
+static void
+zil_dva_tree_fini(avl_tree_t *t)
+{
+	void *cookie = NULL;
+	zil_dva_node_t *node;
+
+	for (node = avl_destroy_nodes(t, &cookie); node != NULL;
+	    node = avl_destroy_nodes(t, &cookie)) {
+		kmem_free(node, sizeof (zil_dva_node_t));
+	}
+
+	avl_destroy(t);
+}
+
+/*
+ * Record 'dva' in the tree 't'.  Returns 0 if it was inserted, or EEXIST
+ * (leaving the tree untouched) if an equal DVA is already present.
+ */
+static int
+zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+{
+	avl_index_t where;
+	zil_dva_node_t *node;
+
+	if (avl_find(t, dva, &where) == NULL) {
+		node = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+		node->zn_dva = *dva;
+		avl_insert(t, node, where);
+		return (0);
+	}
+
+	return (EEXIST);
+}
+
+/*
+ * Return zilog's header as a writable pointer.  As the name says, this
+ * is meant for callers running in syncing context.
+ */
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+	zil_header_t *zh = (zil_header_t *)zilog->zl_header;
+
+	return (zh);
+}
+
+/*
+ * Seed the checksum words of 'bp' for the first block of a new log
+ * chain: two random GUID words, the objset id of zilog's objset, and a
+ * starting sequence number of 1.
+ */
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+	zio_cksum_t *cksum = &bp->blk_cksum;
+
+	cksum->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+	cksum->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+	cksum->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+	cksum->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
+/*
+ * Read a log block, make sure it's valid, and byteswap it if necessary.
+ *
+ * On success returns 0 with the buffer in *abufpp. *abufpp is cleared
+ * before the read; if the block fails validation the buffer reference
+ * is dropped and *abufpp is cleared again. Returns the arc_read() error
+ * if the read fails, otherwise one of:
+ * ESTALE - the trailer's next-block checksum is not bp's checksum
+ * with the sequence word incremented by one
+ * ENOENT - the trailer's next-block pointer is a hole
+ * EOVERFLOW - zit_nused exceeds the block size less the trailer
+ */
+static int
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+{
+ blkptr_t blk = *bp;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_WAIT;
+ int error;
+
+ /* Bookmark the block by the objset and sequence words of its checksum. */
+ zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ *abufpp = NULL;
+
+ error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+ arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+
+ if (error == 0) {
+ char *data = (*abufpp)->b_data;
+ uint64_t blksz = BP_GET_LSIZE(bp);
+ /* The trailer occupies the last sizeof (zil_trailer_t) bytes. */
+ zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+ zio_cksum_t cksum = bp->blk_cksum;
+
+ /*
+ * Sequence numbers should be... sequential. The checksum
+ * verifier for the next block should be bp's checksum plus 1.
+ */
+ cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
+ error = ESTALE;
+ else if (BP_IS_HOLE(&ztp->zit_next_blk))
+ error = ENOENT;
+ else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
+ error = EOVERFLOW;
+
+ if (error) {
+ VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
+ *abufpp = NULL;
+ }
+ }
+
+ dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+
+ return (error);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ * Return the highest sequence number.
+ *
+ * Either callback may be NULL. parse_blk_func is called for every block
+ * pointer processed, including one whose block then turns out to have
+ * failed to read or validate; parse_lr_func is called only for records
+ * in blocks that read successfully. 'arg' and 'txg' are passed through
+ * to both callbacks unchanged.
+ *
+ * The return value is the sequence number of the last block pointer
+ * processed (0 if the header's log pointer is a hole). It is recorded
+ * before that block is read, so it can name a block that failed.
+ *
+ * zl_dva_tree is created before the walk and destroyed after it.
+ */
+uint64_t
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t claim_seq = zh->zh_claim_seq;
+ uint64_t seq = 0;
+ uint64_t max_seq = 0;
+ blkptr_t blk = zh->zh_log;
+ arc_buf_t *abuf;
+ char *lrbuf, *lrp;
+ zil_trailer_t *ztp;
+ int reclen, error;
+
+ if (BP_IS_HOLE(&blk))
+ return (max_seq);
+
+ /*
+ * Starting at the block pointed to by zh_log we read the log chain.
+ * For each block in the chain we strongly check that block to
+ * ensure its validity. We stop when an invalid block is found.
+ * For each block pointer in the chain we call parse_blk_func().
+ * For each record in each valid block we call parse_lr_func().
+ * If the log has been claimed, stop if we encounter a sequence
+ * number greater than the highest claimed sequence number.
+ */
+ zil_dva_tree_init(&zilog->zl_dva_tree);
+ for (;;) {
+ seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ if (claim_seq != 0 && seq > claim_seq)
+ break;
+
+ ASSERT(max_seq < seq);
+ max_seq = seq;
+
+ error = zil_read_log_block(zilog, &blk, &abuf);
+
+ /* The block callback runs before the read result is checked. */
+ if (parse_blk_func != NULL)
+ parse_blk_func(zilog, &blk, arg, txg);
+
+ if (error)
+ break;
+
+ lrbuf = abuf->b_data;
+ ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
+ /* Advance to the next block now; abuf is released below. */
+ blk = ztp->zit_next_blk;
+
+ if (parse_lr_func == NULL) {
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ continue;
+ }
+
+ /*
+ * NOTE(review): the step size comes from the record itself and
+ * is only checked by the ASSERT3U below; presumably a corrupt
+ * lrc_reclen of 0 would not advance on builds where ASSERT is
+ * compiled out - confirm.
+ */
+ for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+ lr_t *lr = (lr_t *)lrp;
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ parse_lr_func(zilog, lr, arg, txg);
+ }
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+ zil_dva_tree_fini(&zilog->zl_dva_tree);
+
+ return (max_seq);
+}
+
+/*
+ * zil_parse() block callback used by zil_claim(): claim 'bp' in
+ * first_txg unless it was born before first_txg or its DVA has already
+ * been recorded in zl_dva_tree during this parse.  'tx' is unused.
+ */
+/* ARGSUSED */
+static void
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+	spa_t *spa = zilog->zl_spa;
+	int err;
+
+	/* Born before first_txg: nothing to claim. */
+	if (bp->blk_birth < first_txg)
+		return;
+
+	/* Already seen during this parse. */
+	if (zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) != 0)
+		return;
+
+	err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
+	ASSERT(err == 0);
+}
+
+/*
+ * zil_parse() record callback used by zil_claim(): for TX_WRITE records,
+ * claim the block pointer embedded in the record.  All other record
+ * types are ignored.
+ */
+static void
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+	lr_write_t *lrw;
+
+	if (lrc->lrc_txtype != TX_WRITE)
+		return;
+
+	lrw = (lr_write_t *)lrc;
+	zil_claim_log_block(zilog, &lrw->lr_blkptr, tx, first_txg);
+}
+
+/*
+ * zil_parse() block callback used when destroying a log: free 'bp' in
+ * the txg of 'tx' (a dmu_tx_t passed as void *).  'claim_txg' is unused.
+ */
+/* ARGSUSED */
+static void
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
+{
+	spa_t *spa = zilog->zl_spa;
+	uint64_t free_txg = dmu_tx_get_txg(tx);
+
+	zio_free_blk(spa, bp, free_txg);
+}
+
+/*
+ * zil_parse() record callback used when destroying a log.  Only
+ * TX_WRITE records in a log that has been claimed (claim_txg != 0) are
+ * of interest: their embedded block is freed if it was born at or after
+ * claim_txg and its DVA has not already been recorded in zl_dva_tree
+ * during this parse.
+ */
+static void
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+	blkptr_t *bp;
+
+	/*
+	 * If we previously claimed it, we need to free it.
+	 */
+	if (claim_txg == 0 || lrc->lrc_txtype != TX_WRITE)
+		return;
+
+	bp = &((lr_write_t *)lrc)->lr_blkptr;
+	if (bp->blk_birth < claim_txg)
+		return;
+
+	if (zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) != 0)
+		return;
+
+	(void) arc_free(NULL, zilog->zl_spa, dmu_tx_get_txg(tx), bp,
+	    NULL, NULL, ARC_WAIT);
+}
+
+/*
+ * Create an on-disk intent log.
+ *
+ * Takes zl_lock itself to append the new lwb, so the caller must not
+ * hold it. If the header has no log block yet, one of ZIL_MIN_BLKSZ is
+ * allocated in a new tx and this function waits for that txg to sync
+ * before returning. If that allocation fails, no lwb is added to
+ * zl_lwb_list and no error is returned to the caller.
+ */
+static void
+zil_create(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+ blkptr_t blk;
+ int error = 0;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(zh->zh_replay_seq == 0);
+
+ blk = zh->zh_log;
+
+ /*
+ * If we don't already have an initial log block, allocate one now.
+ */
+ if (BP_IS_HOLE(&blk)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
+ NULL, txg);
+
+ if (error == 0)
+ zil_init_log_chain(zilog, &blk);
+ }
+
+ /*
+ * Allocate a log write buffer (lwb) for the first log block.
+ */
+ if (error == 0) {
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = blk;
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
+ lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+ /* txg is 0 here when an existing header block is being reused. */
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_zio = NULL;
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+ }
+
+ /*
+ * If we just allocated the first log block, commit our transaction
+ * and wait for zil_sync() to stuff the block pointer into zh_log.
+ * (zh is part of the MOS, so we cannot modify it in open context.)
+ */
+ if (tx != NULL) {
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+}
+
+/*
+ * In one tx, free all log blocks and clear the log header.
+ * If keep_first is set, then we're replaying a log with no content.
+ * We want to keep the first block, however, so that the first
+ * synchronous transaction doesn't require a txg_wait_synced()
+ * in zil_create(). We don't need to txg_wait_synced() here either
+ * when keep_first is set, because both zil_create() and zil_destroy()
+ * will wait for any in-progress destroys to complete.
+ *
+ * This function only frees blocks and records the destroy txg and
+ * keep_first in the zilog; the header itself is cleared by zil_sync()
+ * when it runs for zl_destroy_txg. Returns without doing anything if
+ * the header's log pointer is already a hole.
+ */
+void
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ dmu_tx_t *tx;
+ uint64_t txg;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&zilog->zl_lock);
+
+ /*
+ * It is possible for the ZIL to get the previously mounted zilog
+ * structure of the same dataset if quickly remounted and the dbuf
+ * eviction has not completed. In this case we can see a non
+ * empty lwb list and keep_first will be set. We fix this by
+ * clearing the keep_first. This will be slower but it's very rare.
+ */
+ if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
+ keep_first = B_FALSE;
+
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
+
+ if (!list_is_empty(&zilog->zl_lwb_list)) {
+ /* In-memory lwbs exist: free their buffers and blocks directly. */
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(!keep_first);
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+ kmem_cache_free(zil_lwb_cache, lwb);
+ }
+ } else {
+ /* No lwbs: walk the on-disk chain, unless keeping the first block. */
+ if (!keep_first) {
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zh->zh_claim_txg);
+ }
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * zil_rollback_destroy() is only called by the rollback code.
+ * We already have a syncing tx. Rollback has exclusive access to the
+ * dataset, so we don't have to worry about concurrent zil access.
+ * The actual freeing of any log blocks occurs in zil_sync() later in
+ * this txg syncing phase.
+ *
+ * Unlike zil_destroy(), this takes no zl_lock and creates no tx of its
+ * own; it records the txg of 'tx' as zl_destroy_txg and clears
+ * zl_keep_first. Does nothing if the header's log pointer is a hole.
+ */
+void
+zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t txg;
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ txg = dmu_tx_get_txg(tx);
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = B_FALSE;
+
+ /*
+ * Ensure there's no outstanding ZIL IO. No lwbs or just the
+ * unused one that was allocated in advance is ok.
+ */
+ /* list_next == list_prev holds only for zero or one list entries. */
+ ASSERT(zilog->zl_lwb_list.list_head.list_next ==
+ zilog->zl_lwb_list.list_head.list_prev);
+ (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record,
+ tx, zh->zh_claim_txg);
+}
+
+/*
+ * Claim the intent log blocks of the objset named 'osname' in the txg of
+ * 'txarg' (a dmu_tx_t passed as void *).
+ *
+ * Always returns 0: if the objset cannot be opened a warning is logged
+ * and the failure is not propagated to the caller.
+ */
+int
+zil_claim(char *osname, void *txarg)
+{
+ dmu_tx_t *tx = txarg;
+ uint64_t first_txg = dmu_tx_get_txg(tx);
+ zilog_t *zilog;
+ zil_header_t *zh;
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (error) {
+ cmn_err(CE_WARN, "can't process intent log for %s", osname);
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+
+ /*
+ * Claim all log blocks if we haven't already done so, and remember
+ * the highest claimed sequence number. This ensures that if we can
+ * read only part of the log now (e.g. due to a missing device),
+ * but we can read the entire log later, we will not try to replay
+ * or destroy beyond the last block we successfully claimed.
+ */
+ ASSERT3U(zh->zh_claim_txg, <=, first_txg);
+ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ /* zh_claim_seq is still unset while zil_parse() runs here. */
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg);
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+ dmu_objset_close(os);
+ return (0);
+}
+
+/*
+ * AVL comparator for zil_vdev_node_t: order nodes by their zv_vdev id.
+ * Returns -1, 0 or 1.
+ */
+static int
+zil_vdev_compare(const void *x1, const void *x2)
+{
+	const zil_vdev_node_t *lhs = x1;
+	const zil_vdev_node_t *rhs = x2;
+	uint64_t lvdev = lhs->zv_vdev;
+	uint64_t rvdev = rhs->zv_vdev;
+
+	if (lvdev != rvdev)
+		return (lvdev < rvdev ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * Remember the vdev of every DVA in 'bp' in zl_vdev_tree so that
+ * zil_flush_vdevs() can later flush those devices.  Each vdev is
+ * recorded at most once.  Does nothing when zfs_nocacheflush is set.
+ */
+void
+zil_add_block(zilog_t *zilog, blkptr_t *bp)
+{
+	avl_tree_t *tree = &zilog->zl_vdev_tree;
+	int ndvas = BP_GET_NDVAS(bp);
+	int d;
+
+	if (zfs_nocacheflush)
+		return;
+
+	ASSERT(zilog->zl_writer);
+
+	/*
+	 * Even though we're zl_writer, we still need a lock because the
+	 * zl_get_data() callbacks may have dmu_sync() done callbacks
+	 * that will run concurrently.
+	 */
+	mutex_enter(&zilog->zl_vdev_lock);
+	for (d = 0; d < ndvas; d++) {
+		zil_vdev_node_t key;
+		zil_vdev_node_t *node;
+		avl_index_t where;
+
+		key.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
+		if (avl_find(tree, &key, &where) != NULL)
+			continue;	/* already recorded */
+
+		node = kmem_alloc(sizeof (*node), KM_SLEEP);
+		node->zv_vdev = key.zv_vdev;
+		avl_insert(tree, node, where);
+	}
+	mutex_exit(&zilog->zl_vdev_lock);
+}
+
+/*
+ * Issue a flush to every top-level vdev recorded in zl_vdev_tree by
+ * zil_add_block() and wait for the flushes to finish. The tree is
+ * emptied in the process. Flush failures are ignored.
+ */
+void
+zil_flush_vdevs(zilog_t *zilog)
+{
+ spa_t *spa = zilog->zl_spa;
+ avl_tree_t *t = &zilog->zl_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+ zio_t *zio;
+
+ ASSERT(zilog->zl_writer);
+
+ /*
+ * We don't need zl_vdev_lock here because we're the zl_writer,
+ * and all zl_get_data() callbacks are done.
+ */
+ if (avl_numnodes(t) == 0)
+ return;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ /* Parent zio under which all the per-vdev flushes are issued. */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
+ vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
+ /* A vdev that can no longer be looked up is simply skipped. */
+ if (vd != NULL)
+ zio_flush(zio, vd);
+ kmem_free(zv, sizeof (*zv));
+ }
+
+ /*
+ * Wait for all the flushes to complete. Not all devices actually
+ * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
+ */
+ (void) zio_wait(zio);
+
+ spa_config_exit(spa, FTAG);
+}
+
+/*
+ * Function called when a log block write completes
+ *
+ * zio->io_private is the lwb that was written. Releases the txg hold
+ * taken in zil_lwb_write_start(), frees the lwb's data buffer, and
+ * records any write error in zl_log_error.
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ zilog_t *zilog = lwb->lwb_zilog;
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ txg_rele_to_sync(&lwb->lwb_txgh);
+
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ mutex_enter(&zilog->zl_lock);
+ /* A NULL lwb_buf is what lets zil_sync() retire this lwb. */
+ lwb->lwb_buf = NULL;
+ if (zio->io_error)
+ zilog->zl_log_error = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Initialize the io for a log block.
+ *
+ * Note, we should not initialize the IO until we are about
+ * to use it, since zio_rewrite() does a spa_config_enter().
+ *
+ * Creates zl_root_zio and lwb->lwb_zio only if they are still NULL, so
+ * calling this repeatedly for the same lwb is harmless.
+ */
+static void
+zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_t zb;
+
+ /* Bookmark the block by the objset and sequence words of its checksum. */
+ zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ if (zilog->zl_root_zio == NULL) {
+ zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
+ if (lwb->lwb_zio == NULL) {
+ /* zil_lwb_write_done() runs with this lwb when the write completes. */
+ lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
+ ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+ lwb->lwb_sz, zil_lwb_write_done, lwb,
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb);
+ }
+}
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ *
+ * Allocates the next block, stores its pointer in lwb's trailer, issues
+ * the write of lwb, and returns a fresh lwb for the new block (already
+ * appended to zl_lwb_list). If the next block cannot be allocated, lwb
+ * is still written (with a zeroed next-block pointer) and NULL is
+ * returned. lwb->lwb_zio must already have been set up by
+ * zil_lwb_write_init().
+ */
+static lwb_t *
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *nlwb;
+ zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+ spa_t *spa = zilog->zl_spa;
+ blkptr_t *bp = &ztp->zit_next_blk;
+ uint64_t txg;
+ uint64_t zil_blksz;
+ int error;
+
+ ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+
+ /*
+ * Allocate the next block and save its address in this block
+ * before writing it in order to establish the log chain.
+ * Note that if the allocation of nlwb synced before we wrote
+ * the block that points at it (lwb), we'd leak it if we crashed.
+ * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+ */
+ txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
+ txg_rele_to_quiesce(&lwb->lwb_txgh);
+
+ /*
+ * Pick a ZIL blocksize. We request a size that is the
+ * maximum of the previous used size, the current used size and
+ * the amount waiting in the queue.
+ */
+ zil_blksz = MAX(zilog->zl_prev_used,
+ zilog->zl_cur_used + sizeof (*ztp));
+ zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
+ /* Round up to a multiple of ZIL_MIN_BLKSZ, capped at ZIL_MAX_BLKSZ. */
+ zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
+ if (zil_blksz > ZIL_MAX_BLKSZ)
+ zil_blksz = ZIL_MAX_BLKSZ;
+
+ BP_ZERO(bp);
+ /* pass the old blkptr in order to spread log blocks across devs */
+ error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
+ if (error) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
+
+ /*
+ * We dirty the dataset to ensure that zil_sync() will
+ * be called to remove this lwb from our zl_lwb_list.
+ * Failing to do so, may leave an lwb with a NULL lwb_buf
+ * hanging around on the zl_lwb_list.
+ */
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ dmu_tx_commit(tx);
+
+ /*
+ * Since we've just experienced an allocation failure, we
+ * terminate the current lwb and send it on its way.
+ */
+ ztp->zit_pad = 0;
+ ztp->zit_nused = lwb->lwb_nused;
+ ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+ zio_nowait(lwb->lwb_zio);
+
+ /*
+ * By returning NULL the caller will call txg_wait_synced()
+ */
+ return (NULL);
+ }
+
+ ASSERT3U(bp->blk_birth, ==, txg);
+ ztp->zit_pad = 0;
+ ztp->zit_nused = lwb->lwb_nused;
+ ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+ /*
+ * The next block inherits this block's checksum words with the
+ * sequence word incremented; zil_read_log_block() verifies this.
+ */
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ /*
+ * Allocate a new log write buffer (lwb).
+ */
+ nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+
+ nlwb->lwb_zilog = zilog;
+ nlwb->lwb_blk = *bp;
+ nlwb->lwb_nused = 0;
+ nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
+ nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
+ nlwb->lwb_max_txg = txg;
+ nlwb->lwb_zio = NULL;
+
+ /*
+ * Put new lwb at the end of the log chain
+ */
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, nlwb);
+ mutex_exit(&zilog->zl_lock);
+
+ /* Record the block for later vdev flushing */
+ zil_add_block(zilog, &lwb->lwb_blk);
+
+ /*
+ * kick off the write for the old log block
+ */
+ dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+ ASSERT(lwb->lwb_zio);
+ zio_nowait(lwb->lwb_zio);
+
+ return (nlwb);
+}
+
+/*
+ * Copy the log record of 'itx' into the log write buffer 'lwb', starting
+ * a new log block first if the record does not fit.
+ *
+ * Returns the lwb to use for the next record. Returns NULL if 'lwb' was
+ * NULL on entry or if zil_lwb_write_start() could not supply a new
+ * block. The record is not added to the buffer (lwb_nused is left
+ * unchanged) when it is too large even for an empty block, in which
+ * case its txg is waited for instead, or when zl_get_data() fails.
+ */
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+ lr_t *lrc = &itx->itx_lr; /* common log record */
+ lr_write_t *lr = (lr_write_t *)lrc;
+ uint64_t txg = lrc->lrc_txg;
+ uint64_t reclen = lrc->lrc_reclen;
+ uint64_t dlen;
+
+ if (lwb == NULL)
+ return (NULL);
+ ASSERT(lwb->lwb_buf != NULL);
+
+ /* dlen is the 8-byte-rounded data that will be copied after the record. */
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+ dlen = P2ROUNDUP_TYPED(
+ lr->lr_length, sizeof (uint64_t), uint64_t);
+ else
+ dlen = 0;
+
+ zilog->zl_cur_used += (reclen + dlen);
+
+ zil_lwb_write_init(zilog, lwb);
+
+ /*
+ * If this record won't fit in the current log block, start a new one.
+ */
+ if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ lwb = zil_lwb_write_start(zilog, lwb);
+ if (lwb == NULL)
+ return (NULL);
+ zil_lwb_write_init(zilog, lwb);
+ ASSERT(lwb->lwb_nused == 0);
+ /* Too big even for an empty block: wait for the txg to sync instead. */
+ if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
+ }
+
+ /*
+ * Update the lrc_seq, to be log record sequence number. See zil.h
+ * Then copy the record to the log buffer.
+ */
+ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+ bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state != WR_COPIED) {
+ char *dbuf;
+ int error;
+
+ /* alignment is guaranteed */
+ /* From here on lr refers to the copy inside the log buffer. */
+ lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
+ if (dlen) {
+ ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+ dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
+ lr->lr_common.lrc_reclen += dlen;
+ } else {
+ ASSERT(itx->itx_wr_state == WR_INDIRECT);
+ dbuf = NULL;
+ }
+ error = zilog->zl_get_data(
+ itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ if (error) {
+ /* lwb_nused is not advanced, so the record is dropped. */
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
+ }
+
+ lwb->lwb_nused += reclen + dlen;
+ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+ ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+ ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
+
+ return (lwb);
+}
+
+/*
+ * Allocate an in-memory intent log transaction whose log record is
+ * 'lrsize' bytes, rounded up to a multiple of sizeof (uint64_t).
+ *
+ * Only lrc_txtype, lrc_reclen, lrc_seq (0) and itx_sod are initialized;
+ * the memory comes from kmem_alloc() and no other field is set here.
+ */
+itx_t *
+zil_itx_create(uint64_t txtype, size_t lrsize)
+{
+	size_t reclen = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+	itx_t *itx = kmem_alloc(offsetof(itx_t, itx_lr) + reclen, KM_SLEEP);
+
+	itx->itx_sod = reclen; /* if write & WR_NEED_COPY will be increased */
+	itx->itx_lr.lrc_seq = 0; /* defensive */
+	itx->itx_lr.lrc_reclen = reclen;
+	itx->itx_lr.lrc_txtype = txtype;
+
+	return (itx);
+}
+
+/*
+ * Queue 'itx' on the tail of zilog's in-memory itx list under zl_lock,
+ * stamping it with the txg of 'tx' and the next itx sequence number.
+ * Returns the sequence number assigned.
+ */
+uint64_t
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+	uint64_t assigned;
+
+	ASSERT(itx->itx_lr.lrc_seq == 0);
+
+	mutex_enter(&zilog->zl_lock);
+
+	assigned = ++zilog->zl_itx_seq;
+	itx->itx_lr.lrc_seq = assigned;
+	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+
+	zilog->zl_itx_list_sz += itx->itx_sod;
+	list_insert_tail(&zilog->zl_itx_list, itx);
+
+	mutex_exit(&zilog->zl_lock);
+
+	return (assigned);
+}
+
+/*
+ * Free up all in-memory intent log transactions that have now been synced.
+ *
+ * Only the leading run of the itx list is examined: the scan stops at
+ * the first itx whose txg is newer than MIN(last synced txg, freeze txg).
+ */
+static void
+zil_itx_clean(zilog_t *zilog)
+{
+ uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
+ uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+ list_t clean_list;
+ itx_t *itx;
+
+ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
+
+ mutex_enter(&zilog->zl_lock);
+ /* wait for a log writer to finish walking list */
+ while (zilog->zl_writer) {
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ }
+
+ /*
+ * Move the sync'd log transactions to a separate list so we can call
+ * kmem_free without holding the zl_lock.
+ *
+ * There is no need to set zl_writer as we don't drop zl_lock here
+ */
+ while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
+ itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
+ list_remove(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz -= itx->itx_sod;
+ list_insert_tail(&clean_list, itx);
+ }
+ /* Pass the wakeup on to any other thread waiting on zl_cv_writer. */
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+
+ /* destroy sync'd log transactions */
+ while ((itx = list_head(&clean_list)) != NULL) {
+ list_remove(&clean_list, itx);
+ /* Size mirrors the kmem_alloc() in zil_itx_create(). */
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ }
+ list_destroy(&clean_list);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them.
+ *
+ * Only the head of the itx list is inspected. The dispatch uses
+ * TQ_NOSLEEP and its result is ignored, so the cleanup is best-effort.
+ */
+void
+zil_clean(zilog_t *zilog)
+{
+ itx_t *itx;
+
+ mutex_enter(&zilog->zl_lock);
+ itx = list_head(&zilog->zl_itx_list);
+ if ((itx != NULL) &&
+ (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
+ (void) taskq_dispatch(zilog->zl_clean_taskq,
+ (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Write the queued in-memory log transactions out to log blocks as the
+ * single active log writer.
+ *
+ * Locking: entered with zl_lock held (see zil_commit()) and returns with
+ * zl_lock held, including on the early return below. The lock is
+ * dropped around zil_create(), around each zil_lwb_commit(), and while
+ * the final block is written and waited for. zl_writer is set for the
+ * duration so other writers and zil_itx_clean() stay off the itx list.
+ *
+ * 'seq' bounds how far to push: an itx beyond it is still pushed while
+ * it fits in the partly filled current block. 'foid' selects which
+ * file's records are pushed (0 means all).
+ */
+void
+zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+{
+ uint64_t txg;
+ uint64_t commit_seq = 0;
+ /* (itx_t *)-1 is a sentinel meaning "no saved position yet". */
+ itx_t *itx, *itx_next = (itx_t *)-1;
+ lwb_t *lwb;
+ spa_t *spa;
+
+ zilog->zl_writer = B_TRUE;
+ zilog->zl_root_zio = NULL;
+ spa = zilog->zl_spa;
+
+ if (zilog->zl_suspend) {
+ lwb = NULL;
+ } else {
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ /*
+ * Return if there's nothing to flush before we
+ * dirty the fs by calling zil_create()
+ */
+ if (list_is_empty(&zilog->zl_itx_list)) {
+ zilog->zl_writer = B_FALSE;
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
+ zil_create(zilog);
+ mutex_enter(&zilog->zl_lock);
+ /* Still NULL here if zil_create() could not allocate a block. */
+ lwb = list_tail(&zilog->zl_lwb_list);
+ }
+ }
+
+ /* Loop through in-memory log transactions filling log blocks. */
+ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
+ for (;;) {
+ /*
+ * Find the next itx to push:
+ * Push all transactions related to specified foid and all
+ * other transactions except TX_WRITE, TX_TRUNCATE,
+ * TX_SETATTR and TX_ACL for all other files.
+ */
+ if (itx_next != (itx_t *)-1)
+ itx = itx_next;
+ else
+ itx = list_head(&zilog->zl_itx_list);
+ for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
+ if (foid == 0) /* push all foids? */
+ break;
+ if (itx->itx_sync) /* push all O_[D]SYNC */
+ break;
+ switch (itx->itx_lr.lrc_txtype) {
+ case TX_SETATTR:
+ case TX_WRITE:
+ case TX_TRUNCATE:
+ case TX_ACL:
+ /* lr_foid is same offset for these records */
+ if (((lr_write_t *)&itx->itx_lr)->lr_foid
+ != foid) {
+ continue; /* skip this record */
+ }
+ }
+ /* Any other record type, or a matching foid: push this itx. */
+ break;
+ }
+ if (itx == NULL)
+ break;
+
+ /*
+ * Stop once past the requested seq, unless the itx still fits
+ * in the partly filled current block.
+ */
+ if ((itx->itx_lr.lrc_seq > seq) &&
+ ((lwb == NULL) || (lwb->lwb_nused == 0) ||
+ (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
+ break;
+ }
+
+ /*
+ * Save the next pointer. Even though we soon drop
+ * zl_lock all threads that may change the list
+ * (another writer or zil_itx_clean) can't do so until
+ * they have zl_writer.
+ */
+ itx_next = list_next(&zilog->zl_itx_list, itx);
+ list_remove(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz -= itx->itx_sod;
+ mutex_exit(&zilog->zl_lock);
+ txg = itx->itx_lr.lrc_txg;
+ ASSERT(txg);
+
+ /* Log it only if its txg is not yet synced or is past the freeze txg. */
+ if (txg > spa_last_synced_txg(spa) ||
+ txg > spa_freeze_txg(spa))
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ mutex_enter(&zilog->zl_lock);
+ }
+ DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
+ /* determine commit sequence number */
+ itx = list_head(&zilog->zl_itx_list);
+ if (itx)
+ commit_seq = itx->itx_lr.lrc_seq;
+ else
+ commit_seq = zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ /* write the last block out */
+ if (lwb != NULL && lwb->lwb_zio != NULL)
+ lwb = zil_lwb_write_start(zilog, lwb);
+
+ zilog->zl_prev_used = zilog->zl_cur_used;
+ zilog->zl_cur_used = 0;
+
+ /*
+ * Wait if necessary for the log blocks to be on stable storage.
+ */
+ if (zilog->zl_root_zio) {
+ DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
+ (void) zio_wait(zilog->zl_root_zio);
+ DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
+ zil_flush_vdevs(zilog);
+ }
+
+ /*
+ * If a log write failed or there is no lwb (suspended log or
+ * allocation failure), wait for a txg sync instead.
+ */
+ if (zilog->zl_log_error || lwb == NULL) {
+ zilog->zl_log_error = 0;
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ }
+
+ /* Retake zl_lock; the caller expects it held on return. */
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_writer = B_FALSE;
+
+ ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
+ zilog->zl_commit_seq = commit_seq;
+}
+
+/*
+ * Push zfs transactions to stable storage up to the supplied sequence number.
+ * If foid is 0 push out all transactions, otherwise push only those
+ * for that file or might have been used to create that file.
+ *
+ * Does nothing if zilog is NULL or seq is 0. Only one thread writes at
+ * a time: while another writer is active this thread waits, and it
+ * returns without writing if that writer's commit already covers seq.
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+{
+ if (zilog == NULL || seq == 0)
+ return;
+
+ mutex_enter(&zilog->zl_lock);
+
+ seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
+
+ while (zilog->zl_writer) {
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ /* Another writer may already have committed past seq. */
+ if (seq < zilog->zl_commit_seq) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
+ /* zil_commit_writer() returns with zl_lock held again. */
+ zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
+ /* wake up others waiting on the commit */
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ *
+ * Three things happen under zl_lock: the header's replay sequence is
+ * updated for this txg; if this is the txg of a pending destroy, the
+ * header is zeroed (optionally keeping the first block under a new
+ * chain GUID); and lwbs at the head of zl_lwb_list that are fully
+ * written and not newer than this txg are freed, with zh_log advanced
+ * to the first lwb that remains.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = zilog->zl_spa;
+ lwb_t *lwb;
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT(zilog->zl_stop_sync == 0);
+
+ zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+
+ if (zilog->zl_destroy_txg == txg) {
+ /* Save the log pointer; the header is about to be zeroed. */
+ blkptr_t blk = zh->zh_log;
+
+ ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+ ASSERT(spa_sync_pass(spa) == 1);
+
+ bzero(zh, sizeof (zil_header_t));
+ bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+
+ if (zilog->zl_keep_first) {
+ /*
+ * If this block was part of log chain that couldn't
+ * be claimed because a device was missing during
+ * zil_claim(), but that device later returns,
+ * then this block could erroneously appear valid.
+ * To guard against this, assign a new GUID to the new
+ * log chain so it doesn't matter what blk points to.
+ */
+ zil_init_log_chain(zilog, &blk);
+ zh->zh_log = blk;
+ }
+ }
+
+ for (;;) {
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ zh->zh_log = lwb->lwb_blk;
+ /*
+ * Stop at the first lwb whose write has not completed (lwb_buf
+ * is cleared by zil_lwb_write_done()) or which holds records
+ * from a txg later than this one.
+ */
+ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ break;
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_free_blk(spa, &lwb->lwb_blk, txg);
+ kmem_cache_free(zil_lwb_cache, lwb);
+
+ /*
+ * If we don't have anything left in the lwb list then
+ * we've had an allocation failure and we need to zero
+ * out the zil_header blkptr so that we don't end
+ * up freeing the same block twice.
+ */
+ if (list_head(&zilog->zl_lwb_list) == NULL)
+ BP_ZERO(&zh->zh_log);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Module-wide ZIL setup: create the kmem cache from which log write
+ * buffers (struct lwb) are allocated. Paired with zil_fini().
+ */
+void
+zil_init(void)
+{
+ zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+ sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Counterpart of zil_init(): tear down the lwb kmem cache.
+ */
+void
+zil_fini(void)
+{
+	kmem_cache_destroy(zil_lwb_cache);
+}
+
+/*
+ * Allocate and initialize the in-core zilog for objset 'os'.
+ *
+ * The structure is zero-filled, pointed at the supplied header, and has
+ * its locks, condition variables, itx/lwb lists and vdev AVL tree set up.
+ * zl_destroy_txg starts out at TXG_INITIAL - 1.  The new zilog is returned.
+ */
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+	zilog_t *zl;
+
+	zl = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+	/* Where this log lives. */
+	zl->zl_os = os;
+	zl->zl_header = zh_phys;
+	zl->zl_spa = dmu_objset_spa(os);
+	zl->zl_dmu_pool = dmu_objset_pool(os);
+	zl->zl_destroy_txg = TXG_INITIAL - 1;
+
+	/* Locks and condition variables. */
+	mutex_init(&zl->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zl->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zl->zl_cv_writer, NULL, CV_DEFAULT, NULL);
+	cv_init(&zl->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+
+	/* Containers. */
+	list_create(&zl->zl_itx_list, sizeof (itx_t),
+	    offsetof(itx_t, itx_node));
+	list_create(&zl->zl_lwb_list, sizeof (lwb_t),
+	    offsetof(lwb_t, lwb_node));
+	avl_create(&zl->zl_vdev_tree, zil_vdev_compare,
+	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
+
+	return (zl);
+}
+
+/*
+ * Release a zilog created by zil_alloc().
+ *
+ * Sets zl_stop_sync, drains the lwb list (freeing any buffer still attached
+ * to an lwb), then destroys the containers, locks and condition variables
+ * before freeing the zilog itself.  The itx list is expected to be empty.
+ */
+void
+zil_free(zilog_t *zilog)
+{
+	lwb_t *cur;
+
+	zilog->zl_stop_sync = 1;
+
+	for (cur = list_head(&zilog->zl_lwb_list); cur != NULL;
+	    cur = list_head(&zilog->zl_lwb_list)) {
+		list_remove(&zilog->zl_lwb_list, cur);
+		if (cur->lwb_buf != NULL) {
+			zio_buf_free(cur->lwb_buf, cur->lwb_sz);
+		}
+		kmem_cache_free(zil_lwb_cache, cur);
+	}
+	list_destroy(&zilog->zl_lwb_list);
+
+	avl_destroy(&zilog->zl_vdev_tree);
+	mutex_destroy(&zilog->zl_vdev_lock);
+
+	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+	list_destroy(&zilog->zl_itx_list);
+	mutex_destroy(&zilog->zl_lock);
+
+	cv_destroy(&zilog->zl_cv_writer);
+	cv_destroy(&zilog->zl_cv_suspend);
+
+	kmem_free(zilog, sizeof (zilog_t));
+}
+
+/*
+ * Return 1 if the initial log block is not valid, i.e. the header's log
+ * pointer is a hole or the block it names cannot be read; otherwise drop
+ * the reference on the buffer that was read and return 0.
+ */
+static int
+zil_empty(zilog_t *zilog)
+{
+	const zil_header_t *hdr = zilog->zl_header;
+	arc_buf_t *abuf = NULL;
+
+	if (BP_IS_HOLE(&hdr->zh_log) ||
+	    zil_read_log_block(zilog, &hdr->zh_log, &abuf) != 0) {
+		return (1);
+	}
+
+	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+	return (0);
+}
+
+/*
+ * Open an intent log.
+ *
+ * Looks up the objset's zilog, creates its "zil_clean" taskq and records
+ * the caller's get_data callback.  Returns the zilog.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+	zilog_t *zl = dmu_objset_zil(os);
+
+	zl->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
+	    2, 2, TASKQ_PREPOPULATE);
+	zl->zl_get_data = get_data;
+
+	return (zl);
+}
+
+/*
+ * Close an intent log.
+ *
+ * Undoes zil_open(): the clean taskq is destroyed, the get_data callback
+ * is cleared and any remaining itxs are cleaned out.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+	if (!zil_is_committed(zilog)) {
+		/*
+		 * The log still has uncommitted state.  Dirty the dataset
+		 * in a fresh tx so that zil_sync() will run, then wait for
+		 * the txg that tx landed in to sync.
+		 */
+		dmu_tx_t *dirty_tx = dmu_tx_create(zilog->zl_os);
+		uint64_t dirty_txg;
+
+		(void) dmu_tx_assign(dirty_tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), dirty_tx);
+		dirty_txg = dmu_tx_get_txg(dirty_tx);
+		dmu_tx_commit(dirty_tx);
+
+		txg_wait_synced(zilog->zl_dmu_pool, dirty_txg);
+	}
+
+	taskq_destroy(zilog->zl_clean_taskq);
+	zilog->zl_clean_taskq = NULL;
+	zilog->zl_get_data = NULL;
+
+	zil_itx_clean(zilog);
+	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+}
+
+/*
+ * Suspend an intent log. While in suspended mode, we still honor
+ * synchronous semantics, but we rely on txg_wait_synced() to do it.
+ * We suspend the log briefly when taking a snapshot so that the snapshot
+ * contains all the data it's supposed to, and has an empty intent log.
+ *
+ * Returns EBUSY if the header's zh_claim_txg is non-zero (an unplayed
+ * log), otherwise 0.  zl_suspend is a nesting count: only the caller that
+ * raises it from zero performs the commit/destroy work; later callers just
+ * wait on zl_cv_suspend until zl_suspending is cleared.  Each successful
+ * call bumps zl_suspend, which zil_resume() decrements.
+ */
+int
+zil_suspend(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+
+ mutex_enter(&zilog->zl_lock);
+ if (zh->zh_claim_txg != 0) { /* unplayed log */
+ mutex_exit(&zilog->zl_lock);
+ return (EBUSY);
+ }
+ if (zilog->zl_suspend++ != 0) {
+ /*
+ * Someone else already began a suspend.
+ * Just wait for them to finish.
+ */
+ while (zilog->zl_suspending)
+ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+ mutex_exit(&zilog->zl_lock);
+ return (0);
+ }
+ /* We are the first suspender; do the work with zl_lock dropped. */
+ zilog->zl_suspending = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+
+ zil_commit(zilog, UINT64_MAX, 0);
+
+ /*
+ * Wait for any in-flight log writes to complete.
+ */
+ mutex_enter(&zilog->zl_lock);
+ while (zilog->zl_writer)
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ mutex_exit(&zilog->zl_lock);
+
+ zil_destroy(zilog, B_FALSE);
+
+ /* Let any callers waiting in the branch above proceed. */
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_suspending = B_FALSE;
+ cv_broadcast(&zilog->zl_cv_suspend);
+ mutex_exit(&zilog->zl_lock);
+
+ return (0);
+}
+
+/*
+ * Drop one suspend reference taken by zil_suspend().  The count is
+ * adjusted under zl_lock and must be non-zero on entry.
+ */
+void
+zil_resume(zilog_t *zilog)
+{
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(zilog->zl_suspend != 0);
+	--zilog->zl_suspend;
+	mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Per-replay context built by zil_replay() and handed to
+ * zil_replay_log_record() through zil_parse().
+ */
+typedef struct zil_replay_arg {
+ objset_t *zr_os; /* objset whose log is being replayed */
+ zil_replay_func_t **zr_replay; /* replay vectors, indexed by txtype */
+ void *zr_arg; /* opaque argument passed to each replay vector */
+ uint64_t *zr_txgp; /* set to the target txg around each replay call */
+ boolean_t zr_byteswap; /* log blocks need byteswapping */
+ char *zr_lrbuf; /* scratch copy of the record (plus write data) */
+} zil_replay_arg_t;
+
+/*
+ * Replay a single log record; passed to zil_parse() by zil_replay().
+ *
+ * The record is skipped once zl_stop_replay is set, when its lrc_txg
+ * precedes claim_txg, or when its lrc_seq is not beyond the header's
+ * zh_replay_seq.  Otherwise it is copied into zr_lrbuf, a TX_WRITE record
+ * that carries a block pointer has its data read in behind the copy, and
+ * the replay vector for its txtype is invoked inside a newly assigned tx.
+ * If replay ultimately fails with anything other than ERESTART, a warning
+ * is logged and zl_stop_replay is set so later records are ignored.
+ */
+static void
+zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
+{
+ zil_replay_arg_t *zr = zra;
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t reclen = lr->lrc_reclen;
+ uint64_t txtype = lr->lrc_txtype;
+ char *name;
+ int pass, error, sunk;
+
+ if (zilog->zl_stop_replay)
+ return;
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return;
+
+ /* Strip case-insensitive bit, still present in log record */
+ txtype &= ~TX_CI;
+
+ /*
+ * Make a copy of the data so we can revise and extend it.
+ */
+ bcopy(lr, zr->zr_lrbuf, reclen);
+
+ /*
+ * The log block containing this lr may have been byteswapped
+ * so that we can easily examine common fields like lrc_txtype.
+ * However, the log is a mix of different data types, and only the
+ * replay vectors know how to byteswap their records. Therefore, if
+ * the lr was byteswapped, undo it before invoking the replay vector.
+ */
+ if (zr->zr_byteswap)
+ byteswap_uint64_array(zr->zr_lrbuf, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ * The data is placed in zr_lrbuf immediately after the record.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ lr_write_t *lrw = (lr_write_t *)lr;
+ blkptr_t *wbp = &lrw->lr_blkptr;
+ uint64_t wlen = lrw->lr_length;
+ char *wbuf = zr->zr_lrbuf + reclen;
+
+ if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
+ bzero(wbuf, wlen);
+ } else {
+ /*
+ * A subsequent write may have overwritten this block,
+ * in which case wbp may have been freed and
+ * reallocated, and our read of wbp may fail with a
+ * checksum error. We can safely ignore this because
+ * the later write will provide the correct data.
+ */
+ zbookmark_t zb;
+
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lrw->lr_foid;
+ zb.zb_level = -1;
+ zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
+
+ (void) zio_wait(zio_read(NULL, zilog->zl_spa,
+ wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
+ /* Slide the wlen bytes of interest to the front of wbuf. */
+ (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
+ }
+ }
+
+ /*
+ * We must now do two things atomically: replay this log record,
+ * and update the log header to reflect the fact that we did so.
+ * We use the DMU's ability to assign into a specific txg to do this.
+ */
+ for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
+ uint64_t replay_txg;
+ dmu_tx_t *replay_tx;
+
+ replay_tx = dmu_tx_create(zr->zr_os);
+ error = dmu_tx_assign(replay_tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(replay_tx);
+ break;
+ }
+
+ replay_txg = dmu_tx_get_txg(replay_tx);
+
+ if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+ error = EINVAL;
+ } else {
+ /*
+ * On the first pass, arrange for the replay vector
+ * to fail its dmu_tx_assign(). That's the only way
+ * to ensure that those code paths remain well tested.
+ *
+ * Only byteswap (if needed) on the 1st pass.
+ */
+ *zr->zr_txgp = replay_txg - (pass == 1);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+ zr->zr_byteswap && pass == 1);
+ *zr->zr_txgp = TXG_NOWAIT;
+ }
+
+ if (error == 0) {
+ /* Record the replayed sequence number in this txg's slot. */
+ dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
+ zilog->zl_replay_seq[replay_txg & TXG_MASK] =
+ lr->lrc_seq;
+ }
+
+ dmu_tx_commit(replay_tx);
+
+ if (!error)
+ return;
+
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+ * EEXIST. So if we receive any error other than ERESTART
+ * we try syncing out any removes then retrying the
+ * transaction.
+ */
+ if (error != ERESTART && !sunk) {
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ sunk = B_TRUE;
+ continue; /* retry */
+ }
+
+ if (error != ERESTART)
+ break;
+
+ if (pass != 1)
+ txg_wait_open(spa_get_dsl(zilog->zl_spa),
+ replay_txg + 1);
+
+ dprintf("pass %d, retrying\n", pass);
+ }
+
+ /* Only reached on a hard failure: report it and stop the replay. */
+ ASSERT(error && error != ERESTART);
+ name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ dmu_objset_name(zr->zr_os, name);
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n",
+ error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+ zilog->zl_stop_replay = 1;
+ kmem_free(name, MAXNAMELEN);
+}
+
+/*
+ * Per-block callback given to zil_parse() during replay: count the block
+ * in zl_replay_blks.  The remaining arguments are unused.
+ */
+/* ARGSUSED */
+static void
+zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zilog->zl_replay_blks += 1;
+}
+
+/*
+ * If this dataset has a non-empty intent log, replay it and destroy it.
+ *
+ * 'replay_func' is the table of replay vectors (indexed by transaction
+ * type), 'arg' is passed through to each vector, and '*txgp' is written
+ * with the target txg around each vector call.
+ */
+void
+zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+    zil_replay_func_t *replay_func[TX_MAX_TYPE])
+{
+	zilog_t *zilog = dmu_objset_zil(os);
+	const zil_header_t *hdr = zilog->zl_header;
+	zil_replay_arg_t ctx;
+
+	/* First log block is a hole or unreadable: just destroy the log. */
+	if (zil_empty(zilog)) {
+		zil_destroy(zilog, B_TRUE);
+		return;
+	}
+
+	/* Build the context handed to zil_replay_log_record(). */
+	ctx.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+	ctx.zr_byteswap = BP_SHOULD_BYTESWAP(&hdr->zh_log);
+	ctx.zr_txgp = txgp;
+	ctx.zr_arg = arg;
+	ctx.zr_replay = replay_func;
+	ctx.zr_os = os;
+
+	/*
+	 * Wait for in-progress removes to sync before starting replay.
+	 */
+	txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+	zilog->zl_stop_replay = 0;
+	zilog->zl_replay_time = lbolt;
+	ASSERT(zilog->zl_replay_blks == 0);
+	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &ctx,
+	    hdr->zh_claim_txg);
+	kmem_free(ctx.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+
+	/* The log has been consumed; destroy it and wait for that to sync. */
+	zil_destroy(zilog, B_FALSE);
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+}
+
+/*
+ * Report whether all transactions are committed.
+ *
+ * Waits (under zl_lock) for any active writer to finish, then returns
+ * B_FALSE if there are itxs not yet pushed or more than one lwb on the
+ * list, and B_TRUE otherwise.  zl_cv_writer is broadcast before returning.
+ */
+int
+zil_is_committed(zilog_t *zilog)
+{
+	lwb_t *first;
+	int committed;
+
+	mutex_enter(&zilog->zl_lock);
+	while (zilog->zl_writer)
+		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+
+	if (!list_is_empty(&zilog->zl_itx_list)) {
+		/* recent unpushed intent log transactions */
+		committed = B_FALSE;
+	} else if ((first = list_head(&zilog->zl_lwb_list)) == NULL) {
+		/* intent log never used */
+		committed = B_TRUE;
+	} else if (list_next(&zilog->zl_lwb_list, first) != NULL) {
+		/*
+		 * more than 1 log buffer means zil_sync() hasn't yet freed
+		 * entries after a txg has committed
+		 */
+		committed = B_FALSE;
+	} else {
+		ASSERT(zil_empty(zilog));
+		committed = B_TRUE;
+	}
+
+	cv_broadcast(&zilog->zl_cv_writer);
+	mutex_exit(&zilog->zl_lock);
+	return (committed);
+}
diff --git a/zfs/lib/libzpool/zio.c b/zfs/lib/libzpool/zio.c
new file mode 100644
index 000000000..7eb44cbba
--- /dev/null
+++ b/zfs/lib/libzpool/zio.c
@@ -0,0 +1,2082 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zio.c 1.32 08/03/20 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * ==========================================================================
+ * I/O priority table
+ * ==========================================================================
+ */
+/*
+ * One entry per I/O priority class, in the order named by the trailing
+ * comments.
+ * NOTE(review): the meaning and units of the values are not visible in
+ * this file section; confirm against the consumers of this table.
+ */
+uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
+ 0, /* ZIO_PRIORITY_NOW */
+ 0, /* ZIO_PRIORITY_SYNC_READ */
+ 0, /* ZIO_PRIORITY_SYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 4, /* ZIO_PRIORITY_FREE */
+ 0, /* ZIO_PRIORITY_CACHE_FILL */
+ 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 10, /* ZIO_PRIORITY_RESILVER */
+ 20, /* ZIO_PRIORITY_SCRUB */
+};
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+/*
+ * Printable name for each I/O type.
+ * NOTE(review): presumably indexed by zio_type_t, so the order here must
+ * match that enum -- confirm against its declaration.
+ */
+char *zio_type_name[ZIO_TYPES] = {
+ "null", "read", "write", "free", "claim", "ioctl" };
+
+/* Force an allocation failure when non-zero */
+uint16_t zio_zil_fail_shift = 0;
+uint16_t zio_io_fail_shift = 0;
+
+/* Enable/disable the write-retry logic */
+int zio_write_retry = 1;
+
+/*
+ * Taskq to handle reissuing of I/Os.  It is created in zio_init() with
+ * zio_resume_threads threads and destroyed in zio_fini().
+ */
+taskq_t *zio_taskq;
+int zio_resume_threads = 4;
+
+/*
+ * Sync-pass thresholds.  Each field names the sync pass after which the
+ * described behavior kicks in.
+ */
+typedef struct zio_sync_pass {
+ int zp_defer_free; /* defer frees after this pass */
+ int zp_dontcompress; /* don't compress after this pass */
+ int zp_rewrite; /* rewrite new bps after this pass */
+} zio_sync_pass_t;
+
+/*
+ * Default thresholds.  zp_defer_free is consulted by zio_free(): a free
+ * issued for the syncing txg on a later pass than this is queued on the
+ * pool's deferred bplist instead of being issued.
+ */
+zio_sync_pass_t zio_sync_pass = {
+ 1, /* zp_defer_free */
+ 4, /* zp_dontcompress */
+ 1, /* zp_rewrite */
+};
+
+/* Forward declaration; the definition is later in this file. */
+static boolean_t zio_io_should_fail(uint16_t);
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+/* Cache of zio_t structures (see zio_create() and zio_wait()). */
+kmem_cache_t *zio_cache;
+/*
+ * Buffer caches, one slot per multiple of the minimum block size; the slot
+ * for a buffer of 'size' bytes is (size - 1) >> SPA_MINBLOCKSHIFT.
+ * zio_init() makes several slots share one cache.
+ */
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+#ifdef _KERNEL
+/* Arena passed to kmem_cache_create() for the data buffer caches. */
+extern vmem_t *zio_alloc_arena;
+#endif
+
+/*
+ * Determine if we are allowed to issue the IO based on the
+ * pool state. If we must wait then block until we are told
+ * that we may continue.
+ */
+#define ZIO_ENTER(spa) { \
+ if (spa->spa_state == POOL_STATE_IO_FAILURE) { \
+ mutex_enter(&spa->spa_zio_lock); \
+ while (spa->spa_state == POOL_STATE_IO_FAILURE) \
+ cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \
+ mutex_exit(&spa->spa_zio_lock); \
+ } \
+}
+
+/*
+ * An allocation zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.  The test is made
+ * against io_orig_pipeline, the pipeline saved when the zio was created.
+ */
+#define IO_IS_ALLOCATING(zio) \
+ ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+
+/*
+ * One-time setup for the zio subsystem: create the zio_t cache, the
+ * metadata and data buffer caches, the resume taskq, and initialize
+ * fault injection.
+ */
+void
+zio_init(void)
+{
+ size_t c;
+ vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+ data_alloc_arena = zio_alloc_arena;
+#endif
+
+ zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
+ NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
+ * for each quarter-power of 2. For large buffers, we want
+ * a cache for each multiple of PAGESIZE.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+
+ /* Strip low bits until only the highest set bit of size remains. */
+ while (p2 & (p2 - 1))
+ p2 &= p2 - 1;
+
+ if (size <= 4 * SPA_MINBLOCKSIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (P2PHASE(size, PAGESIZE) == 0) {
+ align = PAGESIZE;
+ } else if (P2PHASE(size, p2 >> 2) == 0) {
+ align = p2 >> 2;
+ }
+
+ /* align stays 0 for sizes that do not get a cache of their own. */
+ if (align != 0) {
+ char name[36];
+ (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+
+ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, data_alloc_arena,
+ KMC_NODEBUG);
+
+ }
+ }
+
+ /*
+ * Walk back down from the largest size: any slot without its own
+ * cache shares the cache of the next larger slot.
+ */
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+
+ zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
+ maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+
+ zio_inject_init();
+}
+
+/*
+ * Undo zio_init().
+ *
+ * Several consecutive slots of the buffer cache arrays may point at the
+ * same cache, so a cache is destroyed only when it differs from the one
+ * seen in the previous slot; every slot is then cleared.
+ */
+void
+zio_fini(void)
+{
+	kmem_cache_t *prev_meta = NULL;
+	kmem_cache_t *prev_data = NULL;
+	size_t slot;
+
+	for (slot = 0; slot < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; slot++) {
+		kmem_cache_t *meta = zio_buf_cache[slot];
+		kmem_cache_t *data = zio_data_buf_cache[slot];
+
+		if (meta != prev_meta) {
+			prev_meta = meta;
+			kmem_cache_destroy(meta);
+		}
+		zio_buf_cache[slot] = NULL;
+
+		if (data != prev_data) {
+			prev_data = data;
+			kmem_cache_destroy(data);
+		}
+		zio_data_buf_cache[slot] = NULL;
+	}
+
+	taskq_destroy(zio_taskq);
+
+	kmem_cache_destroy(zio_cache);
+
+	zio_inject_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Allocate a buffer for ZFS metadata.  Such buffers appear in a crashdump
+ * if the kernel panics, so use this judiciously: inspecting metadata is
+ * useful, but excess or transient data should not be kept in-core for a
+ * crashdump where that can be avoided.
+ *
+ * The buffer comes from the zio_buf_cache slot selected by 'size'.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+	kmem_cache_t *cache;
+
+	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	cache = zio_buf_cache[c];
+	return (kmem_cache_alloc(cache, KM_PUSHPAGE));
+}
+
+/*
+ * Allocate a buffer for file data.  Such buffers do not appear in a
+ * crashdump if the kernel panics; this limits how much ZFS data (and thus
+ * how much kernel heap) is written out when the kernel panics.
+ *
+ * The buffer comes from the zio_data_buf_cache slot selected by 'size'.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+	kmem_cache_t *cache;
+
+	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	cache = zio_data_buf_cache[c];
+	return (kmem_cache_alloc(cache, KM_PUSHPAGE));
+}
+
+/*
+ * Return a buffer obtained from zio_buf_alloc().  'size' must be the size
+ * it was allocated with, since it selects the cache the buffer goes to.
+ */
+void
+zio_buf_free(void *buf, size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+	kmem_cache_t *cache;
+
+	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	cache = zio_buf_cache[c];
+	kmem_cache_free(cache, buf);
+}
+
+/*
+ * Return a buffer obtained from zio_data_buf_alloc().  'size' must be the
+ * size it was allocated with, since it selects the cache the buffer goes to.
+ */
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+	kmem_cache_t *cache;
+
+	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	cache = zio_data_buf_cache[c];
+	kmem_cache_free(cache, buf);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+/*
+ * Push a (data, size, bufsize) entry onto the zio's transform stack and
+ * make it the zio's current io_data/io_size.
+ */
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
+{
+	zio_transform_t *entry;
+
+	entry = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+	/* Link the new entry in front of the current top of stack. */
+	entry->zt_next = zio->io_transform_stack;
+	entry->zt_bufsize = bufsize;
+	entry->zt_size = size;
+	entry->zt_data = data;
+	zio->io_transform_stack = entry;
+
+	zio->io_size = size;
+	zio->io_data = data;
+}
+
+/*
+ * Pop the top entry of the zio's transform stack, returning its contents
+ * through the out parameters.  If another entry remains, it becomes the
+ * zio's current io_data/io_size; otherwise those fields are left as is.
+ * The stack must not be empty on entry.
+ */
+static void
+zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
+{
+	zio_transform_t *top = zio->io_transform_stack;
+	zio_transform_t *below;
+
+	*data = top->zt_data;
+	*size = top->zt_size;
+	*bufsize = top->zt_bufsize;
+
+	below = top->zt_next;
+	zio->io_transform_stack = below;
+	kmem_free(top, sizeof (zio_transform_t));
+
+	if (below != NULL) {
+		zio->io_data = below->zt_data;
+		zio->io_size = below->zt_size;
+	}
+}
+
+/*
+ * Empty the zio's transform stack.  The buffer of every entry except the
+ * bottom-most one is released with zio_buf_free(); the bottom entry's
+ * buffer is left alone.
+ */
+static void
+zio_clear_transform_stack(zio_t *zio)
+{
+	uint64_t size, bufsize;
+	void *data;
+
+	ASSERT(zio->io_transform_stack != NULL);
+
+	for (;;) {
+		zio_pop_transform(zio, &data, &size, &bufsize);
+		if (zio->io_transform_stack == NULL)
+			break;
+		/* More entries remain below, so this one is not the bottom. */
+		zio_buf_free(data, bufsize);
+	}
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free)
+ * ==========================================================================
+ */
+/*
+ * Common constructor behind the zio_*() creation routines in this file.
+ *
+ * Allocates a zeroed zio_t from zio_cache, records the caller's arguments,
+ * pushes (data, size) as the initial transform, links the new zio under
+ * 'pio' when one is given, and saves the initial stage, pipeline and flags
+ * for a later retry.  When 'bp' is non-NULL, io_bp points at the caller's
+ * bp and copies of *bp are kept in io_bp_copy and io_bp_orig.
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+
+ zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+ bzero(zio, sizeof (zio_t));
+ zio->io_parent = pio;
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ zio->io_flags = flags;
+ if (bp != NULL) {
+ zio->io_bp = bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ }
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_stage = stage;
+ zio->io_pipeline = pipeline;
+ zio->io_timestamp = lbolt64;
+ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+ zio_push_transform(zio, data, size, size);
+
+ /*
+ * Note on config lock:
+ *
+ * If CONFIG_HELD is set, then the caller already has the config
+ * lock, so we don't need it for this io.
+ *
+ * We set CONFIG_GRABBED to indicate that we have grabbed the
+ * config lock on behalf of this io, so it should be released
+ * in zio_done.
+ *
+ * Unless CONFIG_HELD is set, we will grab the config lock for
+ * any top-level (parent-less) io, *except* NULL top-level ios.
+ * The NULL top-level ios rarely have any children, so we delay
+ * grabbing the lock until the first child is added (but it is
+ * still grabbed on behalf of the top-level i/o, so additional
+ * children don't need to also grab it). This greatly reduces
+ * contention on the config lock.
+ */
+ if (pio == NULL) {
+ if (type != ZIO_TYPE_NULL &&
+ !(flags & ZIO_FLAG_CONFIG_HELD)) {
+ spa_config_enter(spa, RW_READER, zio);
+ zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
+ }
+ zio->io_root = zio;
+ } else {
+ zio->io_root = pio->io_root;
+ if (!(flags & ZIO_FLAG_NOBOOKMARK))
+ zio->io_logical = pio->io_logical;
+ mutex_enter(&pio->io_lock);
+ /* Deferred config-lock grab for a NULL top-level parent. */
+ if (pio->io_parent == NULL &&
+ pio->io_type == ZIO_TYPE_NULL &&
+ !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
+ !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
+ pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
+ spa_config_enter(spa, RW_READER, pio);
+ }
+ /* Account for this child in the parent's outstanding counts. */
+ if (stage < ZIO_STAGE_READY)
+ pio->io_children_notready++;
+ pio->io_children_notdone++;
+ /* Insert at the head of the parent's child list. */
+ zio->io_sibling_next = pio->io_child;
+ zio->io_sibling_prev = NULL;
+ if (pio->io_child != NULL)
+ pio->io_child->io_sibling_prev = zio;
+ pio->io_child = zio;
+ zio->io_ndvas = pio->io_ndvas;
+ mutex_exit(&pio->io_lock);
+ }
+
+ /*
+ * Save off the original state in case we need to retry later.
+ */
+ zio->io_orig_stage = zio->io_stage;
+ zio->io_orig_pipeline = zio->io_pipeline;
+ zio->io_orig_flags = zio->io_flags;
+
+ return (zio);
+}
+
+/*
+ * Put a zio back into the state saved by zio_create(): the transform
+ * stack is emptied and re-seeded with the zio's current data and size,
+ * and the flags, stage and pipeline are restored from their io_orig_*
+ * copies.
+ */
+static void
+zio_reset(zio_t *zio)
+{
+	void *base_data;
+	uint64_t base_size;
+
+	zio_clear_transform_stack(zio);
+
+	/* Whatever io_data/io_size hold now becomes the sole transform. */
+	base_data = zio->io_data;
+	base_size = zio->io_size;
+	zio_push_transform(zio, base_data, base_size, base_size);
+
+	zio->io_pipeline = zio->io_orig_pipeline;
+	zio->io_stage = zio->io_orig_stage;
+	zio->io_flags = zio->io_orig_flags;
+}
+
+/*
+ * Create a zio of type ZIO_TYPE_NULL with no bp and no data, using the
+ * wait-for-children pipeline.  'pio' may be NULL.
+ */
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
+    int flags)
+{
+	return (zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
+	    ZIO_WAIT_FOR_CHILDREN_PIPELINE));
+}
+
+/*
+ * Create a root zio: a null zio with no parent.
+ */
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+{
+	zio_t *root = zio_null(NULL, spa, done, private, flags);
+
+	return (root);
+}
+
+/*
+ * Create a logical read of the block described by 'bp' into 'data'.
+ * 'size' must equal the block's logical size.  The zio works from its own
+ * copy of the bp, so the caller's bp need not outlive this call.
+ */
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+    uint64_t size, zio_done_func_t *done, void *private,
+    int priority, int flags, zbookmark_t *zb)
+{
+	zio_t *zio;
+
+	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+
+	/*
+	 * If the user has specified that we allow I/Os to continue
+	 * then attempt to satisfy the read.
+	 */
+	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) {
+		ZIO_ENTER(spa);
+	}
+
+	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
+	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+
+	/* A read created here is its own logical zio. */
+	zio->io_logical = zio;
+	zio->io_bookmark = *zb;
+
+	/*
+	 * Work off our copy of the bp so the caller can free it.
+	 */
+	zio->io_bp = &zio->io_bp_copy;
+
+	return (zio);
+}
+
+/*
+ * Create a logical write of 'data' for transaction group 'txg'.
+ *
+ * 'ready' is stored as the zio's io_ready callback and 'done' as its
+ * completion callback.  Unless the bp was already born in this txg, it is
+ * zeroed and given logical and physical sizes equal to 'size'.
+ */
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+    int flags, zbookmark_t *zb)
+{
+	zio_t *zio;
+
+	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
+	    checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+	ASSERT(compress >= ZIO_COMPRESS_OFF &&
+	    compress < ZIO_COMPRESS_FUNCTIONS);
+
+	ZIO_ENTER(spa);
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
+	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+
+	zio->io_logical = zio;
+	zio->io_bookmark = *zb;
+	zio->io_ready = ready;
+
+	zio->io_ndvas = ncopies;
+	zio->io_compress = compress;
+	zio->io_checksum = checksum;
+
+	if (bp->blk_birth == txg) {
+		/* Make sure someone doesn't change their mind on overwrites */
+		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
+		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+	} else {
+		/* XXX the bp usually (always?) gets re-zeroed later */
+		BP_ZERO(bp);
+		BP_SET_LSIZE(bp, size);
+		BP_SET_PSIZE(bp, size);
+	}
+
+	return (zio);
+}
+
+/*
+ * Create a write that uses the rewrite pipeline for 'bp'.  Compression is
+ * always off for a rewrite.
+ */
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags,
+    zbookmark_t *zb)
+{
+	zio_t *zio;
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
+	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
+
+	zio->io_compress = ZIO_COMPRESS_OFF;
+	zio->io_checksum = checksum;
+	zio->io_bookmark = *zb;
+
+	/* With a parent, io_ndvas was inherited from it in zio_create(). */
+	if (pio != NULL)
+		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+
+	return (zio);
+}
+
+/*
+ * io_ready callback installed by zio_write_allocate(): if the zio's
+ * original bp was not a hole, issue a child free of that previous block.
+ */
+static void
+zio_write_allocate_ready(zio_t *zio)
+{
+	/* Nothing was there before, so nothing to free. */
+	if (BP_IS_HOLE(&zio->io_bp_orig))
+		return;
+
+	/* Free up the previous block */
+	zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+	    &zio->io_bp_orig, NULL, NULL));
+}
+
+/*
+ * Create an uncompressed write that uses the write-allocate pipeline.
+ * The bp is reset to a zeroed bp of the given size before the zio is
+ * created, and zio_write_allocate_ready() is installed as the io_ready
+ * callback.
+ */
+static zio_t *
+zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags)
+{
+	zio_t *zio;
+
+	/* Must precede zio_create(), which copies *bp. */
+	BP_ZERO(bp);
+	BP_SET_LSIZE(bp, size);
+	BP_SET_PSIZE(bp, size);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags,
+	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
+
+	zio->io_ready = zio_write_allocate_ready;
+	zio->io_compress = ZIO_COMPRESS_OFF;
+	zio->io_checksum = checksum;
+
+	return (zio);
+}
+
+/*
+ * Create a zio that frees the block described by 'bp' (which must not be
+ * a hole) in transaction group 'txg'.
+ *
+ * If 'txg' is the pool's syncing txg and the current sync pass is beyond
+ * zio_sync_pass.zp_defer_free, the bp is queued on the pool's deferred
+ * bplist instead and a null zio is returned in its place.
+ *
+ * NOTE(review): on the deferred path the null zio is created with NULL
+ * done/private, so the caller's 'done' callback is not attached to the
+ * returned zio -- confirm this is intended.
+ */
+zio_t *
+zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private)
+{
+	zio_t *zio;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	if (txg == spa->spa_syncing_txg) {
+		if (spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
+			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
+			return (zio_null(pio, spa, NULL, NULL, 0));
+		}
+	}
+
+	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
+	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
+
+	/* Work from the zio's private copy of the bp. */
+	zio->io_bp = &zio->io_bp_copy;
+
+	return (zio);
+}
+
+/*
+ * Create a zio that claims the block described by 'bp'.
+ *
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ */
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private)
+{
+	zio_t *zio;
+
+	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+	ASSERT3U(spa_first_txg(spa), <=, txg);
+
+	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
+	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
+
+	/* Work from the zio's private copy of the bp. */
+	zio->io_bp = &zio->io_bp_copy;
+
+	return (zio);
+}
+
+/*
+ * Create an ioctl zio carrying 'cmd' for vdev 'vd'.
+ *
+ * For a leaf vdev a single ioctl zio is created.  For an interior vdev a
+ * null zio is created and an ioctl is issued, recursively, to every child
+ * vdev beneath it; 'done' and 'private' are passed on to those children
+ * and not to the null zio itself.
+ */
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+    zio_done_func_t *done, void *private, int priority, int flags)
+{
+	zio_t *zio;
+	int c;
+
+	if (vd->vdev_children == 0) {
+		/* Leaf vdev: the ioctl goes straight to it. */
+		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+		    ZIO_TYPE_IOCTL, priority, flags,
+		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+		zio->io_cmd = cmd;
+		zio->io_vd = vd;
+
+		return (zio);
+	}
+
+	/* Interior vdev: fan out under a null parent. */
+	zio = zio_null(pio, spa, NULL, NULL, flags);
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+		    done, private, priority, flags));
+	}
+
+	return (zio);
+}
+
+/*
+ * Fill in 'bp' for a physical I/O of 'size' bytes at 'offset' on leaf
+ * vdev 'vd'.  The bp is zeroed and then given equal logical and physical
+ * sizes, the requested checksum, no compression and host byte order.
+ * When a checksum is in use, the offset is stored in the first word of
+ * blk_cksum.  If 'labels' is set, debug builds assert that the range lies
+ * inside the label areas at the start or end of the vdev.
+ */
+static void
+zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
+ int checksum, boolean_t labels)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ ASSERT(size <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+#ifdef ZFS_DEBUG
+ if (labels) {
+ ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ }
+#endif
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ BP_ZERO(bp);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ if (checksum != ZIO_CHECKSUM_OFF)
+ ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
+}
+
+/*
+ * Create a physical read of 'size' bytes at 'offset' on leaf vdev 'vd'.
+ * A temporary bp describing the range is built on the stack; the zio
+ * keeps and uses its own copy of it.
+ */
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    void *data, int checksum, zio_done_func_t *done, void *private,
+    int priority, int flags, boolean_t labels)
+{
+	blkptr_t phys_bp;
+	zio_t *zio;
+
+	ZIO_ENTER(vd->vdev_spa);
+
+	zio_phys_bp_init(vd, &phys_bp, offset, size, checksum, labels);
+
+	zio = zio_create(pio, vd->vdev_spa, 0, &phys_bp, data, size, done,
+	    private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
+	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+	zio->io_offset = offset;
+	zio->io_vd = vd;
+
+	/*
+	 * phys_bp goes out of scope on return, so the zio must work off
+	 * the copy that zio_create() made.
+	 */
+	zio->io_bp = &zio->io_bp_copy;
+
+	return (zio);
+}
+
+/*
+ * Create a physical write of 'size' bytes at 'offset' on leaf vdev 'vd'.
+ * A temporary bp describing the range is built on the stack; the zio
+ * keeps and uses its own copy of it.  For checksums that use a block tail
+ * (ci_zbt), the data is first copied into a private buffer.
+ */
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    void *data, int checksum, zio_done_func_t *done, void *private,
+    int priority, int flags, boolean_t labels)
+{
+	zio_block_tail_t *tail;
+	blkptr_t phys_bp;
+	void *copy;
+	zio_t *zio;
+
+	ZIO_ENTER(vd->vdev_spa);
+
+	zio_phys_bp_init(vd, &phys_bp, offset, size, checksum, labels);
+
+	zio = zio_create(pio, vd->vdev_spa, 0, &phys_bp, data, size, done,
+	    private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
+	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+	zio->io_offset = offset;
+	zio->io_vd = vd;
+
+	zio->io_checksum = checksum;
+	zio->io_bp = &zio->io_bp_copy;
+
+	if (!zio_checksum_table[checksum].ci_zbt)
+		return (zio);
+
+	/*
+	 * zbt checksums are necessarily destructive -- they modify
+	 * one word of the write buffer to hold the verifier/checksum.
+	 * Therefore, we must make a local copy in case the data is
+	 * being written to multiple places.
+	 */
+	copy = zio_buf_alloc(size);
+	bcopy(data, copy, size);
+	zio_push_transform(zio, copy, size, size);
+
+	/* The block tail occupies the last bytes of the buffer. */
+	tail = (zio_block_tail_t *)((char *)copy + size) - 1;
+	tail->zbt_cksum = phys_bp.blk_cksum;
+
+	return (zio);
+}
+
+/*
+ * Create a child I/O on vdev 'vd' to do some work for 'zio'.  'bp' may be
+ * NULL.  The child inherits the parent's ZIO_FLAG_VDEV_INHERIT flags, is
+ * always allowed to fail (ZIO_FLAG_CANFAIL), and starts one stage before
+ * ZIO_STAGE_VDEV_IO_START.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+    void *data, uint64_t size, int type, int priority, int flags,
+    zio_done_func_t *done, void *private)
+{
+	uint32_t child_pipeline = ZIO_VDEV_CHILD_PIPELINE;
+	int child_flags;
+	zio_t *cio;
+
+	if (bp != NULL && type == ZIO_TYPE_READ) {
+		/*
+		 * If we have the bp, then the child should perform the
+		 * checksum and the parent need not. This pushes error
+		 * detection as close to the leaves as possible and
+		 * eliminates redundant checksums in the interior nodes.
+		 */
+		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+		child_pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
+	}
+
+	child_flags = (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
+	    ZIO_FLAG_CANFAIL | flags;
+
+	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
+	    done, private, type, priority, child_flags,
+	    ZIO_STAGE_VDEV_IO_START - 1, child_pipeline);
+
+	cio->io_offset = offset;
+	cio->io_vd = vd;
+
+	return (cio);
+}
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+/*
+ * Execute 'zio' and block until it reaches the done stage. Registering
+ * the caller as io_waiter makes zio_done() signal io_cv instead of freeing
+ * the zio, so the zio is torn down and freed here. Returns the zio's
+ * final io_error; the zio must not be touched after this returns.
+ */
+int
+zio_wait(zio_t *zio)
+{
+ int error;
+
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+
+ zio->io_waiter = curthread;
+
+ zio_execute(zio);
+
+ /* zio_done() sets io_stalled to ZIO_STAGE_DONE and broadcasts. */
+ mutex_enter(&zio->io_lock);
+ while (zio->io_stalled != ZIO_STAGE_DONE)
+ cv_wait(&zio->io_cv, &zio->io_lock);
+ mutex_exit(&zio->io_lock);
+
+ /* Capture the result before the zio is returned to the cache. */
+ error = zio->io_error;
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ kmem_cache_free(zio_cache, zio);
+
+ return (error);
+}
+
+/*
+ * Start executing 'zio' without waiting for it. No waiter is registered
+ * here; when io_waiter is unset, zio_done() frees the zio itself.
+ */
+void
+zio_nowait(zio_t *zio)
+{
+ zio_execute(zio);
+}
+
+/*
+ * Continue 'zio' on the pool's interrupt taskq for its I/O type by
+ * dispatching zio_execute() there (TQ_SLEEP).
+ */
+void
+zio_interrupt(zio_t *zio)
+{
+ (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
+ (task_func_t *)zio_execute, zio, TQ_SLEEP);
+}
+
+/*
+ * Pipeline stage: hand the zio to the pool's issue taskq for its I/O type.
+ * Returns ZIO_PIPELINE_STOP so the current thread stops running this zio;
+ * the taskq thread resumes it via zio_execute().
+ */
+static int
+zio_issue_async(zio_t *zio)
+{
+ (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
+ (task_func_t *)zio_execute, zio, TQ_SLEEP);
+
+ return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline interlocks: parent/child dependency scoreboarding
+ * ==========================================================================
+ */
+/*
+ * Check, under io_lock, whether the child counter '*countp' has drained.
+ * If children are still outstanding, record 'stage' in io_stalled and
+ * return ZIO_PIPELINE_STOP; zio_notify_parent() restarts the zio when the
+ * counter reaches zero. Otherwise return ZIO_PIPELINE_CONTINUE.
+ */
+static int
+zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ int rv = ZIO_PIPELINE_CONTINUE;
+
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stalled == 0);
+ if (*countp != 0) {
+ zio->io_stalled = stage;
+ rv = ZIO_PIPELINE_STOP;
+ }
+ mutex_exit(&zio->io_lock);
+
+ return (rv);
+}
+
+/*
+ * Tell the parent of 'zio' that this child has passed 'stage'. Under the
+ * parent's io_lock: hand the child's error to the parent if the parent has
+ * none yet (unless ZIO_FLAG_DONT_PROPAGATE is set on the child), then
+ * decrement '*countp'. If that was the last child and the parent is
+ * stalled at 'stage', clear the stall and resume the parent.
+ */
+static void
+zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ zio_t *pio = zio->io_parent;
+
+ mutex_enter(&pio->io_lock);
+ if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ pio->io_error = zio->io_error;
+ ASSERT3U(*countp, >, 0);
+ if (--*countp == 0 && pio->io_stalled == stage) {
+ pio->io_stalled = 0;
+ /* Drop the lock first: zio_execute() asserts it is not held. */
+ mutex_exit(&pio->io_lock);
+ zio_execute(pio);
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+/*
+ * Pipeline stage: stall until every child has reported "ready", i.e.
+ * until io_children_notready has dropped to zero.
+ */
+int
+zio_wait_for_children_ready(zio_t *zio)
+{
+	uint64_t *pending = &zio->io_children_notready;
+
+	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
+	    pending));
+}
+
+/*
+ * Pipeline stage: stall until every child has reported "done", i.e.
+ * until io_children_notdone has dropped to zero.
+ */
+int
+zio_wait_for_children_done(zio_t *zio)
+{
+	uint64_t *pending = &zio->io_children_notdone;
+
+	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
+	    pending));
+}
+
+/*
+ * Pipeline stage: prepare a logical read based on its bp. Compressed
+ * blocks get a physical-size buffer pushed on the transform stack and the
+ * decompress stage enabled; gang blocks get a gang-header-sized buffer and
+ * the read-gang-members stage enabled. Level-0 blocks of non-metadata
+ * object types are flagged ZIO_FLAG_DONT_CACHE.
+ */
+static int
+zio_read_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(csize);
+
+ zio_push_transform(zio, cbuf, csize, csize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+ }
+
+ if (BP_IS_GANG(bp)) {
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+ }
+
+ if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: the zio is "ready". Runs the optional io_ready
+ * callback, notifies the parent (if any) that one more child is ready,
+ * and then snapshots the current bp into io_bp_copy.
+ */
+static int
+zio_ready(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+
+ if (zio->io_ready)
+ zio->io_ready(zio);
+
+ if (pio != NULL)
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
+ &pio->io_children_notready);
+
+ /* zio_assess() later asserts that *io_bp still matches this copy. */
+ if (zio->io_bp)
+ zio->io_bp_copy = *zio->io_bp;
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Re-arm a failed allocating write so that it can be driven through the
+ * pipeline again. The failed bp is saved in io_bp_orig, the bp's DVAs are
+ * zeroed, the zio is reset, and the parent (if any) is told to expect one
+ * more "ready" notification. Called from zio_assess(); always returns
+ * ZIO_PIPELINE_CONTINUE.
+ */
+static int
+zio_vdev_retry_io(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+
+ /*
+ * Preserve the failed bp so that the io_ready() callback can
+ * update the accounting accordingly. The callback will also be
+ * responsible for freeing the previously allocated block, if one
+ * exists.
+ */
+ zio->io_bp_orig = *zio->io_bp;
+
+ /*
+ * We must zero out the old DVA and blk_birth before reallocating
+ * the bp.
+ */
+ BP_ZERO_DVAS(zio->io_bp);
+ zio_reset(zio);
+
+ if (pio) {
+ /*
+ * Let the parent know that we will
+ * re-alloc the write (=> new bp info).
+ */
+ mutex_enter(&pio->io_lock);
+ pio->io_children_notready++;
+
+ /*
+ * If the parent I/O is still in the open stage, then
+ * don't bother telling it to retry since it hasn't
+ * progressed far enough for it to care.
+ */
+ if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
+ pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
+
+ ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
+ mutex_exit(&pio->io_lock);
+ }
+
+ /*
+ * We are getting ready to process the retry request so clear
+ * the flag and the zio's current error status.
+ */
+ zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
+ zio->io_error = 0;
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Try to resume the I/Os that zio_vdev_suspend_io() queued on
+ * spa_zio_list. Returns 0 on success, the vdev_probe() error if a vdev
+ * used by a must-succeed I/O still fails its probe, or EIO if the pool is
+ * back in POOL_STATE_IO_FAILURE after the queued I/Os were re-dispatched.
+ */
+int
+zio_vdev_resume_io(spa_t *spa)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_zio_lock);
+
+ /*
+ * Probe all of the vdevs that have experienced an I/O error.
+ * If we are still unable to verify the integrity of the vdev
+ * then we prevent the resume from proceeding.
+ */
+ for (zio = list_head(&spa->spa_zio_list); zio != NULL;
+ zio = list_next(&spa->spa_zio_list, zio)) {
+ int error = 0;
+
+ /* We only care about I/Os that must succeed */
+ if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
+ continue;
+ error = vdev_probe(zio->io_vd);
+ if (error) {
+ mutex_exit(&spa->spa_zio_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * Clear the vdev stats so that I/O can flow.
+ */
+ vdev_clear(spa, NULL, B_FALSE);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
+ list_remove(&spa->spa_zio_list, zio);
+ zio->io_error = 0;
+
+ /*
+ * If we are resuming an allocating I/O then we force it
+ * to retry and let it resume operation where it left off.
+ * Otherwise, go back to the ready stage and pick up from
+ * there.
+ */
+ if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
+ zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
+ /* zio_execute() pre-increments, so this re-runs the stage. */
+ zio->io_stage--;
+ } else {
+ zio->io_stage = ZIO_STAGE_READY;
+ }
+
+ (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
+ zio, TQ_SLEEP);
+ }
+ mutex_exit(&spa->spa_zio_lock);
+
+ /*
+ * Wait for the taskqs to finish and recheck the pool state since
+ * it's possible that a resumed I/O has failed again.
+ */
+ taskq_wait(zio_taskq);
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
+ /* Wake any threads waiting on spa_zio_cv now that I/O has resumed. */
+ mutex_enter(&spa->spa_zio_lock);
+ cv_broadcast(&spa->spa_zio_cv);
+ mutex_exit(&spa->spa_zio_lock);
+
+ return (0);
+}
+
+/*
+ * Park 'zio' after an unrecoverable failure: mark the pool
+ * POOL_STATE_IO_FAILURE and append the zio to spa_zio_list, from which
+ * zio_vdev_resume_io() re-dispatches it. Returns ZIO_PIPELINE_STOP so
+ * the pipeline stops processing this zio.
+ */
+static int
+zio_vdev_suspend_io(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ /*
+ * We've experienced an unrecoverable failure so
+ * set the pool state accordingly and queue all
+ * failed IOs.
+ */
+ spa->spa_state = POOL_STATE_IO_FAILURE;
+
+ mutex_enter(&spa->spa_zio_lock);
+ list_insert_tail(&spa->spa_zio_list, zio);
+
+#ifndef _KERNEL
+ /* Used to notify ztest that the pool has suspended */
+ cv_broadcast(&spa->spa_zio_cv);
+#endif
+ mutex_exit(&spa->spa_zio_lock);
+
+ return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * Pipeline stage: examine the outcome of the I/O before zio_done() runs.
+ *
+ * Sanity-checks the bp, converts a pending ZIO_FLAG_WRITE_RETRY into an
+ * ERESTART error, updates the vdev statistics and, on error, posts
+ * ereports and logs the failure. A failed allocating write is re-armed
+ * through zio_vdev_retry_io() (unless the error is ENOSPC or
+ * zio_write_retry is off). A failed I/O that is not allowed to fail
+ * either panics or is parked by zio_vdev_suspend_io(), depending on the
+ * pool's failmode.
+ *
+ * Returns ZIO_PIPELINE_CONTINUE, or whatever zio_vdev_retry_io() or
+ * zio_vdev_suspend_io() returns.
+ */
+static int
+zio_assess(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	vdev_t *vd = zio->io_vd;
+
+	ASSERT(zio->io_children_notready == 0);
+	ASSERT(zio->io_children_notdone == 0);
+
+	if (bp != NULL) {
+		ASSERT(bp->blk_pad[0] == 0);
+		ASSERT(bp->blk_pad[1] == 0);
+		ASSERT(bp->blk_pad[2] == 0);
+		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+			ASSERT(!BP_SHOULD_BYTESWAP(bp));
+			if (zio->io_ndvas != 0)
+				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+			ASSERT(BP_COUNT_GANG(bp) == 0 ||
+			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+		}
+	}
+
+	/*
+	 * Some child I/O has indicated that a retry is necessary, so
+	 * we set an error on the I/O and let the logic below do the
+	 * rest.
+	 */
+	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
+		zio->io_error = ERESTART;
+
+	if (vd != NULL)
+		vdev_stat_update(zio);
+
+	if (zio->io_error) {
+		/*
+		 * If this I/O is attached to a particular vdev,
+		 * generate an error message describing the I/O failure
+		 * at the block level. We ignore these errors if the
+		 * device is currently unavailable.
+		 */
+		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
+			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
+
+		if ((zio->io_error == EIO ||
+		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
+		    zio->io_logical == zio) {
+			/*
+			 * For root I/O requests, tell the SPA to log the error
+			 * appropriately. Also, generate a logical data
+			 * ereport.
+			 */
+			spa_log_error(spa, zio);
+
+			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
+			    0, 0);
+		}
+
+		/*
+		 * If we are an allocating I/O then we attempt to reissue
+		 * the I/O on another vdev unless the pool is out of space.
+		 * We handle this condition based on the spa's failmode
+		 * property.
+		 */
+		if (zio_write_retry && zio->io_error != ENOSPC &&
+		    IO_IS_ALLOCATING(zio))
+			return (zio_vdev_retry_io(zio));
+
+		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
+
+		/*
+		 * For I/O requests that cannot fail, we carry out
+		 * the requested behavior based on the failmode pool
+		 * property.
+		 *
+		 * XXX - Need to differentiate between an ENOSPC as
+		 * a result of vdev failures vs. a full pool.
+		 */
+		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+#ifdef ZFS_DEBUG
+			/* Only the debug message below needs this buffer. */
+			char *blkbuf;
+
+			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
+			if (blkbuf) {
+				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
+				    bp ? bp : &zio->io_bp_copy);
+			}
+			cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p "
+			    "%s): error %d", zio->io_error == ECKSUM ?
+			    "bad checksum" : "I/O failure",
+			    zio_type_name[zio->io_type],
+			    vdev_description(vd),
+			    (u_longlong_t)zio->io_offset,
+			    (void *)zio, blkbuf ? blkbuf : "", zio->io_error);
+
+			/*
+			 * The formatted bp was only needed for the message
+			 * above; release it so each failure doesn't leak
+			 * BP_SPRINTF_LEN bytes.
+			 */
+			if (blkbuf)
+				kmem_free(blkbuf, BP_SPRINTF_LEN);
+#endif
+
+			if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
+				fm_panic("Pool '%s' has encountered an "
+				    "uncorrectable I/O failure and the "
+				    "failure mode property for this pool "
+				    "is set to panic.", spa_name(spa));
+			}
+			cmn_err(CE_WARN, "Pool '%s' has encountered "
+			    "an uncorrectable I/O error. "
+			    "Manual intervention is required.", spa_name(spa));
+			return (zio_vdev_suspend_io(zio));
+		}
+	}
+	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
+	ASSERT(zio->io_children_notready == 0);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Final pipeline stage. Releases the transform stack, runs the optional
+ * io_done callback, unlinks the zio from its parent's child list and
+ * notifies the parent, drops the config lock if this zio grabbed it, and
+ * then either wakes the thread blocked in zio_wait() or frees the zio.
+ * Always returns ZIO_PIPELINE_STOP.
+ */
+static int
+zio_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT(zio->io_children_notready == 0);
+ ASSERT(zio->io_children_notdone == 0);
+
+ zio_clear_transform_stack(zio);
+
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ ASSERT(zio->io_delegate_list == NULL);
+ ASSERT(zio->io_delegate_next == NULL);
+
+ if (pio != NULL) {
+ zio_t *next, *prev;
+
+ /* Unlink from the parent's sibling list under its lock. */
+ mutex_enter(&pio->io_lock);
+ next = zio->io_sibling_next;
+ prev = zio->io_sibling_prev;
+ if (next != NULL)
+ next->io_sibling_prev = prev;
+ if (prev != NULL)
+ prev->io_sibling_next = next;
+ if (pio->io_child == zio)
+ pio->io_child = next;
+ mutex_exit(&pio->io_lock);
+
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
+ &pio->io_children_notdone);
+ }
+
+ /*
+ * Note: this I/O is now done, and will shortly be freed, so there is no
+ * need to clear this (or any other) flag.
+ */
+ if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
+ spa_config_exit(spa, zio);
+
+ if (zio->io_waiter != NULL) {
+ /* zio_wait() owns the teardown; just signal completion. */
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio->io_stalled = zio->io_stage;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ kmem_cache_free(zio_cache, zio);
+ }
+
+ return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * ==========================================================================
+ * Compression support
+ * ==========================================================================
+ */
+/*
+ * Pipeline stage: compress the write payload (when compression is on and
+ * succeeds) and decide how the block is written: rewritten in place via
+ * ZIO_REWRITE_PIPELINE, reduced to a hole when the compressed size is 0,
+ * or sized in the bp for a fresh allocation by later stages.
+ */
+static int
+zio_write_compress(zio_t *zio)
+{
+ int compress = zio->io_compress;
+ blkptr_t *bp = zio->io_bp;
+ void *cbuf;
+ uint64_t lsize = zio->io_size;
+ uint64_t csize = lsize;
+ uint64_t cbufsize = 0;
+ int pass;
+
+ if (bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(zio->io_spa);
+ if (pass > zio_sync_pass.zp_dontcompress)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT(BP_IS_HOLE(bp));
+ pass = 1;
+ }
+
+ /* A false return from zio_compress_data() means "write uncompressed". */
+ if (compress != ZIO_COMPRESS_OFF)
+ if (!zio_compress_data(compress, zio->io_data, zio->io_size,
+ &cbuf, &csize, &cbufsize))
+ compress = ZIO_COMPRESS_OFF;
+
+ if (compress != ZIO_COMPRESS_OFF && csize != 0)
+ zio_push_transform(zio, cbuf, csize, cbufsize);
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to reallocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ pass > zio_sync_pass.zp_rewrite) {
+ ASSERT(csize != 0);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
+ } else {
+ if (bp->blk_birth == zio->io_txg)
+ BP_ZERO(bp);
+ if (csize == 0) {
+ /* Nothing to store: leave a hole and skip the write. */
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
+ } else {
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, csize);
+ BP_SET_COMPRESS(bp, compress);
+ }
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: pop the buffer holding the compressed data off the
+ * transform stack, decompress it into io_data and free the buffer. A
+ * non-zero return from zio_decompress_data() is recorded as EIO.
+ */
+static int
+zio_read_decompress(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ void *data;
+ uint64_t size;
+ uint64_t bufsize;
+ int compress = BP_GET_COMPRESS(bp);
+
+ ASSERT(compress != ZIO_COMPRESS_OFF);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+
+ if (zio_decompress_data(compress, data, size,
+ zio->io_data, zio->io_size))
+ zio->io_error = EIO;
+
+ zio_buf_free(data, bufsize);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Gang block support
+ * ==========================================================================
+ */
+/*
+ * Byteswap the gang header currently held in io_data (as an array of
+ * 64-bit words) when the bp says the block needs byteswapping.
+ */
+static void
+zio_gang_byteswap(zio_t *zio)
+{
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+
+ if (BP_SHOULD_BYTESWAP(zio->io_bp))
+ byteswap_uint64_array(zio->io_data, zio->io_size);
+}
+
+/*
+ * Pipeline stage: read the gang header of a gang block. A header-sized
+ * buffer is pushed on the transform stack and filled by a child read
+ * using ZIO_READ_GANG_PIPELINE; this zio then waits for its children to
+ * be done.
+ */
+static int
+zio_get_gang_header(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+
+ zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
+ NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
+
+ return (zio_wait_for_children_done(zio));
+}
+
+/*
+ * Pipeline stage: issue one child read per gang member. The gang header
+ * is byteswapped if needed and popped off the transform stack; each
+ * member is read into its slice of io_data. The header buffer is freed
+ * once the children are issued, and this zio then waits for them.
+ */
+static int
+zio_read_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ /* loff walks the logical data; it must land exactly on io_size. */
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_read(zio, zio->io_spa, gbp,
+ (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+
+ return (zio_wait_for_children_done(zio));
+}
+
+/*
+ * Pipeline stage: issue one child rewrite per gang member, each covering
+ * its slice of io_data. The gang header is popped for the walk and then
+ * pushed back on the transform stack; this zio then waits for its
+ * children to become ready.
+ */
+static int
+zio_rewrite_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ ASSERT(gsize == gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
+ zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
+ }
+
+ /* The header is kept (not freed): put it back on the stack. */
+ zio_push_transform(zio, gbh, gsize, gbufsize);
+
+ return (zio_wait_for_children_ready(zio));
+}
+
+/*
+ * Pipeline stage: issue a child free for every non-hole block pointer in
+ * the gang header, then release the header buffer.
+ */
+static int
+zio_free_gang_members(zio_t *zio)
+{
+	zio_gbh_phys_t *header;
+	uint64_t hdr_size, hdr_bufsize;
+	int slot;
+
+	ASSERT(BP_IS_GANG(zio->io_bp));
+
+	zio_gang_byteswap(zio);
+	zio_pop_transform(zio, (void **)&header, &hdr_size, &hdr_bufsize);
+
+	for (slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
+		blkptr_t *member = &header->zg_blkptr[slot];
+
+		/* Unused header slots are holes; there is nothing to free. */
+		if (!BP_IS_HOLE(member)) {
+			zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+			    member, NULL, NULL));
+		}
+	}
+
+	zio_buf_free(header, hdr_bufsize);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: issue a child claim for every non-hole block pointer in
+ * the gang header, then release the header buffer.
+ */
+static int
+zio_claim_gang_members(zio_t *zio)
+{
+	zio_gbh_phys_t *header;
+	uint64_t hdr_size, hdr_bufsize;
+	int slot;
+
+	ASSERT(BP_IS_GANG(zio->io_bp));
+
+	zio_gang_byteswap(zio);
+	zio_pop_transform(zio, (void **)&header, &hdr_size, &hdr_bufsize);
+
+	for (slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
+		blkptr_t *member = &header->zg_blkptr[slot];
+
+		/* Unused header slots are holes; there is nothing to claim. */
+		if (!BP_IS_HOLE(member)) {
+			zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
+			    member, NULL, NULL));
+		}
+	}
+
+	zio_buf_free(header, hdr_bufsize);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * io_done callback for the gang member writes issued by
+ * zio_write_allocate_gang_members(): adds the member's per-DVA allocated
+ * size to the matching DVA of the parent's bp, under the parent's io_lock.
+ */
+static void
+zio_write_allocate_gang_member_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
+ uint64_t asize;
+ int d;
+
+ ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+
+ mutex_enter(&pio->io_lock);
+ for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Write 'zio' as a gang block: allocate the gang header, then carve the
+ * data into members, halving the attempted member size on ENOSPC down to
+ * SPA_MINBLOCKSIZE. Each member is issued as a child write whose io_done
+ * callback folds its allocated size into this zio's bp. On allocation
+ * failure io_error is set and ZIO_PIPELINE_CONTINUE is returned; on
+ * success the zio waits for its children to be done.
+ */
+static int
+zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ zio_gbh_phys_t *gbh;
+ uint64_t txg = zio->io_txg;
+ uint64_t resid = zio->io_size;
+ uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
+ uint64_t gsize, loff, lsize;
+ uint32_t gbps_left;
+ int ndvas = zio->io_ndvas;
+ int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int error;
+ int i, d;
+
+ gsize = SPA_GANGBLOCKSIZE;
+ gbps_left = SPA_GBH_NBLKPTRS;
+
+ /* Allocate space for the gang header itself. */
+ error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
+ B_FALSE);
+ if (error) {
+ zio->io_error = error;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ for (d = 0; d < gbh_ndvas; d++)
+ DVA_SET_GANG(&dva[d], 1);
+
+ bp->blk_birth = txg;
+
+ gbh = zio_buf_alloc(gsize);
+ bzero(gbh, gsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size;
+ loff += lsize, resid -= lsize, gbps_left--, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ dva = gbp->blk_dva;
+
+ ASSERT(gbps_left != 0);
+ maxalloc = MIN(maxalloc, resid);
+
+ /*
+ * Only try a direct allocation while maxalloc-sized members
+ * could still cover the remaining data in the slots left.
+ */
+ while (resid <= maxalloc * gbps_left) {
+ error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
+ txg, bp, B_FALSE);
+ if (error == 0)
+ break;
+ ASSERT3U(error, ==, ENOSPC);
+ /* XXX - free up previous allocations? */
+ /*
+ * NOTE(review): on this return path gbh has been
+ * allocated but is neither freed nor pushed on the
+ * transform stack, and children issued in earlier
+ * iterations still point at its blkptrs -- confirm how
+ * (or whether) it is reclaimed.
+ */
+ if (maxalloc == SPA_MINBLOCKSIZE) {
+ zio->io_error = error;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
+ }
+
+ if (resid <= maxalloc * gbps_left) {
+ /* Space was allocated above: write the member in place. */
+ lsize = maxalloc;
+ BP_SET_LSIZE(gbp, lsize);
+ BP_SET_PSIZE(gbp, lsize);
+ BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
+ gbp->blk_birth = txg;
+ zio_nowait(zio_rewrite(zio, spa,
+ zio->io_checksum, txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ &zio->io_bookmark));
+ } else {
+ /* Too large for direct members: let the child allocate. */
+ lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
+ ASSERT(lsize != SPA_MINBLOCKSIZE);
+ zio_nowait(zio_write_allocate(zio, spa,
+ zio->io_checksum, txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT));
+ }
+ }
+
+ ASSERT(resid == 0 && loff == zio->io_size);
+
+ zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
+
+ zio_push_transform(zio, gbh, gsize, gsize);
+
+ /*
+ * As much as we'd like this to be 'ready' instead of 'done',
+ * updating our ASIZE doesn't happen until the io_done callback,
+ * so we have to wait for that to finish in order for our BP
+ * to be stable.
+ */
+ return (zio_wait_for_children_done(zio));
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+/*
+ * Pipeline stage: allocate space for a write from the pool's normal
+ * class. On success the bp's birth txg is set. If the allocation fails
+ * with ENOSPC and the block is larger than SPA_MINBLOCKSIZE, fall back to
+ * writing it as a gang block; any other failure is stored in io_error.
+ */
+static int
+zio_dva_allocate(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	metaslab_class_t *mc = spa->spa_normal_class;
+	blkptr_t *bp = zio->io_bp;
+	int error;
+
+	ASSERT(BP_IS_HOLE(bp));
+	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+	ASSERT3U(zio->io_ndvas, >, 0);
+	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
+
+	/*
+	 * Test hook: force a retry now and then. Retries are limited to
+	 * the early sync passes because I/Os in later passes are
+	 * non-allocating writes.
+	 */
+	if (zio_io_fail_shift &&
+	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
+	    zio_io_should_fail(zio_io_fail_shift))
+		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
+
+	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
+	    zio->io_txg, NULL, B_FALSE);
+
+	/* Out of contiguous space for a splittable block: gang it. */
+	if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
+		return (zio_write_allocate_gang_members(zio, mc));
+
+	if (error != 0)
+		zio->io_error = error;
+	else
+		bp->blk_birth = zio->io_txg;
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: free the block described by the bp in the zio's txg and
+ * then zero the bp.
+ */
+static int
+zio_dva_free(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
+
+ BP_ZERO(bp);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: claim the block described by the bp; the result of
+ * metaslab_claim() becomes the zio's io_error.
+ */
+static int
+zio_dva_claim(zio_t *zio)
+{
+ zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+
+/*
+ * Pipeline stage: hand the I/O to the vdev layer. Writes are parked
+ * while the pool is in the I/O failure state. A zio with no vdev goes to
+ * the mirror ops. Otherwise the offset is adjusted past the label area
+ * for leaf vdevs, sizes that are not a multiple of the top-level vdev's
+ * alignment are bounced through a padded buffer, and the vdev's own
+ * io_start op is called.
+ */
+static int
+zio_vdev_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t align;
+ spa_t *spa = zio->io_spa;
+
+ /*
+ * If the pool is already in a failure state then just suspend
+ * this IO until the problem is resolved. We will reissue them
+ * at that time.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
+ zio->io_type == ZIO_TYPE_WRITE)
+ return (zio_vdev_suspend_io(zio));
+
+ /*
+ * The mirror_ops handle multiple DVAs in a single BP
+ */
+ if (vd == NULL)
+ return (vdev_mirror_ops.vdev_op_io_start(zio));
+
+ align = 1ULL << tvd->vdev_ashift;
+
+ /* First attempt on a top-level vdev: ask for fail-fast behavior. */
+ if (zio->io_retries == 0 && vd == tvd)
+ zio->io_flags |= ZIO_FLAG_FAILFAST;
+
+ /* Leaf vdev, offset not yet physical: skip the front label area. */
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
+ zio->io_flags |= ZIO_FLAG_PHYSICAL;
+ zio->io_offset += VDEV_LABEL_START_SIZE;
+ }
+
+ /*
+ * Unaligned size: bounce through a buffer rounded up to the
+ * alignment. Writes copy the data in and zero the padding;
+ * zio_vdev_io_assess() pops the buffer again.
+ */
+ if (P2PHASE(zio->io_size, align) != 0) {
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
+ char *abuf = zio_buf_alloc(asize);
+ ASSERT(vd == tvd);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ bcopy(zio->io_data, abuf, zio->io_size);
+ bzero(abuf + zio->io_size, asize - zio->io_size);
+ }
+ zio_push_transform(zio, abuf, asize, asize);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
+ zio->io_flags |= ZIO_FLAG_SUBBLOCK;
+ }
+
+ ASSERT(P2PHASE(zio->io_offset, align) == 0);
+ ASSERT(P2PHASE(zio->io_size, align) == 0);
+ ASSERT(bp == NULL ||
+ P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
+ ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+
+ return (vd->vdev_ops->vdev_op_io_start(zio));
+}
+
+/*
+ * Pipeline stage: run the vdev layer's completion handler. A zio with no
+ * vdev is handled by the mirror ops, matching zio_vdev_io_start().
+ */
+static int
+zio_vdev_io_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	if (vd != NULL)
+		return (vd->vdev_ops->vdev_op_io_done(zio));
+
+	return (vdev_mirror_ops.vdev_op_io_done(zio));
+}
+
+/* XXPOLICY */
+/*
+ * Decide whether a failed vdev I/O gets one more attempt. Returns B_TRUE
+ * only when the zio has an error, has no delegates, targets a top-level
+ * vdev (or no vdev at all), does not carry ZIO_FLAG_DONT_RETRY and has
+ * not been retried before.
+ */
+boolean_t
+zio_should_retry(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	if (zio->io_error != 0 &&
+	    zio->io_delegate_list == NULL &&
+	    !(vd && vd != vd->vdev_top) &&
+	    !(zio->io_flags & ZIO_FLAG_DONT_RETRY) &&
+	    !(zio->io_retries > 0))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Pipeline stage run after the vdev I/O has completed. Tears down the
+ * alignment bounce buffer (copying read data back to the caller's
+ * buffer), applies fault injection when enabled, and, if
+ * zio_should_retry() agrees, rewinds the zio so that the vdev I/O is
+ * issued once more. Always returns ZIO_PIPELINE_CONTINUE.
+ */
+static int
+zio_vdev_io_assess(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = (vd != NULL) ? vd->vdev_top : NULL;
+
+	ASSERT(zio->io_vsd == NULL);
+
+	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
+		void *bounce;
+		uint64_t bounce_size;
+
+		ASSERT(vd == tvd);
+		zio_pop_transform(zio, &bounce, &bounce_size, &bounce_size);
+		if (zio->io_type == ZIO_TYPE_READ)
+			bcopy(bounce, zio->io_data, zio->io_size);
+		zio_buf_free(bounce, bounce_size);
+		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
+	}
+
+	if (zio_injection_enabled && zio->io_error == 0)
+		zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+	/*
+	 * If the I/O failed, determine whether we should attempt to
+	 * retry it.
+	 */
+	/* XXPOLICY */
+	if (zio_should_retry(zio)) {
+		ASSERT(tvd == vd);
+
+		zio->io_retries++;
+		zio->io_error = 0;
+		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
+		/* XXPOLICY */
+		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+		/* zio_execute() pre-increments, so VDEV_IO_START runs next. */
+		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Step io_stage back by one from ZIO_STAGE_VDEV_IO_START. Because
+ * zio_execute() pre-increments the stage, the next zio_execute() on this
+ * zio considers VDEV_IO_START again.
+ */
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_stage--;
+}
+
+/*
+ * Step io_stage back by one from ZIO_STAGE_VDEV_IO_DONE. Because
+ * zio_execute() pre-increments the stage, the next zio_execute() on this
+ * zio considers VDEV_IO_DONE again.
+ */
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage--;
+}
+
+/*
+ * Skip the remainder of the vdev I/O for this zio: flag it
+ * ZIO_FLAG_IO_BYPASS and position io_stage so that zio_execute(), which
+ * pre-increments the stage, considers ZIO_STAGE_VDEV_IO_ASSESS next.
+ */
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+/*
+ * Pipeline stage: record the checksum type and host byte order in the bp
+ * and compute the checksum of io_data into bp->blk_cksum.
+ */
+static int
+zio_checksum_generate(zio_t *zio)
+{
+ int checksum = zio->io_checksum;
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: checksum a gang header. The verifier derived from the
+ * bp is stored in the header's tail before the gang-header checksum is
+ * computed over the header.
+ */
+static int
+zio_gang_checksum_generate(zio_t *zio)
+{
+ zio_cksum_t zc;
+ zio_gbh_phys_t *gbh = zio->io_data;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
+
+ /*
+ * NOTE(review): zc is not read again in this function; presumably
+ * zio_checksum() stores the result in the header's tail for this
+ * checksum type -- confirm against zio_checksum().
+ */
+ zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Pipeline stage: verify the checksum of the data just read. A zio
+ * without a bp has nothing to verify. A checksum failure is stored in
+ * io_error and, unless the I/O is speculative, reported as a checksum
+ * ereport.
+ */
+static int
+zio_checksum_verify(zio_t *zio)
+{
+	if (zio->io_bp == NULL)
+		return (ZIO_PIPELINE_CONTINUE);
+
+	zio->io_error = zio_checksum_error(zio);
+
+	if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+		    zio->io_spa, zio->io_vd, zio, 0, 0);
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice:
+ * removes the checksum-verify stage from this zio's pipeline.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+}
+
+/*
+ * Set the external verifier for a gang block based on stuff in the bp:
+ * the vdev and offset of the bp's identity DVA, and the bp's birth txg.
+ * The fourth word is zeroed.
+ */
+void
+zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
+ zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
+ zcp->zc_word[2] = bp->blk_birth;
+ zcp->zc_word[3] = 0;
+}
+
+/*
+ * ==========================================================================
+ * Define the pipeline
+ * ==========================================================================
+ */
+/*
+ * A pipeline stage takes the zio and returns ZIO_PIPELINE_CONTINUE or
+ * ZIO_PIPELINE_STOP.
+ */
+typedef int zio_pipe_stage_t(zio_t *zio);
+
+/*
+ * Stage dispatch table, indexed by io_stage: zio_execute() calls
+ * zio_pipeline[zio->io_stage]. The first and last slots are NULL.
+ * The entries are presumably listed in the same order as the ZIO_STAGE_*
+ * constants -- verify against their definition when adding a stage.
+ */
+zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
+ NULL,
+ zio_wait_for_children_ready,
+ zio_read_init,
+ zio_issue_async,
+ zio_write_compress,
+ zio_checksum_generate,
+ zio_get_gang_header,
+ zio_rewrite_gang_members,
+ zio_free_gang_members,
+ zio_claim_gang_members,
+ zio_dva_allocate,
+ zio_dva_free,
+ zio_dva_claim,
+ zio_gang_checksum_generate,
+ zio_ready,
+ zio_vdev_io_start,
+ zio_vdev_io_done,
+ zio_vdev_io_assess,
+ zio_wait_for_children_done,
+ zio_checksum_verify,
+ zio_read_gang_members,
+ zio_read_decompress,
+ zio_assess,
+ zio_done,
+ NULL
+};
+
+/*
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread. In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
+ */
+void
+zio_execute(zio_t *zio)
+{
+ while (zio->io_stage < ZIO_STAGE_DONE) {
+ uint32_t pipeline = zio->io_pipeline;
+ int rv;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+ /*
+ * If an error occurred outside the vdev stack,
+ * just execute the interlock stages to clean up.
+ */
+ if (zio->io_error &&
+ ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
+ pipeline &= ZIO_ERROR_PIPELINE_MASK;
+
+ /*
+ * Advance to the next stage whose bit is set in the pipeline
+ * mask. The pre-increment is why callers that want stage S
+ * to run next set io_stage to S - 1.
+ */
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;
+
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
+
+ rv = zio_pipeline[zio->io_stage](zio);
+
+ /* The stage took over the zio; do not touch it again here. */
+ if (rv == ZIO_PIPELINE_STOP)
+ return;
+
+ ASSERT(rv == ZIO_PIPELINE_CONTINUE);
+ }
+}
+
+/*
+ * Failure-injection helper: keeps a call counter and reports a failure
+ * whenever the counter value for this call is a multiple of 2^range.
+ * The counter is a function-local static updated without locking.
+ */
+static boolean_t
+zio_io_should_fail(uint16_t range)
+{
+	static uint16_t allocs = 0;
+	uint16_t seq = allocs++;
+
+	return (P2PHASE(seq, 1U<<range) == 0);
+}
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ * The log class is tried first, then the normal class. On success new_bp
+ * is filled in as an uncompressed, level-0 intent log block of 'size'
+ * bytes born in 'txg'. The config lock is held as reader for the duration.
+ */
+int
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t txg)
+{
+ int error;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ /* Failure-injection hook: pretend the pool is out of space. */
+ if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
+ spa_config_exit(spa, FTAG);
+ return (ENOSPC);
+ }
+
+ /*
+ * We were passed the previous log block's DVA in old_bp->blk_dva[0].
+ * We use that as a hint for which vdev to allocate from next.
+ */
+ error = metaslab_alloc(spa, spa->spa_log_class, size,
+ new_bp, 1, txg, old_bp, B_TRUE);
+
+ if (error)
+ error = metaslab_alloc(spa, spa->spa_normal_class, size,
+ new_bp, 1, txg, old_bp, B_TRUE);
+
+ if (error == 0) {
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+ new_bp->blk_birth = txg;
+ }
+
+ spa_config_exit(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Free an intent log block. We know it can't be a gang block, so there's
+ * nothing to do except metaslab_free() it. The config lock is held as
+ * reader around the free.
+ */
+void
+zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(!BP_IS_GANG(bp));
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ metaslab_free(spa, bp, txg, B_FALSE);
+
+ spa_config_exit(spa, FTAG);
+}
+
+/*
+ * Start an async flush of the write cache for this vdev. The flush is
+ * issued as a child ioctl (DKIOCFLUSHWRITECACHE) of 'zio' that is allowed
+ * to fail and is marked ZIO_FLAG_DONT_RETRY.
+ */
+void
+zio_flush(zio_t *zio, vdev_t *vd)
+{
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+}
diff --git a/zfs/lib/libzpool/zio_checksum.c b/zfs/lib/libzpool/zio_checksum.c
new file mode 100644
index 000000000..f79254ef6
--- /dev/null
+++ b/zfs/lib/libzpool/zio_checksum.c
@@ -0,0 +1,172 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zio_checksum.c 1.6 06/11/10 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ * It is likely that in future hash functions will be at least as strong
+ * as current best-of-breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in five bits of the DVA.
+ * This gives us room for up to 32 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified in the DVA encoding.
+ */
+
+/*
+ * Checksum function for the "off" setting: the buffer and its size are
+ * ignored and an all-zero checksum is stored in *zcp.
+ */
+/*ARGSUSED*/
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*
+ * Per-algorithm checksum descriptors, indexed by checksum function number
+ * (zio_checksum() and zio_checksum_error() look entries up by that value,
+ * bounded by ZIO_CHECKSUM_FUNCTIONS).
+ *
+ * Each entry holds a pair of checksum functions -- element 0 is used for
+ * data in native byte order, element 1 for byteswapped data -- followed by
+ * two flags and the algorithm's name.  Entries whose functions are NULL
+ * ("inherit" and "on") cannot be used to compute a checksum directly.
+ *
+ * NOTE(review): the two integer columns presumably initialize
+ * ci_correctable and ci_zbt, in that order, and the row order presumably
+ * follows the ZIO_CHECKSUM_* constants; confirm against
+ * sys/zio_checksum.h and sys/zio.h.
+ */
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
+};
+
+/*
+ * Resolve the checksum function a child should actually use.
+ *
+ * A child set to "inherit" takes the parent's setting, a child set to
+ * "on" takes ZIO_CHECKSUM_ON_VALUE, and any other child value is used as
+ * is.  The parent value must already be resolved, i.e. it may be neither
+ * "inherit" nor "on".
+ */
+uint8_t
+zio_checksum_select(uint8_t child, uint8_t parent)
+{
+	uint8_t selected = child;
+
+	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+	if (child == ZIO_CHECKSUM_INHERIT)
+		selected = parent;
+	else if (child == ZIO_CHECKSUM_ON)
+		selected = ZIO_CHECKSUM_ON_VALUE;
+
+	return (selected);
+}
+
+/*
+ * Generate the checksum of 'size' bytes at 'data' using checksum
+ * function number 'checksum'.
+ *
+ * For ordinary checksums the result is written to *zcp.
+ *
+ * For checksums that keep their value in a block tail (ci_zbt set), the
+ * tail occupies the last sizeof (zio_block_tail_t) bytes of the buffer.
+ * The value already present in the tail's checksum field is copied out to
+ * *zcp, the tail magic is set, and the checksum is then computed over the
+ * whole buffer and stored into the tail itself.
+ */
+void
+zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
+{
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(ci->ci_func[0] != NULL);
+
+	if (ci->ci_zbt) {
+		zio_block_tail_t *tail =
+		    (zio_block_tail_t *)((char *)data + size) - 1;
+		zio_cksum_t computed;
+
+		*zcp = tail->zbt_cksum;
+		tail->zbt_magic = ZBT_MAGIC;
+		/*
+		 * Compute into a local first: the tail being updated is
+		 * part of the data being checksummed.
+		 */
+		ci->ci_func[0](data, size, &computed);
+		tail->zbt_cksum = computed;
+	} else {
+		ci->ci_func[0](data, size, zcp);
+	}
+}
+
+/*
+ * Verify the checksum of the data attached to a zio against its block
+ * pointer.
+ *
+ * Gang blocks are always checked with the gang-header checksum; all other
+ * blocks use the checksum function recorded in the block pointer.
+ *
+ * For checksums kept in a block tail (ci_zbt set), the expected value is
+ * the one stored in the tail.  While the checksum is recomputed the tail's
+ * checksum field temporarily holds the verifier 'zc' (the block pointer's
+ * checksum, or the gang verifier for gang headers) and is restored
+ * afterwards.  A tail whose magic is the byteswapped ZBT_MAGIC is handled
+ * with the byteswapping checksum function and byteswapped tail values.
+ *
+ * Returns:
+ *	EINVAL	the checksum function number is out of range or has no
+ *		implementation
+ *	ECKSUM	the computed checksum does not match the expected one
+ *	0	the checksum matches; if fault injection is enabled and the
+ *		zio has no error yet, the result of
+ *		zio_handle_fault_injection() is returned instead
+ */
+int
+zio_checksum_error(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	zio_cksum_t zc = bp->blk_cksum;
+	uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
+	    BP_GET_CHECKSUM(bp);
+	int byteswap = BP_SHOULD_BYTESWAP(bp);
+	void *data = zio->io_data;
+	uint64_t size = ZIO_GET_IOSIZE(zio);
+	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+	zio_checksum_info_t *ci;
+	zio_cksum_t actual_cksum, expected_cksum;
+
+	/*
+	 * The checksum number comes from the block pointer and may be
+	 * garbage, so range-check it before using it to index the table.
+	 */
+	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
+		return (EINVAL);
+
+	ci = &zio_checksum_table[checksum];
+	if (ci->ci_func[0] == NULL)
+		return (EINVAL);
+
+	if (ci->ci_zbt) {
+		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+			zio_set_gang_verifier(zio, &zc);
+
+		if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+			/* Block was written with the opposite byte order. */
+			expected_cksum = zbt->zbt_cksum;
+			byteswap_uint64_array(&expected_cksum,
+			    sizeof (zio_cksum_t));
+			zbt->zbt_cksum = zc;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+			ci->ci_func[1](data, size, &actual_cksum);
+			/* Put the original on-disk tail checksum back. */
+			zbt->zbt_cksum = expected_cksum;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+		} else {
+			expected_cksum = zbt->zbt_cksum;
+			zbt->zbt_cksum = zc;
+			ci->ci_func[0](data, size, &actual_cksum);
+			/* Put the original on-disk tail checksum back. */
+			zbt->zbt_cksum = expected_cksum;
+		}
+		zc = expected_cksum;
+	} else {
+		ASSERT(!BP_IS_GANG(bp));
+		ci->ci_func[byteswap](data, size, &actual_cksum);
+	}
+
+	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
+		return (ECKSUM);
+
+	if (zio_injection_enabled && !zio->io_error)
+		return (zio_handle_fault_injection(zio, ECKSUM));
+
+	return (0);
+}
diff --git a/zfs/lib/libzpool/zio_compress.c b/zfs/lib/libzpool/zio_compress.c
new file mode 100644
index 000000000..190c4ead8
--- /dev/null
+++ b/zfs/lib/libzpool/zio_compress.c
@@ -0,0 +1,148 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zio_compress.c 1.4 07/03/22 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Compression vectors.
+ *
+ * One descriptor per compression function number, looked up by
+ * zio_compress_data() and zio_decompress_data().  Each entry lists the
+ * compress routine, the decompress routine, the level passed to both as
+ * their last argument (ci_level), and the algorithm's name.  The gzip
+ * entries share one pair of routines and differ only in level.
+ *
+ * Entries with NULL routines ("inherit", "on", "uncompressed", "empty")
+ * have no compressor or decompressor of their own.
+ *
+ * NOTE(review): the row order presumably follows the ZIO_COMPRESS_*
+ * constants; confirm against sys/zio.h.
+ */
+
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {NULL, NULL, 0, "inherit"},
+ {NULL, NULL, 0, "on"},
+ {NULL, NULL, 0, "uncompressed"},
+ {lzjb_compress, lzjb_decompress, 0, "lzjb"},
+ {NULL, NULL, 0, "empty"},
+ {gzip_compress, gzip_decompress, 1, "gzip-1"},
+ {gzip_compress, gzip_decompress, 2, "gzip-2"},
+ {gzip_compress, gzip_decompress, 3, "gzip-3"},
+ {gzip_compress, gzip_decompress, 4, "gzip-4"},
+ {gzip_compress, gzip_decompress, 5, "gzip-5"},
+ {gzip_compress, gzip_decompress, 6, "gzip-6"},
+ {gzip_compress, gzip_decompress, 7, "gzip-7"},
+ {gzip_compress, gzip_decompress, 8, "gzip-8"},
+ {gzip_compress, gzip_decompress, 9, "gzip-9"},
+};
+
+/*
+ * Resolve the compression function a child should actually use.
+ *
+ * A child set to "inherit" takes the parent's setting, a child set to
+ * "on" takes ZIO_COMPRESS_ON_VALUE, and any other child value is used as
+ * is.  The parent value must already be resolved, i.e. it may be neither
+ * "inherit" nor "on".
+ */
+uint8_t
+zio_compress_select(uint8_t child, uint8_t parent)
+{
+	uint8_t selected = child;
+
+	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+
+	if (child == ZIO_COMPRESS_INHERIT)
+		selected = parent;
+	else if (child == ZIO_COMPRESS_ON)
+		selected = ZIO_COMPRESS_ON_VALUE;
+
+	return (selected);
+}
+
+/*
+ * Try to compress 'srcsize' bytes at 'src' with compression function
+ * 'cpfunc'.
+ *
+ * Returns 1 when the caller should use the outputs:
+ *   - the source is entirely zero: *destp is NULL and both sizes are 0,
+ *     meaning no block needs to be allocated at all; or
+ *   - compression succeeded: *destp is a zio_buf_alloc()ed buffer of
+ *     *destbufsizep bytes holding *destsizep bytes of output, where
+ *     *destsizep is a multiple of SPA_MINBLOCKSIZE and any padding has
+ *     been zeroed.
+ *
+ * Returns 0, leaving the outputs untouched, when the data is not all zero
+ * and either cpfunc is ZIO_COMPRESS_EMPTY, the source is too small for any
+ * saving, or the compressor could not fit its output in the target size
+ * (at least 12.5% smaller than the source).
+ */
+int
+zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
+    uint64_t *destsizep, uint64_t *destbufsizep)
+{
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	uint64_t *scan, *scan_end;
+	uint64_t ciosize, padded, destbufsize;
+	char *outbuf;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+
+	/*
+	 * Look for the first nonzero 64-bit word.  Running off the end
+	 * without finding one means the data is all zeroes, which is
+	 * signalled by setting *destsizep = 0 with no buffer.
+	 */
+	scan_end = (uint64_t *)(uintptr_t)((uintptr_t)src + srcsize);
+	for (scan = src; scan < scan_end; scan++) {
+		if (*scan != 0)
+			break;
+	}
+	if (scan >= scan_end) {
+		*destp = NULL;
+		*destsizep = 0;
+		*destbufsizep = 0;
+		return (1);
+	}
+
+	if (cpfunc == ZIO_COMPRESS_EMPTY)
+		return (0);
+
+	/* Compress at least 12.5% */
+	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
+	if (destbufsize == 0)
+		return (0);
+	outbuf = zio_buf_alloc(destbufsize);
+
+	ciosize = ci->ci_compress(src, outbuf, (size_t)srcsize,
+	    (size_t)destbufsize, ci->ci_level);
+	if (ciosize > destbufsize) {
+		/* Not enough saving; give the buffer back. */
+		zio_buf_free(outbuf, destbufsize);
+		return (0);
+	}
+
+	/*
+	 * The output fit.  Round it up to a whole number of minimum-size
+	 * blocks and zero the padding so that stale heap contents are
+	 * never written to disk.
+	 */
+	padded = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE);
+	if (padded != ciosize) {
+		bzero(outbuf + ciosize, padded - ciosize);
+		ciosize = padded;
+	}
+
+	ASSERT3U(ciosize, <=, destbufsize);
+	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
+	*destp = outbuf;
+	*destsizep = ciosize;
+	*destbufsizep = destbufsize;
+
+	return (1);
+}
+
+/*
+ * Decompress 'srcsize' bytes at 'src' into the 'destsize'-byte buffer at
+ * 'dest' using compression function number 'cpfunc'.
+ *
+ * Returns the decompressor's own result, or EINVAL when 'cpfunc' is out
+ * of range or names a function with no decompressor ("inherit", "on",
+ * "uncompressed", "empty").  The function number typically originates
+ * from on-disk state, so it is validated at run time as well as asserted;
+ * without the check a bad value would index past the table or call
+ * through a NULL pointer on non-debug builds.
+ *
+ * NOTE(review): callers are assumed to treat any nonzero return as a
+ * failure -- confirm at the call sites.
+ */
+int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+    void *dest, uint64_t destsize)
+{
+	zio_compress_info_t *ci;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+
+	if ((uint_t)cpfunc >= ZIO_COMPRESS_FUNCTIONS)
+		return (EINVAL);
+
+	ci = &zio_compress_table[cpfunc];
+	if (ci->ci_decompress == NULL)
+		return (EINVAL);
+
+	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+}
diff --git a/zfs/lib/libzpool/zio_inject.c b/zfs/lib/libzpool/zio_inject.c
new file mode 100644
index 000000000..30a938816
--- /dev/null
+++ b/zfs/lib/libzpool/zio_inject.c
@@ -0,0 +1,315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zio_inject.c 1.2 07/12/07 SMI"
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
+
+#include <sys/arc.h>
+#include <sys/zio_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+
+/*
+ * Number of registered injection handlers: incremented by
+ * zio_inject_fault() and decremented by zio_clear_fault().  Other code
+ * tests it for nonzero before calling into the injection routines.
+ */
+uint32_t zio_injection_enabled;
+
+/*
+ * One registered fault-injection request.
+ */
+typedef struct inject_handler {
+ int zi_id; /* identifier handed back to the requester */
+ spa_t *zi_spa; /* pool the fault applies to (injection ref held) */
+ zinject_record_t zi_record; /* private copy of the caller's record */
+ list_node_t zi_link; /* linkage on inject_handlers */
+} inject_handler_t;
+
+/* All registered handlers, in insertion order. */
+static list_t inject_handlers;
+/* Held around every traversal or modification of inject_handlers. */
+static krwlock_t inject_lock;
+/* Source of zi_id values; advanced under inject_lock held as writer. */
+static int inject_next_id = 1;
+
+/*
+ * Decide whether an injection record applies to the I/O in progress.
+ *
+ * 'zb' is the bookmark of the logical I/O, 'type' the object type of the
+ * block being read, and 'error' the error the caller is prepared to
+ * inject.  When the record is otherwise applicable, a record with a
+ * nonzero zi_freq only matches with that percentage probability.
+ */
+static boolean_t
+zio_match_handler(zbookmark_t *zb, uint64_t type,
+    zinject_record_t *record, int error)
+{
+	boolean_t applies;
+
+	if (zb->zb_objset == 0 && record->zi_objset == 0 &&
+	    record->zi_object == 0) {
+		/*
+		 * MOS match: decided purely by object type, where
+		 * DMU_OT_NONE in the record matches any type.
+		 */
+		applies = (record->zi_type == DMU_OT_NONE ||
+		    type == record->zi_type);
+	} else {
+		/*
+		 * Exact match: same objset, object and level, block id
+		 * inside [zi_start, zi_end], and the same error.
+		 */
+		applies = (zb->zb_objset == record->zi_objset &&
+		    zb->zb_object == record->zi_object &&
+		    zb->zb_level == record->zi_level &&
+		    zb->zb_blkid >= record->zi_start &&
+		    zb->zb_blkid <= record->zi_end &&
+		    error == record->zi_error);
+	}
+
+	if (!applies)
+		return (B_FALSE);
+
+	return (record->zi_freq == 0 ||
+	    spa_get_random(100) < record->zi_freq);
+}
+
+/*
+ * Determine whether the given I/O should be made to fail.
+ *
+ * Returns 'error' if some data-level handler registered for this zio's
+ * pool matches the I/O, and 0 otherwise.  Only reads that are associated
+ * with logical data are ever considered.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+	inject_handler_t *cur;
+	int injected = 0;
+
+	/*
+	 * I/O with no logical parent is ignored, and fault injection is
+	 * currently supported on reads only.
+	 */
+	if (zio->io_logical == NULL || zio->io_type != ZIO_TYPE_READ)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (cur = list_head(&inject_handlers); cur != NULL;
+	    cur = list_next(&inject_handlers, cur)) {
+		/*
+		 * Skip handlers that belong to another pool, and
+		 * device-level handlers (those with a nonzero zi_guid).
+		 */
+		if (zio->io_spa != cur->zi_spa ||
+		    cur->zi_record.zi_guid != 0)
+			continue;
+
+		/* The first matching handler decides the outcome. */
+		if (zio_match_handler(&zio->io_logical->io_bookmark,
+		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+		    &cur->zi_record, error)) {
+			injected = error;
+			break;
+		}
+	}
+
+	rw_exit(&inject_lock);
+
+	return (injected);
+}
+
+/*
+ * Determine whether an operation on vdev 'vd' should be made to fail with
+ * 'error'.
+ *
+ * The first handler whose zi_guid equals the vdev's guid and whose record
+ * produces a result wins:
+ *   - a record for exactly this error returns 'error' (and, for ENXIO,
+ *     also marks the vdev as having failed to open);
+ *   - a record for ENXIO makes every other error come back as EIO.
+ * Returns 0 when nothing is to be injected.
+ */
+int
+zio_handle_device_injection(vdev_t *vd, int error)
+{
+	inject_handler_t *cur;
+	int injected = 0;
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (cur = list_head(&inject_handlers); cur != NULL;
+	    cur = list_next(&inject_handlers, cur)) {
+		zinject_record_t *rec = &cur->zi_record;
+
+		if (vd->vdev_guid != rec->zi_guid)
+			continue;
+
+		if (rec->zi_error == error) {
+			/*
+			 * For a failed open, pretend like the device
+			 * has gone away.
+			 */
+			if (error == ENXIO) {
+				vd->vdev_stat.vs_aux =
+				    VDEV_AUX_OPEN_FAILED;
+			}
+			injected = error;
+			break;
+		}
+
+		if (rec->zi_error == ENXIO) {
+			injected = EIO;
+			break;
+		}
+	}
+
+	rw_exit(&inject_lock);
+
+	return (injected);
+}
+
+/*
+ * Register a new fault handler for pool 'name' described by 'record'.
+ *
+ * Behaviour is steered by 'flags':
+ *   ZINJECT_UNLOAD_SPA	reset the pool first via spa_reset(), so that the
+ *			next attempt to load it will trigger the fault (used
+ *			for pool-wide metadata); a failure is returned as is
+ *   ZINJECT_NULL	do not register a handler at all
+ *   ZINJECT_FLUSH_ARC	flush the ARC afterwards
+ *
+ * Unless ZINJECT_NULL is given, a copy of the record is appended to the
+ * handler list, its new identifier is stored in *id, and
+ * zio_injection_enabled -- the switch that turns on all fault injection --
+ * is incremented.
+ *
+ * Returns 0 on success, ENOENT if the pool cannot be found, or the error
+ * from spa_reset().
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+	inject_handler_t *entry;
+	spa_t *spa;
+	int error;
+
+	if ((flags & ZINJECT_UNLOAD_SPA) && (error = spa_reset(name)) != 0)
+		return (error);
+
+	if ((flags & ZINJECT_NULL) == 0) {
+		/*
+		 * spa_inject_addref() will add an injection reference,
+		 * which will prevent the pool from being removed from the
+		 * namespace while still allowing it to be unloaded.
+		 */
+		spa = spa_inject_addref(name);
+		if (spa == NULL)
+			return (ENOENT);
+
+		entry = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+		rw_enter(&inject_lock, RW_WRITER);
+
+		entry->zi_id = inject_next_id++;
+		*id = entry->zi_id;
+		entry->zi_spa = spa;
+		entry->zi_record = *record;
+		list_insert_tail(&inject_handlers, entry);
+		atomic_add_32(&zio_injection_enabled, 1);
+
+		rw_exit(&inject_lock);
+	}
+
+	/*
+	 * Flush the ARC, so that any attempts to read this data will end
+	 * up going to the ZIO layer.  This is more than strictly needed,
+	 * but there is no finer-grained ARC interface and fault injection
+	 * isn't a performance critical path.
+	 */
+	if (flags & ZINJECT_FLUSH_ARC)
+		arc_flush(NULL);
+
+	return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ *
+ * On success (return 0) *record receives a copy of the handler's record,
+ * *id is advanced to the handler's ID, and the pool name is copied into
+ * 'name', a buffer of 'buflen' bytes.  The name is truncated if necessary
+ * and, provided buflen is nonzero, is always NUL-terminated.  Returns
+ * ENOENT when no handler has a larger ID.
+ *
+ * spa_namespace_lock is taken before inject_lock.
+ * NOTE(review): the namespace lock is presumably what keeps spa_name()
+ * stable here -- confirm.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+    zinject_record_t *record)
+{
+	inject_handler_t *handler;
+	int ret;
+
+	mutex_enter(&spa_namespace_lock);
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler))
+		if (handler->zi_id > *id)
+			break;
+
+	if (handler) {
+		*record = handler->zi_record;
+		*id = handler->zi_id;
+		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
+		/*
+		 * strncpy() leaves the buffer unterminated when the source
+		 * has buflen or more characters; terminate it explicitly.
+		 */
+		if (buflen > 0)
+			name[buflen - 1] = '\0';
+		ret = 0;
+	} else {
+		ret = ENOENT;
+	}
+
+	rw_exit(&inject_lock);
+	mutex_exit(&spa_namespace_lock);
+
+	return (ret);
+}
+
+/*
+ * Remove the fault handler whose identifier is 'id'.
+ *
+ * On success the handler is unlinked from the list, its injection
+ * reference on the pool is dropped, its memory is freed, and
+ * zio_injection_enabled is decremented; 0 is returned.  Returns ENOENT if
+ * no handler has that identifier.
+ */
+int
+zio_clear_fault(int id)
+{
+	inject_handler_t *victim;
+
+	rw_enter(&inject_lock, RW_WRITER);
+
+	for (victim = list_head(&inject_handlers); victim != NULL;
+	    victim = list_next(&inject_handlers, victim)) {
+		if (victim->zi_id == id)
+			break;
+	}
+
+	if (victim == NULL) {
+		rw_exit(&inject_lock);
+		return (ENOENT);
+	}
+
+	list_remove(&inject_handlers, victim);
+	spa_inject_delref(victim->zi_spa);
+	kmem_free(victim, sizeof (inject_handler_t));
+	atomic_add_32(&zio_injection_enabled, -1);
+
+	rw_exit(&inject_lock);
+
+	return (0);
+}
+
+/*
+ * Set up the fault-injection subsystem: initialize the reader/writer lock
+ * that guards the handler list, then create the (initially empty) list.
+ * The lock is initialized explicitly rather than relying on the zeroed
+ * static storage happening to be a valid lock.
+ */
+void
+zio_inject_init(void)
+{
+	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+	list_create(&inject_handlers, sizeof (inject_handler_t),
+	    offsetof(inject_handler_t, zi_link));
+}
+
+/*
+ * Tear down the fault-injection subsystem, undoing zio_inject_init() in
+ * reverse order: destroy the handler list, then the lock guarding it.
+ */
+void
+zio_inject_fini(void)
+{
+	list_destroy(&inject_handlers);
+	rw_destroy(&inject_lock);
+}