diff options
Diffstat (limited to 'module/zfs')
119 files changed, 76920 insertions, 0 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c new file mode 100644 index 000000000..73aecb285 --- /dev/null +++ b/module/zfs/arc.c @@ -0,0 +1,4479 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DVA-based Adjustable Replacement Cache + * + * While much of the theory of operation used here is + * based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. + * + * 2. The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. It grows with + * high use, but also tries to react to memory pressure from the + * operating system: decreasing its size when system memory is + * tight. + * + * 3. The Megiddo and Modha model assumes a fixed page size. All + * elements of the cache are therefor exactly the same size. So + * when adjusting the cache size following a cache miss, its simply + * a matter of choosing a single page to evict. In our model, we + * have variable sized cache blocks (rangeing from 512 bytes to + * 128K bytes). We therefor choose a set of blocks to evict to make + * space for a cache miss that approximates as closely as possible + * the space used by the new block. + * + * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" + * by N. Megiddo & D. Modha, FAST 2003 + */ + +/* + * The locking model: + * + * A new reference to a cache buffer can be obtained in two + * ways: 1) via a hash table lookup using the DVA as a key, + * or 2) via one of the ARC lists. The arc_read() interface + * uses method 1, while the internal arc algorithms for + * adjusting the cache use method 2. We therefor provide two + * types of locks: 1) the hash table lock array, and 2) the + * arc list locks. + * + * Buffers do not have their own mutexs, rather they rely on the + * hash table mutexs for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexs). + * + * buf_hash_find() returns the appropriate mutex (held) when it + * locates the requested buffer in the hash table. It returns + * NULL for the mutex if the buffer was not in the table. + * + * buf_hash_remove() expects the appropriate hash mutex to be + * already held before it is invoked. + * + * Each arc state also has a mutex which is used to protect the + * buffer list associated with the state. When attempting to + * obtain a hash table lock while holding an arc list lock you + * must use: mutex_tryenter() to avoid deadlock. Also note that + * the active state mutex must be held before the ghost state mutex. + * + * Arc buffers may have an associated eviction callback function. + * This function will be invoked prior to removing the buffer (e.g. + * in arc_do_user_evicts()). Note however that the data associated + * with the buffer may be evicted prior to the callback. The callback + * must be made with *no locks held* (to prevent deadlock). Additionally, + * the users of callbacks must ensure that their private data is + * protected from simultaneous callbacks from arc_buf_evict() + * and arc_do_user_evicts(). + * + * Note that the majority of the performance stats are manipulated + * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#include <sys/vdev.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <vm/anon.h> +#include <sys/fs/swapnode.h> +#include <sys/dnlc.h> +#endif +#include <sys/callb.h> +#include <sys/kstat.h> + +static kmutex_t arc_reclaim_thr_lock; +static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ +static uint8_t arc_thread_exit; + +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + +#define ARC_REDUCE_DNLC_PERCENT 3 +uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; + +typedef enum arc_reclaim_strategy { + ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ + ARC_RECLAIM_CONS /* Conservative reclaim strategy */ +} arc_reclaim_strategy_t; + +/* number of seconds before growing cache again */ +static int arc_grow_retry = 60; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + +static int arc_dead; + +/* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* + * These tunables are for performance analysis. + */ +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_mdcomp_disable = 0; + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ + kmutex_t arcs_mtx; +} arc_state_t; + +/* The 6 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +static arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + kstat_named_t arcstat_recycle_miss; + kstat_named_t arcstat_mutex_miss; + kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; +} arc_stats_t; + +static arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "recycle_miss", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "evict_skip", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } +}; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)); + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu; +static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. + */ +#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ + +static int arc_no_grow; /* Don't try to grow cache size */ +static uint64_t arc_tempreserve; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_done_func_t *acb_done; + arc_buf_t *acb_buf; + zio_t *acb_zio_dummy; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + uint64_t b_cksum0; + + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_buf_t *b_buf; + uint32_t b_flags; + uint32_t b_datacnt; + + arc_callback_t *b_acb; + kcondvar_t b_cv; + + /* immutable */ + arc_buf_contents_t b_type; + uint64_t b_size; + spa_t *b_spa; + + /* protected by arc state mutex */ + arc_state_t *b_state; + list_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; +}; + +static arc_buf_t *arc_eviction_list; +static kmutex_t arc_eviction_mtx; +static arc_buf_hdr_t arc_eviction_hdr; +static void arc_get_data_buf(arc_buf_t *buf); +static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); + +#define GHOST_STATE(state) \ + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) + +/* + * Private ARC flags. These flags are private ARC only flags that will show up + * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can + * be passed in as arc_flags in things like arc_read. However, these flags + * should never be passed and should only be set by ARC code. When adding new + * public flags, make sure not to smash the private ones. + */ + +#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ +#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ +#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ +#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ +#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ +#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ +#define ARC_STORED (1 << 19) /* has been store()d to */ + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD 64 + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(buf) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) + +uint64_t zfs_crc64_table[256]; + +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + daddr_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + +static uint64_t +buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) +{ + uintptr_t spav = (uintptr_t)spa; + uint8_t *vdva = (uint8_t *)dva; + uint64_t crc = -1ULL; + int i; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + for (i = 0; i < sizeof (dva_t); i++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; + + crc ^= (spav>>8) ^ birth; + + return (crc); +} + +#define BUF_EMPTY(buf) \ + ((buf)->b_dva.dva_word[0] == 0 && \ + (buf)->b_dva.dva_word[1] == 0 && \ + (buf)->b_birth == 0) + +#define BUF_EQUAL(spa, dva, birth, buf) \ + ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((buf)->b_birth == birth) && ((buf)->b_spa == spa) + +static arc_buf_hdr_t * +buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *buf; + + mutex_enter(hash_lock); + for (buf = buf_hash_table.ht_table[idx]; buf != NULL; + buf = buf->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, buf)) { + *lockp = hash_lock; + return (buf); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fbuf; + uint32_t i; + + ASSERT(!HDR_IN_HASH_TABLE(buf)); + *lockp = hash_lock; + mutex_enter(hash_lock); + for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; + fbuf = fbuf->b_hash_next, i++) { + if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) + return (fbuf); + } + + buf->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = buf; + buf->b_flags |= ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *buf) +{ + arc_buf_hdr_t *fbuf, **bufp; + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(buf)); + + bufp = &buf_hash_table.ht_table[idx]; + while ((fbuf = *bufp) != buf) { + ASSERT(fbuf != NULL); + bufp = &fbuf->b_hash_next; + } + *bufp = buf->b_hash_next; + buf->b_hash_next = NULL; + buf->b_flags &= ~ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. + */ +static kmem_cache_t *hdr_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_hdr_t)); + refcount_create(&buf->b_refcnt); + cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. + */ +/* ARGSUSED */ +static void +hdr_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *buf = vbuf; + + refcount_destroy(&buf->b_refcnt); + cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + rw_destroy(&buf->b_lock); +} + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). + */ + if (!arc_dead) + cv_signal(&arc_reclaim_thr_cv); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 64K block size. The table will take up + * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). + */ + while (hsize * 65536 < physmem * PAGESIZE) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), + 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static void +arc_cksum_verify(arc_buf_t *buf) +{ + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || + (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + +static void +arc_cksum_compute(arc_buf_t *buf, boolean_t force) +{ + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, + buf->b_hdr->b_freeze_cksum); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + } + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + buf->b_hdr->b_freeze_cksum = NULL; + } + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_freeze(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || + buf->b_hdr->b_state == arc_anon); + arc_cksum_compute(buf, B_FALSE); +} + +static void +add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if ((refcount_add(&ab->b_refcnt, tag) == 1) && + (ab->b_state != arc_anon)) { + uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; + + ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); + mutex_enter(&ab->b_state->arcs_mtx); + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(list, ab); + if (GHOST_STATE(ab->b_state)) { + ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT3P(ab->b_buf, ==, NULL); + delta = ab->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&ab->b_state->arcs_mtx); + /* remove the prefetch flag if we get a reference */ + if (ab->b_flags & ARC_PREFETCH) + ab->b_flags &= ~ARC_PREFETCH; + } +} + +static int +remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = ab->b_state; + + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(!list_link_active(&ab->b_arc_node)); + list_insert_head(&state->arcs_list[ab->b_type], ab); + ASSERT(ab->b_datacnt > 0); + atomic_add_64(size, ab->b_size * ab->b_datacnt); + mutex_exit(&state->arcs_mtx); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The mutex + * for the buffer must be held by the caller. + */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +{ + arc_state_t *old_state = ab->b_state; + int64_t refcnt = refcount_count(&ab->b_refcnt); + uint64_t from_delta, to_delta; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(new_state != old_state); + ASSERT(refcnt == 0 || ab->b_datacnt > 0); + ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + + from_delta = to_delta = ab->b_datacnt * ab->b_size; + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcnt == 0) { + if (old_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + + if (use_mutex) + mutex_enter(&old_state->arcs_mtx); + + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&old_state->arcs_list[ab->b_type], ab); + + /* + * If prefetching out of the ghost cache, + * we will have a non-null datacnt. + */ + if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + /* ghost elements have a ghost size */ + ASSERT(ab->b_buf == NULL); + from_delta = ab->b_size; + } + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); + + if (use_mutex) + mutex_exit(&old_state->arcs_mtx); + } + if (new_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + + if (use_mutex) + mutex_enter(&new_state->arcs_mtx); + + list_insert_head(&new_state->arcs_list[ab->b_type], ab); + + /* ghost elements have a ghost size */ + if (GHOST_STATE(new_state)) { + ASSERT(ab->b_datacnt == 0); + ASSERT(ab->b_buf == NULL); + to_delta = ab->b_size; + } + atomic_add_64(size, to_delta); + + if (use_mutex) + mutex_exit(&new_state->arcs_mtx); + } + } + + ASSERT(!BUF_EMPTY(ab)); + if (new_state == arc_anon) { + buf_hash_remove(ab); + } + + /* adjust state sizes */ + if (to_delta) + atomic_add_64(&new_state->arcs_size, to_delta); + if (from_delta) { + ASSERT3U(old_state->arcs_size, >=, from_delta); + atomic_add_64(&old_state->arcs_size, -from_delta); + } + ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space) +{ + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space) +{ + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); +} + +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = spa; + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + hdr->b_datacnt = 1; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + (void) refcount_add(&hdr->b_refcnt, tag); + + return (buf); +} + +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = hdr->b_size; + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = hdr->b_buf; + hdr->b_buf = buf; + arc_get_data_buf(buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_datacnt += 1; + return (buf); +} + +void +arc_buf_add_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + + /* + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. + */ + rw_enter(&buf->b_lock, RW_READER); + if (buf->b_data == NULL) { + rw_exit(&buf->b_lock); + return; + } + hdr = buf->b_hdr; + ASSERT(hdr != NULL); + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + rw_exit(&buf->b_lock); + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); +} + +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. + */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + +static void +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +{ + arc_buf_t **bufp; + + /* free up data associated with the buf */ + if (buf->b_data) { + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_cksum_verify(buf); + if (!recycle) { + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + atomic_add_64(&arc_size, -size); + } + } + if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(state != arc_anon); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); + } + ASSERT3U(state->arcs_size, >=, size); + atomic_add_64(&state->arcs_size, -size); + buf->b_data = NULL; + ASSERT(buf->b_hdr->b_datacnt > 0); + buf->b_hdr->b_datacnt -= 1; + } + + /* only remove the buf if requested */ + if (!all) + return; + + /* remove the buf from the hdr list */ + for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + continue; + *bufp = buf->b_next; + + ASSERT(buf->b_efunc == NULL); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!(hdr->b_flags & ARC_STORED)); + + if (hdr->b_l2hdr != NULL) { + if (!MUTEX_HELD(&l2arc_buflist_mtx)) { + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + */ + mutex_enter(&l2arc_buflist_mtx); + if (hdr->b_l2hdr != NULL) { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, + hdr); + } + mutex_exit(&l2arc_buflist_mtx); + } else { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + } + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!BUF_EMPTY(hdr)) { + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + } + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; + + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_buf, FALSE, FALSE); + hdr->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + rw_exit(&buf->b_lock); + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_buf, FALSE, TRUE); + } + } + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT3P(hdr->b_acb, ==, NULL); + kmem_cache_free(hdr_cache, hdr); +} + +void +arc_buf_free(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + int hashed = hdr->b_state != arc_anon; + + ASSERT(buf->b_efunc == NULL); + ASSERT(buf->b_data != NULL); + + if (hashed) { + kmutex_t *hash_lock = HDR_LOCK(hdr); + + mutex_enter(hash_lock); + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) + arc_buf_destroy(buf, FALSE, TRUE); + else + hdr->b_flags |= ARC_BUF_AVAILABLE; + mutex_exit(hash_lock); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + int destroy_hdr; + /* + * We are in the middle of an async write. Don't destroy + * this buffer unless the write completes before we finish + * decrementing the reference count. + */ + mutex_enter(&arc_eviction_mtx); + (void) remove_reference(hdr, NULL, tag); + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + if (remove_reference(hdr, NULL, tag) > 0) { + ASSERT(HDR_IO_ERROR(hdr)); + arc_buf_destroy(buf, FALSE, TRUE); + } else { + arc_hdr_destroy(hdr); + } + } +} + +int +arc_buf_remove_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + int no_callback = (buf->b_efunc == NULL); + + if (hdr->b_state == arc_anon) { + arc_buf_free(buf, tag); + return (no_callback); + } + + mutex_enter(hash_lock); + ASSERT(hdr->b_state != arc_anon); + ASSERT(buf->b_data != NULL); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + if (no_callback) + arc_buf_destroy(buf, FALSE, TRUE); + } else if (no_callback) { + ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + ASSERT(no_callback || hdr->b_datacnt > 1 || + refcount_is_zero(&hdr->b_refcnt)); + mutex_exit(hash_lock); + return (no_callback); +} + +int +arc_buf_size(arc_buf_t *buf) +{ + return (buf->b_hdr->b_size); +} + +/* + * Evict buffers from list until we've removed the specified number of + * bytes. Move the removed buffers to the appropriate evict state. + * If the recycle flag is set, then attempt to "recycle" a buffer: + * - look for a buffer to evict that is `bytes' long. + * - return the data block from this buffer rather than freeing it. + * This flag is used by callers that are trying to make space for a + * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. + */ +static void * +arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, + arc_buf_contents_t type) +{ + arc_state_t *evicted_state; + uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; + kmutex_t *hash_lock; + boolean_t have_lock; + void *stolen = NULL; + + ASSERT(state == arc_mru || state == arc_mfu); + + evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || + (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && + lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { + skipped++; + continue; + } + /* "lookahead" for better eviction candidate */ + if (recycle && ab->b_size != bytes && + ab_prev && ab_prev->b_size == bytes) + continue; + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT(ab->b_datacnt > 0); + while (ab->b_buf) { + arc_buf_t *buf = ab->b_buf; + if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + missed += 1; + break; + } + if (buf->b_data) { + bytes_evicted += ab->b_size; + if (recycle && ab->b_type == type && + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { + stolen = buf->b_data; + recycle = FALSE; + } + } + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + arc_buf_destroy(buf, + buf->b_data == stolen, FALSE); + ab->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); + } else { + rw_exit(&buf->b_lock); + arc_buf_destroy(buf, + buf->b_data == stolen, TRUE); + } + } + if (ab->b_datacnt == 0) { + arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } + if (!have_lock) + mutex_exit(hash_lock); + if (bytes >= 0 && bytes_evicted >= bytes) + break; + } else { + missed += 1; + } + } + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&state->arcs_mtx); + + if (bytes_evicted < bytes) + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + + if (skipped) + ARCSTAT_INCR(arcstat_evict_skip, skipped); + + if (missed) + ARCSTAT_INCR(arcstat_mutex_miss, missed); + + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + + return (stolen); +} + +/* + * Remove buffers from list until we've removed the specified number of + * bytes. Destroy the buffers that are removed. + */ +static void +arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) +{ + arc_buf_hdr_t *ab, *ab_prev; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; + kmutex_t *hash_lock; + uint64_t bytes_deleted = 0; + uint64_t bufs_skipped = 0; + + ASSERT(GHOST_STATE(state)); +top: + mutex_enter(&state->arcs_mtx); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; + hash_lock = HDR_LOCK(ab); + if (mutex_tryenter(hash_lock)) { + ASSERT(!HDR_IO_IN_PROGRESS(ab)); + ASSERT(ab->b_buf == NULL); + ARCSTAT_BUMP(arcstat_deleted); + bytes_deleted += ab->b_size; + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + if (bytes >= 0 && bytes_deleted >= bytes) + break; + } else { + if (bytes < 0) { + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + bufs_skipped += 1; + } + } + mutex_exit(&state->arcs_mtx); + + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + + if (bufs_skipped) { + ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); + ASSERT(bytes >= 0); + } + + if (bytes_deleted < bytes) + dprintf("only deleted %lld bytes from %p", + (longlong_t)bytes_deleted, state); +} + +static void +arc_adjust(void) +{ + int64_t top_sz, mru_over, arc_over, todelete; + + top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, + ARC_BUFC_METADATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } + + mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0) { + if (arc_mru_ghost->arcs_size > 0) { + todelete = MIN(arc_mru_ghost->arcs_size, mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } + } + + if ((arc_over = arc_size - arc_c) > 0) { + int64_t tbl_over; + + if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_DATA); + arc_over = arc_size - arc_c; + } + + if (arc_over > 0 && + arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], + arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_METADATA); + } + + tbl_over = arc_size + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c * 2; + + if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { + todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } +} + +static void +arc_do_user_evicts(void) +{ + mutex_enter(&arc_eviction_mtx); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; + rw_enter(&buf->b_lock, RW_WRITER); + buf->b_hdr = NULL; + rw_exit(&buf->b_lock); + mutex_exit(&arc_eviction_mtx); + + if (buf->b_efunc != NULL) + VERIFY(buf->b_efunc(buf) == 0); + + buf->b_efunc = NULL; + buf->b_private = NULL; + kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_eviction_mtx); + } + mutex_exit(&arc_eviction_mtx); +} + +/* + * Flush all *evictable* data from the cache for the given spa. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +arc_flush(spa_t *spa) +{ + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + + arc_evict_ghost(arc_mru_ghost, spa, -1); + arc_evict_ghost(arc_mfu_ghost, spa, -1); + + mutex_enter(&arc_reclaim_thr_lock); + arc_do_user_evicts(); + mutex_exit(&arc_reclaim_thr_lock); + ASSERT(spa || arc_eviction_list == NULL); +} + +int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ + +void +arc_shrink(void) +{ + if (arc_c > arc_c_min) { + uint64_t to_free; + +#ifdef _KERNEL + to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); +#else + to_free = arc_c >> arc_shrink_shift; +#endif + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (arc_c > arc_size) + arc_c = MAX(arc_size, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + if (arc_size > arc_c) + arc_adjust(); +} + +static int +arc_reclaim_needed(void) +{ + uint64_t extra; + +#ifdef _KERNEL + + if (needfree) + return (1); + + /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. + */ + if (availrmem < swapfs_minfree + swapfs_reserve + extra) + return (1); + +#if defined(__i386) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the calculation, if less than 1/4th is + * free) + */ + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) + return (1); +#endif + +#else + if (spa_get_random(100) == 0) + return (1); +#endif + return (0); +} + +static void +arc_kmem_reap_now(arc_reclaim_strategy_t strat) +{ + size_t i; + kmem_cache_t *prev_cache = NULL; + kmem_cache_t *prev_data_cache = NULL; + extern kmem_cache_t *zio_buf_cache[]; + extern kmem_cache_t *zio_data_buf_cache[]; + +#ifdef _KERNEL + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } +#if defined(__i386) + /* + * Reclaim unused memory from all kmem caches. + */ + kmem_reap(); +#endif +#endif + + /* + * An aggressive reclamation will shrink the cache size as well as + * reap free buffers from the arc kmem caches. + */ + if (strat == ARC_RECLAIM_AGGR) + arc_shrink(); + + for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { + if (zio_buf_cache[i] != prev_cache) { + prev_cache = zio_buf_cache[i]; + kmem_cache_reap_now(zio_buf_cache[i]); + } + if (zio_data_buf_cache[i] != prev_data_cache) { + prev_data_cache = zio_data_buf_cache[i]; + kmem_cache_reap_now(zio_data_buf_cache[i]); + } + } + kmem_cache_reap_now(buf_cache); + kmem_cache_reap_now(hdr_cache); +} + +static void +arc_reclaim_thread(void) +{ + clock_t growtime = 0; + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_thr_lock); + while (arc_thread_exit == 0) { + if (arc_reclaim_needed()) { + + if (arc_no_grow) { + if (last_reclaim == ARC_RECLAIM_CONS) { + last_reclaim = ARC_RECLAIM_AGGR; + } else { + last_reclaim = ARC_RECLAIM_CONS; + } + } else { + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + membar_producer(); + } + + /* reset the growth delay for every reclaim */ + growtime = lbolt + (arc_grow_retry * hz); + + arc_kmem_reap_now(last_reclaim); + arc_warm = B_TRUE; + + } else if (arc_no_grow && lbolt >= growtime) { + arc_no_grow = FALSE; + } + + if (2 * arc_c < arc_size + + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) + arc_adjust(); + + if (arc_eviction_list != NULL) + arc_do_user_evicts(); + + /* block until needed, or one second, whichever is shorter */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thr_cv, + &arc_reclaim_thr_lock, (lbolt + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + } + + arc_thread_exit = 0; + cv_broadcast(&arc_reclaim_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + thread_exit(); +} + +/* + * Adapt arc info given the number of bytes we are trying to add and + * the state that we are comming from. This function is only called + * when we are adding new content to the cache. + */ +static void +arc_adapt(int bytes, arc_state_t *state) +{ + int mult; + + if (state == arc_l2c_only) + return; + + ASSERT(bytes > 0); + /* + * Adapt the target size of the MRU list: + * - if we just hit in the MRU ghost list, then increase + * the target size of the MRU list. + * - if we just hit in the MFU ghost list, then increase + * the target size of the MFU list by decreasing the + * target size of the MRU list. + */ + if (state == arc_mru_ghost) { + mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? + 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + + arc_p = MIN(arc_c, arc_p + bytes * mult); + } else if (state == arc_mfu_ghost) { + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? + 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + + arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + } + ASSERT((int64_t)arc_p >= 0); + + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thr_cv); + return; + } + + if (arc_no_grow) + return; + + if (arc_c >= arc_c_max) + return; + + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + atomic_add_64(&arc_c, (int64_t)bytes); + if (arc_c > arc_c_max) + arc_c = arc_c_max; + else if (state == arc_anon) + atomic_add_64(&arc_p, (int64_t)bytes); + if (arc_p > arc_c) + arc_p = arc_c; + } + ASSERT((int64_t)arc_p >= 0); +} + +/* + * Check if the cache has reached its limits and eviction is required + * prior to insert. + */ +static int +arc_evict_needed(arc_buf_contents_t type) +{ + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif + + if (arc_reclaim_needed()) + return (1); + + return (arc_size > arc_c); +} + +/* + * The buffer, supplied as the first argument, needs a data block. + * So, if we are at cache max, determine which cache should be victimized. + * We have the following cases: + * + * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> + * In this situation if we're out of space, but the resident size of the MFU is + * under the limit, victimize the MFU cache to satisfy this insertion request. + * + * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> + * Here, we've used up all of the available space for the MRU, so we need to + * evict from our own cache instead. Evict from the set of resident MRU + * entries. + * + * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> + * c minus p represents the MFU space in the cache, since p is the size of the + * cache that is dedicated to the MRU. In this situation there's still space on + * the MFU side, so the MRU side needs to be victimized. + * + * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> + * MFU's resident set is consuming more space than it has been allotted. In + * this situation, we must victimize our own cache, the MFU, for this insertion. + */ +static void +arc_get_data_buf(arc_buf_t *buf) +{ + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_adapt(size, state); + + /* + * We have not yet reached cache maximum size, + * just allocate a new buffer. + */ + if (!arc_evict_needed(type)) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); + } + goto out; + } + + /* + * If we are prefetching from the mfu ghost list, this buffer + * will end up on the mru list; so steal space from there. + */ + if (state == arc_mfu_ghost) + state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + else if (state == arc_mru_ghost) + state = arc_mru; + + if (state == arc_mru || state == arc_anon) { + uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; + state = (arc_mfu->arcs_lsize[type] > 0 && + arc_p > mru_used) ? arc_mfu : arc_mru; + } else { + /* MFU cases */ + uint64_t mfu_space = arc_c - arc_p; + state = (arc_mru->arcs_lsize[type] > 0 && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + } + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); + } + ARCSTAT_BUMP(arcstat_recycle_miss); + } + ASSERT(buf->b_data != NULL); +out: + /* + * Update the state size. Note that ghost states have a + * "ghost size" and so don't need to be updated. + */ + if (!GHOST_STATE(buf->b_hdr->b_state)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + + atomic_add_64(&hdr->b_state->arcs_size, size); + if (list_link_active(&hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + } + /* + * If we are growing the cache, and we are adding anonymous + * data, and we have outgrown arc_p, update arc_p + */ + if (arc_size < arc_c && hdr->b_state == arc_anon && + arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + arc_p = MIN(arc_c, arc_p + size); + } +} + +/* + * This routine is called whenever a buffer is accessed. + * NOTE: the hash lock is dropped in this function. + */ +static void +arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if (buf->b_state == arc_anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + ASSERT(buf->b_arc_access == 0); + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + arc_change_state(arc_mru, buf, hash_lock); + + } else if (buf->b_state == arc_mru) { + /* + * If this buffer is here because of a prefetch, then either: + * - clear the flag if this is a "referencing" read + * (any subsequent access will bump this into the MFU state). + * or + * - move the buffer to the head of the list if this is + * another prefetch (to make it less likely to be evicted). + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + if (refcount_count(&buf->b_refcnt) == 0) { + ASSERT(list_link_active(&buf->b_arc_node)); + } else { + buf->b_flags &= ~ARC_PREFETCH; + ARCSTAT_BUMP(arcstat_mru_hits); + } + buf->b_arc_access = lbolt; + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (lbolt > buf->b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } + ARCSTAT_BUMP(arcstat_mru_hits); + } else if (buf->b_state == arc_mru_ghost) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + new_state = arc_mru; + if (refcount_count(&buf->b_refcnt) > 0) + buf->b_flags &= ~ARC_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + } else { + new_state = arc_mfu; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + } + + buf->b_arc_access = lbolt; + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + } else if (buf->b_state == arc_mfu) { + /* + * This buffer has been accessed more than once and is + * still in the cache. Keep it in the MFU state. + * + * NOTE: an add_reference() that occurred when we did + * the arc_read() will have kicked this off the list. + * If it was a prefetch, we will explicitly move it to + * the head of the list now. + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + ASSERT(refcount_count(&buf->b_refcnt) == 0); + ASSERT(list_link_active(&buf->b_arc_node)); + } + ARCSTAT_BUMP(arcstat_mfu_hits); + buf->b_arc_access = lbolt; + } else if (buf->b_state == arc_mfu_ghost) { + arc_state_t *new_state = arc_mfu; + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + /* + * This is a prefetch access... + * move this block back to the MRU state. + */ + ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + new_state = arc_mru; + } + + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. + */ + + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } else { + ASSERT(!"invalid arc state"); + } +} + +/* a generic arc_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + bcopy(buf->b_data, arg, buf->b_hdr->b_size); + VERIFY(arc_buf_remove_ref(buf, arg) == 1); +} + +/* a generic arc_done_func_t */ +void +arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (zio && zio->io_error) { + VERIFY(arc_buf_remove_ref(buf, arg) == 1); + *bufp = NULL; + } else { + *bufp = buf; + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr, *found; + arc_buf_t *buf; + arc_buf_t *abuf; /* buffer we're assigning to callback */ + kmutex_t *hash_lock; + arc_callback_t *callback_list, *acb; + int freeable = FALSE; + + buf = zio->io_private; + hdr = buf->b_hdr; + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. + */ + found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + + hdr->b_flags &= ~ARC_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) + hdr->b_flags &= ~ARC_L2CACHE; + + /* byteswap if necessary */ + callback_list = hdr->b_acb; + ASSERT(callback_list != NULL); + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? + byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + func(buf->b_data, hdr->b_size); + } + + arc_cksum_compute(buf, B_FALSE); + + /* create copies of the data buffer for the callers */ + abuf = buf; + for (acb = callback_list; acb; acb = acb->acb_next) { + if (acb->acb_done) { + if (abuf == NULL) + abuf = arc_buf_clone(buf); + acb->acb_buf = abuf; + abuf = NULL; + } + } + hdr->b_acb = NULL; + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + ASSERT(!HDR_BUF_AVAILABLE(hdr)); + if (abuf == buf) + hdr->b_flags |= ARC_BUF_AVAILABLE; + + ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + + if (zio->io_error != 0) { + hdr->b_flags |= ARC_IO_ERROR; + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). + */ + cv_broadcast(&hdr->b_cv); + + if (hash_lock) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + if (zio->io_error == 0 && hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done) + acb->acb_done(zio, acb->acb_buf, acb->acb_private); + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_destroy(hdr); +} + +/* + * "Read" the block block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. + * + * Normal callers should use arc_read and pass the arc buffer and offset + * for the bp. But if you know you don't need locking, you can use + * arc_read_bp. + */ +int +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + int err; + arc_buf_hdr_t *hdr = pbuf->b_hdr; + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + + ASSERT3P(hdr, ==, pbuf->b_hdr); + rw_exit(&pbuf->b_lock); + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + zio_t *rzio; + +top: + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (hdr && hdr->b_datacnt > 0) { + + *arc_flags |= ARC_CACHED; + + if (HDR_IO_IN_PROGRESS(hdr)) { + + if (*arc_flags & ARC_WAIT) { + cv_wait(&hdr->b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, zio_flags); + + ASSERT(acb->acb_done != NULL); + acb->acb_next = hdr->b_acb; + hdr->b_acb = acb; + add_reference(hdr, hash_lock, private); + mutex_exit(hash_lock); + return (0); + } + mutex_exit(hash_lock); + return (0); + } + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + if (done) { + add_reference(hdr, hash_lock, private); + /* + * If this block is already in use, create a new + * copy of the data so that we will be guaranteed + * that arc_release() will always succeed. + */ + buf = hdr->b_buf; + ASSERT(buf); + ASSERT(buf->b_data); + if (HDR_BUF_AVAILABLE(hdr)) { + ASSERT(buf->b_efunc == NULL); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + } else { + buf = arc_buf_clone(buf); + } + } else if (*arc_flags & ARC_PREFETCH && + refcount_count(&hdr->b_refcnt) == 0) { + hdr->b_flags |= ARC_PREFETCH; + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); + + if (done) + done(NULL, buf, private); + } else { + uint64_t size = BP_GET_LSIZE(bp); + arc_callback_t *acb; + vdev_t *vd = NULL; + daddr_t addr; + + if (hdr == NULL) { + /* this block is not in the cache */ + arc_buf_hdr_t *exists; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + buf = arc_buf_alloc(spa, size, private, type); + hdr = buf->b_hdr; + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = bp->blk_birth; + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + (void) arc_buf_remove_ref(buf, private); + goto top; /* restart the IO request */ + } + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) { + (void) remove_reference(hdr, hash_lock, + private); + hdr->b_flags |= ARC_PREFETCH; + } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + if (BP_GET_LEVEL(bp) > 0) + hdr->b_flags |= ARC_INDIRECT; + } else { + /* this block is in the ghost cache */ + ASSERT(GHOST_STATE(hdr->b_state)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT(hdr->b_buf == NULL); + + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) + hdr->b_flags |= ARC_PREFETCH; + else + add_reference(hdr, hash_lock, private); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + ASSERT(hdr->b_datacnt == 0); + hdr->b_datacnt = 1; + + } + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + + ASSERT(hdr->b_acb == NULL); + hdr->b_acb = acb; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + + /* + * If the buffer has been evicted, migrate it to a present state + * before issuing the I/O. Once we drop the hash-table lock, + * the header will be marked as I/O in progress and have an + * attached buffer. At this point, anybody who finds this + * buffer ought to notice that it's legit but has a pending I/O. + */ + + if (GHOST_STATE(hdr->b_state)) + arc_access(hdr, hash_lock); + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + + mutex_exit(hash_lock); + + ASSERT3U(hdr->b_size, ==, size); + DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, + zbookmark_t *, zb); + ARCSTAT_BUMP(arcstat_misses); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, misses); + + if (vd != NULL) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. + */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } + + rzio = zio_read(pio, spa, bp, buf->b_data, size, + arc_read_done, buf, priority, zio_flags, zb); + + if (*arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +/* + * arc_read() variant to support pool traversal. If the block is already + * in the ARC, make a copy of it; otherwise, the caller will do the I/O. + * The idea is that we don't want pool traversal filling up memory, but + * if the ARC already has the data anyway, we shouldn't pay for the I/O. + */ +int +arc_tryread(spa_t *spa, blkptr_t *bp, void *data) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_mtx; + int rc = 0; + + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); + + if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *buf = hdr->b_buf; + + ASSERT(buf); + while (buf->b_data == NULL) { + buf = buf->b_next; + ASSERT(buf); + } + bcopy(buf->b_data, data, hdr->b_size); + } else { + rc = ENOENT; + } + + if (hash_mtx) + mutex_exit(hash_mtx); + + return (rc); +} + +void +arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) +{ + ASSERT(buf->b_hdr != NULL); + ASSERT(buf->b_hdr->b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + buf->b_efunc = func; + buf->b_private = private; +} + +/* + * This is used by the DMU to let the ARC know that a buffer is + * being evicted, so the ARC should clean up. If this arc buf + * is not yet in the evicted state, it will be put there. + */ +int +arc_buf_evict(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + arc_buf_t **bufp; + + rw_enter(&buf->b_lock, RW_WRITER); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(). + */ + ASSERT(buf->b_data == NULL); + rw_exit(&buf->b_lock); + return (0); + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ + /* + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. + */ + buf->b_efunc = NULL; + rw_exit(&buf->b_lock); + VERIFY(copy.b_efunc(©) == 0); + return (1); + } + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + ASSERT(buf->b_hdr == hdr); + ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + /* + * Pull this buffer off of the hdr + */ + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = buf->b_next; + + ASSERT(buf->b_data != NULL); + arc_buf_destroy(buf, FALSE, FALSE); + + if (hdr->b_datacnt == 0) { + arc_state_t *old_state = hdr->b_state; + arc_state_t *evicted_state; + + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + + evicted_state = + (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&old_state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&old_state->arcs_mtx); + } + mutex_exit(hash_lock); + rw_exit(&buf->b_lock); + + VERIFY(buf->b_efunc(buf) == 0); + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); + return (1); +} + +/* + * Release this buffer from the cache. This must be done + * after a read and prior to modifying the buffer contents. + * If the buffer has more than one reference, we must make + * a new hdr for the buffer. + */ +void +arc_release(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_size; + + rw_enter(&buf->b_lock, RW_WRITER); + hdr = buf->b_hdr; + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(!(hdr->b_flags & ARC_STORED)); + + if (hdr->b_state == arc_anon) { + /* this buffer is already released */ + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); + ASSERT(BUF_EMPTY(hdr)); + ASSERT(buf->b_efunc == NULL); + arc_buf_thaw(buf); + rw_exit(&buf->b_lock); + return; + } + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + l2hdr = hdr->b_l2hdr; + if (l2hdr) { + mutex_enter(&l2arc_buflist_mtx); + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } + + /* + * Do we have more than one buf? + */ + if (hdr->b_datacnt > 1) { + arc_buf_hdr_t *nhdr; + arc_buf_t **bufp; + uint64_t blksz = hdr->b_size; + spa_t *spa = hdr->b_spa; + arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; + + ASSERT(hdr->b_buf != buf || buf->b_next != NULL); + /* + * Pull the data off of this buf and attach it to + * a new anonymous buf. + */ + (void) remove_reference(hdr, hash_lock, tag); + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = (*bufp)->b_next; + buf->b_next = NULL; + + ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_refcnt)) { + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); + } + hdr->b_datacnt -= 1; + arc_cksum_verify(buf); + + mutex_exit(hash_lock); + + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr->b_size = blksz; + nhdr->b_spa = spa; + nhdr->b_type = type; + nhdr->b_buf = buf; + nhdr->b_state = arc_anon; + nhdr->b_arc_access = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; + nhdr->b_datacnt = 1; + nhdr->b_freeze_cksum = NULL; + (void) refcount_add(&nhdr->b_refcnt, tag); + buf->b_hdr = nhdr; + rw_exit(&buf->b_lock); + atomic_add_64(&arc_anon->arcs_size, blksz); + } else { + rw_exit(&buf->b_lock); + ASSERT(refcount_count(&hdr->b_refcnt) == 1); + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_arc_access = 0; + mutex_exit(hash_lock); + + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + arc_buf_thaw(buf); + } + buf->b_efunc = NULL; + buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } +} + +int +arc_released(arc_buf_t *buf) +{ + int released; + + rw_enter(&buf->b_lock, RW_READER); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + rw_exit(&buf->b_lock); + return (released); +} + +int +arc_has_callback(arc_buf_t *buf) +{ + int callback; + + rw_enter(&buf->b_lock, RW_READER); + callback = (buf->b_efunc != NULL); + rw_exit(&buf->b_lock); + return (callback); +} + +#ifdef ZFS_DEBUG +int +arc_referenced(arc_buf_t *buf) +{ + int referenced; + + rw_enter(&buf->b_lock, RW_READER); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + rw_exit(&buf->b_lock); + return (referenced); +} +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); + } + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + hdr->b_acb = NULL; + + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = zio->io_bp->blk_birth; + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + /* + * If the block to be written was all-zero, we may have + * compressed it away. In this case no write was performed + * so there will be no dva/birth-date/checksum. The buffer + * must therefor remain anonymous (and uncached). + */ + if (!BUF_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). + */ + ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); + ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), + BP_IDENTITY(zio->io_bp))); + ASSERT3U(zio->io_bp_orig.blk_birth, ==, + zio->io_bp->blk_birth); + + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + /* if it's not anon, we are doing a scrub */ + if (hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else if (callback->awcb_done == NULL) { + int destroy_hdr; + /* + * This is an anonymous buffer with no user callback, + * destroy it if there are no active references. + */ + mutex_enter(&arc_eviction_mtx); + destroy_hdr = refcount_is_zero(&hdr->b_refcnt); + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + } + hdr->b_flags &= ~ARC_STORED; + + if (callback->awcb_done) { + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + } + + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +void +write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) +{ + boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); + + /* Determine checksum setting */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && + !zio_checksum_table[wp->wp_oschecksum].ci_zbt) + zp->zp_checksum = wp->wp_oschecksum; + else + zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, + wp->wp_oschecksum); + } + + /* Determine compression setting */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + zp->zp_compress = zio_compress_select(wp->wp_dncompress, + wp->wp_oscompress); + } + + zp->zp_type = wp->wp_type; + zp->zp_level = wp->wp_level; + zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int zio_flags, const zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + zio_prop_t zp; + + ASSERT(ready != NULL); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT(hdr->b_acb == 0); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + + write_policy(spa, wp, &zp); + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); + + return (zio); +} + +int +arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + arc_buf_hdr_t *ab; + kmutex_t *hash_lock; + zio_t *zio; + + /* + * If this buffer is in the cache, release it, so it + * can be re-used. + */ + ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (ab != NULL) { + /* + * The checksum of blocks to free is not always + * preserved (eg. on the deadlist). However, if it is + * nonzero, it should match what we have in the cache. + */ + ASSERT(bp->blk_cksum.zc_word[0] == 0 || + bp->blk_cksum.zc_word[0] == ab->b_cksum0 || + bp->blk_fill == BLK_FILL_ALREADY_FREED); + + if (ab->b_state != arc_anon) + arc_change_state(arc_anon, ab, hash_lock); + if (HDR_IO_IN_PROGRESS(ab)) { + /* + * This should only happen when we prefetch. + */ + ASSERT(ab->b_flags & ARC_PREFETCH); + ASSERT3U(ab->b_datacnt, ==, 1); + ab->b_flags |= ARC_FREED_IN_READ; + if (HDR_IN_HASH_TABLE(ab)) + buf_hash_remove(ab); + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + ab->b_buf->b_efunc = NULL; + ab->b_buf->b_private = NULL; + mutex_exit(hash_lock); + } else if (refcount_is_zero(&ab->b_refcnt)) { + ab->b_flags |= ARC_FREE_IN_PROGRESS; + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + ARCSTAT_BUMP(arcstat_deleted); + } else { + /* + * We still have an active reference on this + * buffer. This can happen, e.g., from + * dbuf_unoverride(). + */ + ASSERT(!HDR_IN_HASH_TABLE(ab)); + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + ab->b_buf->b_efunc = NULL; + ab->b_buf->b_private = NULL; + mutex_exit(hash_lock); + } + } + + zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); + + if (arc_flags & ARC_WAIT) + return (zio_wait(zio)); + + ASSERT(arc_flags & ARC_NOWAIT); + zio_nowait(zio); + + return (0); +} + +static int +arc_memory_throttle(uint64_t reserve, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t inflight_data = arc_anon->arcs_size; + uint64_t available_memory = ptob(freemem); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + if (available_memory >= zfs_write_limit_max) + return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. + */ + if (curproc == proc_pageout) { + if (page_load > MAX(ptob(minfree), available_memory) / 4) + return (ERESTART); + /* Note: reserve is inflated, so we deflate */ + page_load += reserve / 8; + return (0); + } else if (page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (EAGAIN); + } + page_load = 0; + + if (arc_size > arc_c_min) { + uint64_t evictable_memory = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + available_memory += MIN(evictable_memory, arc_size - arc_c_min); + } + + if (inflight_data > available_memory / 4) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (ERESTART); + } +#endif + return (0); +} + +void +arc_tempreserve_clear(uint64_t reserve) +{ + atomic_add_64(&arc_tempreserve, -reserve); + ASSERT((int64_t)arc_tempreserve >= 0); +} + +int +arc_tempreserve_space(uint64_t reserve, uint64_t txg) +{ + int error; + +#ifdef ZFS_DEBUG + /* + * Once in a while, fail for no reason. Everything should cope. + */ + if (spa_get_random(10000) == 0) { + dprintf("forcing random failure\n"); + return (ERESTART); + } +#endif + if (reserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, reserve * 4); + if (reserve > arc_c) + return (ENOMEM); + + /* + * Writes will, almost always, require additional memory allocations + * in order to compress/encrypt/etc the data. We therefor need to + * make sure that there is sufficient available memory for this. + */ + if (error = arc_memory_throttle(reserve, txg)) + return (error); + + /* + * Throttle writes when the amount of dirty data in the cache + * gets too large. We try to keep the cache less than half full + * of dirty blocks so that our sync times don't grow too large. + * Note: if two requests come in concurrently, we might let them + * both succeed, when one of them should fail. Not a huge deal. + */ + if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && + arc_anon->arcs_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, + arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, + arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, + reserve>>10, arc_c>>10); + return (ERESTART); + } + atomic_add_64(&arc_tempreserve, reserve); + return (0); +} + +void +arc_init(void) +{ + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + + /* Convert seconds to clock ticks */ + arc_min_prefetch_lifespan = 1 * hz; + + /* Start out with 1/8 of all memory */ + arc_c = physmem * PAGESIZE / 8; + +#ifdef _KERNEL + /* + * On architectures where the physical memory can be larger + * than the addressable space (intel in 32-bit mode), we may + * need to limit the cache to 1/8 of VM size. + */ + arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); +#endif + + /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ + arc_c_min = MAX(arc_c / 4, 64<<20); + /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ + if (arc_c * 8 >= 1<<30) + arc_c_max = (arc_c * 8) - (1<<30); + else + arc_c_max = arc_c_min; + arc_c_max = MAX(arc_c * 6, arc_c_max); + + /* + * Allow the tunables to override our calculations if they are + * reasonable (ie. over 64MB) + */ + if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) + arc_c_max = zfs_arc_max; + if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) + arc_c_min = zfs_arc_min; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + arc_size = 0; + + mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + + buf_init(); + + arc_thread_exit = 0; + arc_eviction_list = NULL; + mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); + bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, + sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (arc_ksp != NULL) { + arc_ksp->ks_data = &arc_stats; + kstat_install(arc_ksp); + } + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +arc_fini(void) +{ + mutex_enter(&arc_reclaim_thr_lock); + arc_thread_exit = 1; + while (arc_thread_exit != 0) + cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); + mutex_exit(&arc_reclaim_thr_lock); + + arc_flush(NULL); + + arc_dead = TRUE; + + if (arc_ksp != NULL) { + kstat_delete(arc_ksp); + arc_ksp = NULL; + } + + mutex_destroy(&arc_eviction_mtx); + mutex_destroy(&arc_reclaim_thr_lock); + cv_destroy(&arc_reclaim_thr_cv); + + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + + mutex_destroy(&arc_anon->arcs_mtx); + mutex_destroy(&arc_mru->arcs_mtx); + mutex_destroy(&arc_mru_ghost->arcs_mtx); + mutex_destroy(&arc_mfu->arcs_mtx); + mutex_destroy(&arc_mfu_ghost->arcs_mtx); + + mutex_destroy(&zfs_write_limit_lock); + + buf_fini(); +} + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. + * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. + */ + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. + */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. + */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + int equal; + + ASSERT(zio->io_vd != NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + hdr = buf->b_hdr; + ASSERT(hdr != NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = EIO; + } + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) + zio_nowait(zio_read(zio->io_parent, + cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. + * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. + */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + + /* + * Tell ARC this no longer exists in L2ARC. + */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. + */ +static void +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz, headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (int try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + headroom = target_sz * l2arc_headroom; + if (arc_warm == B_FALSE) + ab = list_head(list); + else + ab = list_tail(list); + + for (; ab; ab = ab_prev) { + if (arc_warm == B_FALSE) + ab_prev = list_next(list, ab); + else + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (ab->b_spa != spa) { + mutex_exit(hash_lock); + continue; + } + + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + continue; + } + + if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (ab->b_buf == NULL) { + DTRACE_PROBE1(l2arc__buf__null, void *, ab); + mutex_exit(hash_lock); + continue; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. + */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. + */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + /* + * Keep the clock hand suitably device-aligned. + */ + buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return; + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + spa_l2cache_space_update(dev->l2ad_vdev, 0, + dev->l2ad_end - dev->l2ad_hand); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + (void) zio_wait(pio); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size; + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + /* + * Pause for l2arc_feed_secs seconds between writes. + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + lbolt + (hz * l2arc_feed_secs)); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = dev->l2ad_write; + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + l2arc_write_buffers(spa, dev, size); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + /* + * Create a new l2arc device entry. + */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_boost = l2arc_write_boost; + adddev->l2ad_start = start; + adddev->l2ad_end = end; + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. + */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c new file mode 100644 index 000000000..93b7741d7 --- /dev/null +++ b/module/zfs/bplist.c @@ -0,0 +1,349 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/bplist.h> +#include <sys/zfs_context.h> + +static int +bplist_hold(bplist_t *bpl) +{ + ASSERT(MUTEX_HELD(&bpl->bpl_lock)); + if (bpl->bpl_dbuf == NULL) { + int err = dmu_bonus_hold(bpl->bpl_mos, + bpl->bpl_object, bpl, &bpl->bpl_dbuf); + if (err) + return (err); + bpl->bpl_phys = bpl->bpl_dbuf->db_data; + } + return (0); +} + +uint64_t +bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) +{ + int size; + + size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? + BPLIST_SIZE_V0 : sizeof (bplist_phys_t); + + return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, + DMU_OT_BPLIST_HDR, size, tx)); +} + +void +bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) +{ + VERIFY(dmu_object_free(mos, object, tx) == 0); +} + +int +bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(mos, object, &doi); + if (err) + return (err); + + mutex_enter(&bpl->bpl_lock); + + ASSERT(bpl->bpl_dbuf == NULL); + ASSERT(bpl->bpl_phys == NULL); + ASSERT(bpl->bpl_cached_dbuf == NULL); + ASSERT(bpl->bpl_queue == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR); + + bpl->bpl_mos = mos; + bpl->bpl_object = object; + bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); + bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; + bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t)); + + mutex_exit(&bpl->bpl_lock); + return (0); +} + +void +bplist_close(bplist_t *bpl) +{ + mutex_enter(&bpl->bpl_lock); + + ASSERT(bpl->bpl_queue == NULL); + + if (bpl->bpl_cached_dbuf) { + dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); + bpl->bpl_cached_dbuf = NULL; + } + if (bpl->bpl_dbuf) { + dmu_buf_rele(bpl->bpl_dbuf, bpl); + bpl->bpl_dbuf = NULL; + bpl->bpl_phys = NULL; + } + + mutex_exit(&bpl->bpl_lock); +} + +boolean_t +bplist_empty(bplist_t *bpl) +{ + boolean_t rv; + + if (bpl->bpl_object == 0) + return (B_TRUE); + + mutex_enter(&bpl->bpl_lock); + VERIFY(0 == bplist_hold(bpl)); /* XXX */ + rv = (bpl->bpl_phys->bpl_entries == 0); + mutex_exit(&bpl->bpl_lock); + + return (rv); +} + +static int +bplist_cache(bplist_t *bpl, uint64_t blkid) +{ + int err = 0; + + if (bpl->bpl_cached_dbuf == NULL || + bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) { + if (bpl->bpl_cached_dbuf != NULL) + dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); + err = dmu_buf_hold(bpl->bpl_mos, + bpl->bpl_object, blkid << bpl->bpl_blockshift, + bpl, &bpl->bpl_cached_dbuf); + ASSERT(err || bpl->bpl_cached_dbuf->db_size == + 1ULL << bpl->bpl_blockshift); + } + return (err); +} + +int +bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) +{ + uint64_t blk, off; + blkptr_t *bparray; + int err; + + mutex_enter(&bpl->bpl_lock); + + err = bplist_hold(bpl); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + if (*itorp >= bpl->bpl_phys->bpl_entries) { + mutex_exit(&bpl->bpl_lock); + return (ENOENT); + } + + blk = *itorp >> bpl->bpl_bpshift; + off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); + + err = bplist_cache(bpl, blk); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + bparray = bpl->bpl_cached_dbuf->db_data; + *bp = bparray[off]; + (*itorp)++; + mutex_exit(&bpl->bpl_lock); + return (0); +} + +int +bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) +{ + uint64_t blk, off; + blkptr_t *bparray; + int err; + + ASSERT(!BP_IS_HOLE(bp)); + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err) + return (err); + + blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; + off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); + + err = bplist_cache(bpl, blk); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx); + bparray = bpl->bpl_cached_dbuf->db_data; + bparray[off] = *bp; + + /* We never need the fill count. */ + bparray[off].blk_fill = 0; + + /* The bplist will compress better if we can leave off the checksum */ + bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); + + dmu_buf_will_dirty(bpl->bpl_dbuf, tx); + bpl->bpl_phys->bpl_entries++; + bpl->bpl_phys->bpl_bytes += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp); + if (bpl->bpl_havecomp) { + bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); + bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpl->bpl_lock); + + return (0); +} + +/* + * Deferred entry; will be written later by bplist_sync(). + */ +void +bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) +{ + bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); + + ASSERT(!BP_IS_HOLE(bp)); + mutex_enter(&bpl->bpl_lock); + bpq->bpq_blk = *bp; + bpq->bpq_next = bpl->bpl_queue; + bpl->bpl_queue = bpq; + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_sync(bplist_t *bpl, dmu_tx_t *tx) +{ + bplist_q_t *bpq; + + mutex_enter(&bpl->bpl_lock); + while ((bpq = bpl->bpl_queue) != NULL) { + bpl->bpl_queue = bpq->bpq_next; + mutex_exit(&bpl->bpl_lock); + VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx)); + kmem_free(bpq, sizeof (*bpq)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) +{ + mutex_enter(&bpl->bpl_lock); + ASSERT3P(bpl->bpl_queue, ==, NULL); + VERIFY(0 == bplist_hold(bpl)); + dmu_buf_will_dirty(bpl->bpl_dbuf, tx); + VERIFY(0 == dmu_free_range(bpl->bpl_mos, + bpl->bpl_object, 0, -1ULL, tx)); + bpl->bpl_phys->bpl_entries = 0; + bpl->bpl_phys->bpl_bytes = 0; + if (bpl->bpl_havecomp) { + bpl->bpl_phys->bpl_comp = 0; + bpl->bpl_phys->bpl_uncomp = 0; + } + mutex_exit(&bpl->bpl_lock); +} + +int +bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err; + + mutex_enter(&bpl->bpl_lock); + + err = bplist_hold(bpl); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + *usedp = bpl->bpl_phys->bpl_bytes; + if (bpl->bpl_havecomp) { + *compp = bpl->bpl_phys->bpl_comp; + *uncompp = bpl->bpl_phys->bpl_uncomp; + } + mutex_exit(&bpl->bpl_lock); + + if (!bpl->bpl_havecomp) { + uint64_t itor = 0, comp = 0, uncomp = 0; + blkptr_t bp; + + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + comp += BP_GET_PSIZE(&bp); + uncomp += BP_GET_UCSIZE(&bp); + } + if (err == ENOENT) + err = 0; + *compp = comp; + *uncompp = uncomp; + } + + return (err); +} + +/* + * Return (in *dasizep) the amount of space on the deadlist which is: + * mintxg < blk_birth <= maxtxg + */ +int +bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *dasizep) +{ + uint64_t size = 0; + uint64_t itor = 0; + blkptr_t bp; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpl_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err == 0) + *dasizep = bpl->bpl_phys->bpl_bytes; + mutex_exit(&bpl->bpl_lock); + return (err); + } + + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { + size += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + } + } + if (err == ENOENT) + err = 0; + *dasizep = size; + return (err); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c new file mode 100644 index 000000000..d04610317 --- /dev/null +++ b/module/zfs/dbuf.c @@ -0,0 +1,2356 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static void dbuf_destroy(dmu_buf_impl_t *db); +static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +static arc_done_func_t dbuf_write_ready; +static arc_done_func_t dbuf_write_done; +static zio_done_func_t dbuf_skip_write_ready; +static zio_done_func_t dbuf_skip_write_done; + +/* + * Global data structures and functions for the dbuf cache. + */ +static kmem_cache_t *dbuf_cache; + +/* ARGSUSED */ +static int +dbuf_cons(void *vdb, void *unused, int kmflag) +{ + dmu_buf_impl_t *db = vdb; + bzero(db, sizeof (dmu_buf_impl_t)); + + mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + refcount_create(&db->db_holds); + return (0); +} + +/* ARGSUSED */ +static void +dbuf_dest(void *vdb, void *unused) +{ + dmu_buf_impl_t *db = vdb; + mutex_destroy(&db->db_mtx); + cv_destroy(&db->db_changed); + refcount_destroy(&db->db_holds); +} + +/* + * dbuf hash table routines + */ +static dbuf_hash_table_t dbuf_hash_table; + +static uint64_t dbuf_hash_count; + +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); + + return (crc); +} + +#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_impl_t *os = dn->dn_objset; + uint64_t obj = dn->dn_object; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *db; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_impl_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid = db->db_blkid; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (dbf->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, 1); + + return (NULL); +} + +/* + * Remove an entry from the hash table. This operation will + * fail if there are any existing holds on the db. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf, **dbp; + + /* + * We musn't hold db_mtx to maintin lock ordering: + * DBUF_HASH_MUTEX > db_mtx. + */ + ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(db->db_state == DB_EVICTING); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, -1); +} + +static arc_evict_func_t dbuf_do_evict; + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0 || db->db_evict_func == NULL) + return; + + if (db->db_user_data_ptr_ptr) + *db->db_user_data_ptr_ptr = db->db.db_data; + db->db_evict_func(&db->db, db->db_user_ptr); + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; +} + +void +dbuf_evict(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL); + ASSERT(db->db_data_pending == NULL); + + dbuf_clear(db); + dbuf_destroy(db); +} + +void +dbuf_init(void) +{ + uint64_t hsize = 1ULL << 16; + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 4K block size. The table will take up + * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). + */ + while (hsize * 4096 < physmem * PAGESIZE) + hsize <<= 1; + +retry: + h->hash_table_mask = hsize - 1; + h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); + if (h->hash_table == NULL) { + /* XXX - we should really return an error instead of assert */ + ASSERT(hsize > (1ULL << 10)); + hsize >>= 1; + goto retry; + } + + dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + sizeof (dmu_buf_impl_t), + 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); + kmem_cache_destroy(dbuf_cache); +} + +/* + * Other stuff. + */ + +#ifdef ZFS_DEBUG +static void +dbuf_verify(dmu_buf_impl_t *db) +{ + dnode_t *dn = db->db_dnode; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + ASSERT(db->db_blkid == DB_BONUS_BLKID || + list_head(&dn->dn_dbufs)); + } + if (db->db_blkid == DB_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). + */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dbuf_dirty_record_t *dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (db->db.db_object == DMU_META_DNODE_OBJECT) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the struct_rwlock. XXX indblksz no longer + * grows. safe to do this now? + */ + if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + db->db.db_data && db->db_blkid != DB_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. + */ + if (db->db_dirtycnt == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } + } +} +#endif + +static void +dbuf_update_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && db->db_user_data_ptr_ptr) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_user_data_ptr_ptr = db->db.db_data; + } +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = buf; + if (buf != NULL) { + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_data(db); + } else { + dbuf_evict_user(db); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; + } +} + +uint64_t +dbuf_whichblock(dnode_t *dn, uint64_t offset) +{ + if (dn->dn_datablkshift) { + return (offset >> dn->dn_datablkshift); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +static void +dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(refcount_count(&db->db_holds) > 0); + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + if (db->db_level == 0 && db->db_freed_in_flight) { + /* we were freed in flight; disregard any error */ + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + db->db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else if (zio == NULL || zio->io_error == 0) { + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + db->db_state = DB_UNCACHED; + } + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); + dbuf_rele(db, NULL); +} + +static void +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) +{ + dnode_t *dn = db->db_dnode; + zbookmark_t zb; + uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; + + ASSERT(!refcount_is_zero(&db->db_holds)); + /* We need the struct_rwlock to prevent db_blkptr from changing. */ + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_buf == NULL); + + if (db->db_blkid == DB_BONUS_BLKID) { + int bonuslen = dn->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); + db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); + if (bonuslen < DN_MAX_BONUSLEN) + bzero(db->db.db_data, DN_MAX_BONUSLEN); + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + bonuslen); + dbuf_update_data(db); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, + db->db.db_size, db, type)); + bzero(db->db.db_data, db->db.db_size); + db->db_state = DB_CACHED; + *flags |= DB_RF_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + db->db_state = DB_READ; + mutex_exit(&db->db_mtx); + + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + + zb.zb_objset = db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + + dbuf_add_ref(db, NULL); + /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, + (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); + if (aflags & ARC_CACHED) + *flags |= DB_RF_CACHED; +} + +int +dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + int err = 0; + int havepzio = (zio != NULL); + int prefetch; + + /* + * We don't have to hold the mutex to check db_state because it + * can't be freed while we have a hold on the buffer. + */ + ASSERT(!refcount_is_zero(&db->db_holds)); + + if (db->db_state == DB_NOFILL) + return (EIO); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + + prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + DBUF_IS_CACHEABLE(db); + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + } else if (db->db_state == DB_UNCACHED) { + if (zio == NULL) { + zio = zio_root(db->db_dnode->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + } + dbuf_read_impl(db, zio, &flags); + + /* dbuf_read_impl has dropped db_mtx for us */ + + if (prefetch) + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size, flags & DB_RF_CACHED); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + + if (!havepzio) + err = zio_wait(zio); + } else { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + + mutex_enter(&db->db_mtx); + if ((flags & DB_RF_NEVERWAIT) == 0) { + while (db->db_state == DB_READ || + db->db_state == DB_FILL) { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + cv_wait(&db->db_changed, &db->db_mtx); + } + if (db->db_state == DB_UNCACHED) + err = EIO; + } + mutex_exit(&db->db_mtx); + } + + ASSERT(err || havepzio || db->db_state == DB_CACHED); + return (err); +} + +static void +dbuf_noread(dmu_buf_impl_t *db) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db, type)); + db->db_state = DB_FILL; + } else if (db->db_state == DB_NOFILL) { + dbuf_set_data(db, NULL); + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + mutex_exit(&db->db_mtx); +} + +/* + * This is our just-in-time copy function. It makes a copy of + * buffers, that have been modified in a previous transaction + * group, before we modify them in the current active group. + * + * This function is used in two places: when we are dirtying a + * buffer for the first time in a txg, and when we are freeing + * a range in a dnode that includes this buffer. + * + * Note that when we are called from dbuf_free_range() we do + * not put a hold on the buffer, we just traverse the active + * dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and its referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there a no active holders) + * just null out the current db_data pointer. + */ + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DB_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); + bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dr->dt.dl.dr_data = arc_buf_alloc( + db->db_dnode->dn_objset->os_spa, size, db, type); + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + dbuf_set_data(db, NULL); + } +} + +void +dbuf_unoverride(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); + ASSERT(db->db_level == 0); + + if (db->db_blkid == DB_BONUS_BLKID || + dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) + return; + + /* free this block */ + if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { + /* XXX can get silent EIO here */ + (void) dsl_free(NULL, + spa_get_dsl(db->db_dnode->dn_objset->os_spa), + txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); + } + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* + * Release the already-written buffer, so we leave it in + * a consistent dirty state. Note that all callers are + * modifying the buffer, so they will immediately do + * another (redundant) arc_release(). Therefore, leave + * the buf thawed to save the effort of freezing & + * immediately re-thawing it. + */ + arc_release(dr->dt.dl.dr_data, db); +} + +/* + * Evict (if its unreferenced) or clear (if its referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen accross any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. + */ +void +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; + + if (end > dn->dn_maxblkid) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + + if (db->db_level != 0) + continue; + dprintf_dbuf(db, "found buf %s\n", ""); + if (db->db_blkid < start || db->db_blkid > end) + continue; + + /* found a level 0 buffer in the range */ + if (dbuf_undirty(db, tx)) + continue; + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL || + db->db_state == DB_EVICTING) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* will be handled in dbuf_read_done or dbuf_rele */ + db->db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + if (refcount_count(&db->db_holds) == 0) { + ASSERT(db->db_buf); + dbuf_clear(db); + continue; + } + /* The dbuf is referenced */ + + if (db->db_last_dirty != NULL) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + if (dr->dr_txg == txg) { + /* + * This buffer is "in-use", re-adjust the file + * size to reflect that this buffer may + * contain new data when we sync. + */ + if (db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + dbuf_unoverride(dr); + } else { + /* + * This dbuf is not dirty in the open context. + * Either uncache it (if its not referenced in + * the open context) or reset its contents to + * empty. + */ + dbuf_fix_old_data(db, txg); + } + } + /* clear the contents if its cached */ + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + arc_buf_freeze(db->db_buf); + } + + mutex_exit(&db->db_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); +} + +static int +dbuf_block_freeable(dmu_buf_impl_t *db) +{ + dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; + uint64_t birth_txg = 0; + + /* + * We don't need any locking to protect db_blkptr: + * If it's syncing, then db_last_dirty will be set + * so we'll ignore db_blkptr. + */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_last_dirty) + birth_txg = db->db_last_dirty->dr_txg; + else if (db->db_blkptr) + birth_txg = db->db_blkptr->blk_birth; + + /* If we don't exist or are in a snapshot, we can't be freed */ + if (birth_txg) + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); + else + return (FALSE); +} + +void +dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) +{ + arc_buf_t *buf, *obuf; + int osize = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + /* XXX does *this* func really need the lock? */ + ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); + + /* + * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * is OK, because there can be no other references to the db + * when we are changing its size, so no concurrent DB_FILL can + * be happening. + */ + /* + * XXX we should be doing a dbuf_read, checking the return + * value and returning that up to our callers + */ + dbuf_will_dirty(db, tx); + + /* create the data buffer for the new block */ + buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); + + /* copy old block data to the new block */ + obuf = db->db_buf; + bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ + if (size > osize) + bzero((uint8_t *)buf->b_data + osize, size - osize); + + mutex_enter(&db->db_mtx); + dbuf_set_data(db, buf); + VERIFY(arc_buf_remove_ref(obuf, db) == 1); + db->db.db_size = size; + + if (db->db_level == 0) { + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + db->db_last_dirty->dt.dl.dr_data = buf; + } + mutex_exit(&db->db_mtx); + + dnode_willuse_space(db->db_dnode, size-osize, tx); +} + +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + dbuf_dirty_record_t **drp, *dr; + int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + DMU_TX_DIRTY_BUF(tx, db); + + /* + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. + * XXX We may want to prohibit dirtying in syncing context even + * if they did pre-dirty. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_objset->os_dsl_dataset == NULL || + dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); + + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. + */ + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL || + db->db_state == DB_NOFILL); + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED && + !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&dn->dn_mtx); + + /* + * If this buffer is already dirty, we're done. + */ + drp = &db->db_last_dirty; + ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT) + arc_buf_thaw(db->db_buf); + } + mutex_exit(&db->db_mtx); + return (dr); + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos, a spa os, or we're initializing the os. However, we are + * allowed to dirty in syncing context provided we already + * dirtied it in open context. Hence we must make this + * assertion only if we're not already dirty. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + os->os_dsl_dataset == NULL || + !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || + !BP_IS_HOLE(os->os_rootbp)); + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dasize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + + /* + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. + */ + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DB_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + } + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + dr->dr_next = *drp; + *drp = dr; + + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. + */ + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DB_BONUS_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + return (dr); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + dnode_willuse_space(dn, -willfree, tx); + } + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (db->db_level+1 < dn->dn_nlevels) { + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + parent_held = TRUE; + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* possible race with dbuf_undirty() */ + if (db->db_last_dirty == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || + db->db_parent == db->db_dnode->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); + return (dr); +} + +static int +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr, **drp; + + ASSERT(txg != 0); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + mutex_enter(&db->db_mtx); + + /* + * If this buffer is not dirty, we're done. + */ + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg <= txg) + break; + if (dr == NULL || dr->dr_txg < txg) { + mutex_exit(&db->db_mtx); + return (0); + } + ASSERT(dr->dr_txg == txg); + + /* + * If this buffer is currently held, we cannot undirty + * it, since one of the current holders may be in the + * middle of an update. Note that users of dbuf_undirty() + * should not place a hold on the dbuf before the call. + */ + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + /* Make sure we don't toss this buffer at sync phase */ + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + return (0); + } + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + ASSERT(db->db.db_size != 0); + + /* XXX would be nice to fix up dn_towrite_space[] */ + + *drp = dr->dr_next; + + if (dr->dr_parent) { + mutex_enter(&dr->dr_parent->dt.di.dr_mtx); + list_remove(&dr->dr_parent->dt.di.dr_children, dr); + mutex_exit(&dr->dr_parent->dt.di.dr_mtx); + } else if (db->db_level+1 == dn->dn_nlevels) { + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); + mutex_exit(&dn->dn_mtx); + } + + if (db->db_level == 0) { + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); + + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + } + } else { + ASSERT(db->db_buf != NULL); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + + if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + arc_buf_t *buf = db->db_buf; + + ASSERT(arc_released(buf)); + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + return (1); + } + + mutex_exit(&db->db_mtx); + return (0); +} + +#pragma weak dmu_buf_will_dirty = dbuf_will_dirty +void +dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + (void) dbuf_read(db, NULL, rf); + (void) dbuf_dirty(db, tx); +} + +void +dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_state = DB_NOFILL; + + dmu_buf_will_fill(db_fake, tx); +} + +void +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(tx->tx_txg != 0); + ASSERT(db->db_level == 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || + dmu_tx_private_ok(tx)); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +#pragma weak dmu_buf_fill_done = dbuf_fill_done +/* ARGSUSED */ +void +dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + if (db->db_state == DB_FILL) { + if (db->db_level == 0 && db->db_freed_in_flight) { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + /* we were freed while filling */ + /* XXX dbuf_undirty? */ + bzero(db->db.db_data, db->db.db_size); + db->db_freed_in_flight = FALSE; + } + db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + +/* + * "Clear" the contents of this dbuf. This will mark the dbuf + * EVICTING and clear *most* of its references. Unfortunetely, + * when we are not holding the dn_dbufs_mtx, we can't clear the + * entry in the dn_dbufs list. We have to wait until dbuf_destroy() + * in this case. For callers from the DMU we will usually see: + * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() + * For the arc callback, we will usually see: + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * Sometimes, though, we will get a mix of these two: + * DMU: dbuf_clear()->arc_buf_evict() + * ARC: dbuf_do_evict()->dbuf_destroy() + */ +void +dbuf_clear(dmu_buf_impl_t *db) +{ + dnode_t *dn = db->db_dnode; + dmu_buf_impl_t *parent = db->db_parent; + dmu_buf_impl_t *dndb = dn->dn_dbuf; + int dbuf_gone = FALSE; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(refcount_is_zero(&db->db_holds)); + + dbuf_evict_user(db); + + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + if (db->db_blkid == DB_BONUS_BLKID) { + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } + db->db.db_data = NULL; + db->db_state = DB_UNCACHED; + } + + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_data_pending == NULL); + + db->db_state = DB_EVICTING; + db->db_blkptr = NULL; + + if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + list_remove(&dn->dn_dbufs, db); + dnode_rele(dn, db); + db->db_dnode = NULL; + } + + if (db->db_buf) + dbuf_gone = arc_buf_evict(db->db_buf); + + if (!dbuf_gone) + mutex_exit(&db->db_mtx); + + /* + * If this dbuf is referened from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (parent && parent != dndb) + dbuf_rele(parent, db); +} + +static int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp) +{ + int nlevels, epbs; + + *parentp = NULL; + *bpp = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + + if (dn->dn_phys->dn_nlevels == 0) + nlevels = 1; + else + nlevels = dn->dn_phys->dn_nlevels; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (level >= nlevels || + (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + return (ENOENT); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, NULL, parentp); + if (err) + return (err); + err = dbuf_read(*parentp, NULL, + (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err) { + dbuf_rele(*parentp, NULL); + *parentp = NULL; + return (err); + } + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + if (dn->dn_dbuf) { + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + } + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_impl_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_last_dirty = NULL; + db->db_dirtycnt = 0; + db->db_dnode = dn; + db->db_parent = parent; + db->db_blkptr = blkptr; + + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; + db->db_immediate_evict = 0; + db->db_freed_in_flight = 0; + + if (blkid == DB_BONUS_BLKID) { + ASSERT3P(parent, ==, dn->dn_dbuf); + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + db->db.db_offset = DB_BONUS_BLKID; + db->db_state = DB_UNCACHED; + /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); + return (db); + } else { + int blocksize = + db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; + db->db.db_size = blocksize; + db->db.db_offset = db->db_blkid * blocksize; + } + + /* + * Hold the dn_dbufs_mtx while we get the new dbuf + * in the hash table *and* added to the dbufs list. + * This prevents a possible deadlock with someone + * trying to look up this dbuf before its added to the + * dn_dbufs list. + */ + mutex_enter(&dn->dn_dbufs_mtx); + db->db_state = DB_EVICTING; + if ((odb = dbuf_hash_insert(db)) != NULL) { + /* someone else inserted it first */ + kmem_cache_free(dbuf_cache, db); + mutex_exit(&dn->dn_dbufs_mtx); + return (odb); + } + list_insert_head(&dn->dn_dbufs, db); + db->db_state = DB_UNCACHED; + mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); + + if (parent && parent != dn->dn_dbuf) + dbuf_add_ref(parent, db); + + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + refcount_count(&dn->dn_holds) > 0); + (void) refcount_add(&dn->dn_holds, db); + + dprintf_dbuf(db, "db=%p\n", db); + + return (db); +} + +static int +dbuf_do_evict(void *private) +{ + arc_buf_t *buf = private; + dmu_buf_impl_t *db = buf->b_private; + + if (!MUTEX_HELD(&db->db_mtx)) + mutex_enter(&db->db_mtx); + + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_state != DB_EVICTING) { + ASSERT(db->db_state == DB_CACHED); + DBUF_VERIFY(db); + db->db_buf = NULL; + dbuf_evict(db); + } else { + mutex_exit(&db->db_mtx); + dbuf_destroy(db); + } + return (0); +} + +static void +dbuf_destroy(dmu_buf_impl_t *db) +{ + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * If this dbuf is still on the dn_dbufs list, + * remove it from that list. + */ + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + + mutex_enter(&dn->dn_dbufs_mtx); + list_remove(&dn->dn_dbufs, db); + mutex_exit(&dn->dn_dbufs_mtx); + + dnode_rele(dn, db); + db->db_dnode = NULL; + } + dbuf_hash_remove(db); + } + db->db_parent = NULL; + db->db_buf = NULL; + + ASSERT(!list_link_active(&db->db_link)); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + + kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); +} + +void +dbuf_prefetch(dnode_t *dn, uint64_t blkid) +{ + dmu_buf_impl_t *db = NULL; + blkptr_t *bp = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (dnode_block_freed(dn, blkid)) + return; + + /* dbuf_find() returns with db_mtx held */ + if (db = dbuf_find(dn, 0, blkid)) { + if (refcount_count(&db->db_holds) > 0) { + /* + * This dbuf is active. We assume that it is + * already CACHED, or else about to be either + * read or filled. + */ + mutex_exit(&db->db_mtx); + return; + } + mutex_exit(&db->db_mtx); + db = NULL; + } + + if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { + if (bp && !BP_IS_HOLE(bp)) { + arc_buf_t *pbuf; + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + zb.zb_objset = dn->dn_objset->os_dsl_dataset ? + dn->dn_objset->os_dsl_dataset->ds_object : 0; + zb.zb_object = dn->dn_object; + zb.zb_level = 0; + zb.zb_blkid = blkid; + + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + if (db) + dbuf_rele(db, NULL); + } +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, + void *tag, dmu_buf_impl_t **dbp) +{ + dmu_buf_impl_t *db, *parent = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; +top: + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn, level, blkid); + + if (db == NULL) { + blkptr_t *bp = NULL; + int err; + + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = ENOENT; + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); + } + } + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); + } + + if (db->db_buf && refcount_is_zero(&db->db_holds)) { + arc_buf_add_ref(db->db_buf, db); + if (db->db_buf->b_data == NULL) { + dbuf_clear(db); + if (parent) { + dbuf_rele(parent, NULL); + parent = NULL; + } + goto top; + } + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + } + + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg. + */ + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, + arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db, type)); + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); + } + } + + (void) refcount_add(&db->db_holds, tag); + dbuf_update_data(db); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent) + dbuf_rele(parent, NULL); + + ASSERT3P(db->db_dnode, ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); + return (err ? NULL : db); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + return (err ? NULL : db); +} + +void +dbuf_create_bonus(dnode_t *dn) +{ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + ASSERT(dn->dn_bonus == NULL); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds = refcount_add(&db->db_holds, tag); + ASSERT(holds > 1); +} + +#pragma weak dmu_buf_rele = dbuf_rele +void +dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds; + + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + holds = refcount_remove(&db->db_holds, tag); + ASSERT(holds >= 0); + + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + arc_buf_freeze(db->db_buf); + + if (holds == db->db_dirtycnt && + db->db_level == 0 && db->db_immediate_evict) + dbuf_evict_user(db); + + if (holds == 0) { + if (db->db_blkid == DB_BONUS_BLKID) { + mutex_exit(&db->db_mtx); + dnode_rele(db->db_dnode, db); + } else if (db->db_buf == NULL) { + /* + * This is a special case: we never associated this + * dbuf with any data allocated from the ARC. + */ + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + dbuf_evict(db); + } else if (arc_released(db->db_buf)) { + arc_buf_t *buf = db->db_buf; + /* + * This dbuf has anonymous data associated with it. + */ + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + } else { + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); + } + } else { + mutex_exit(&db->db_mtx); + } +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (refcount_count(&db->db_holds)); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + ASSERT((user_ptr == NULL) == (evict_func == NULL)); + + mutex_enter(&db->db_mtx); + + if (db->db_user_ptr == old_user_ptr) { + db->db_user_ptr = user_ptr; + db->db_user_data_ptr_ptr = user_data_ptr_ptr; + db->db_evict_func = evict_func; + + dbuf_update_data(db); + } else { + old_user_ptr = db->db_user_ptr; + } + + mutex_exit(&db->db_mtx); + return (old_user_ptr); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_user_ptr); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there was + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mis-match). + */ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, db->db_level+1, + db->db_blkid >> epbs, FALSE, db, &parent); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } +} + +static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + if (db->db_buf == NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + mutex_enter(&db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + ASSERT(db->db_buf != NULL); + + dbuf_check_blkptr(dn, db); + + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + dbuf_write(dr, db->db_buf, tx); + + zio = dr->dr_zio; + mutex_enter(&dr->dt.di.dr_mtx); + dbuf_sync_list(&dr->dt.di.dr_children, tx); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_exit(&dr->dt.di.dr_mtx); + zio_nowait(zio); +} + +static void +dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + arc_buf_t **datap = &dr->dt.dl.dr_data; + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + int blksz; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + /* + * To be synced, we must be dirtied. But we + * might have been freed after the dirty. + */ + if (db->db_state == DB_UNCACHED) { + /* This buffer has been freed since it was dirtied */ + ASSERT(db->db.db_data == NULL); + } else if (db->db_state == DB_FILL) { + /* This buffer was freed and is now being re-filled */ + ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else { + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); + } + DBUF_VERIFY(db); + + /* + * If this is a bonus buffer, simply copy the bonus data into the + * dnode. It will be written out when the dnode is synced (and it + * will be synced, since it must have been dirty for dbuf_sync to + * be called). + */ + if (db->db_blkid == DB_BONUS_BLKID) { + dbuf_dirty_record_t **drp; + + ASSERT(*datap != NULL); + ASSERT3U(db->db_level, ==, 0); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + if (*datap != db->db.db_data) { + zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } + db->db_data_pending = NULL; + drp = &db->db_last_dirty; + while (*drp != dr) + drp = &(*drp)->dr_next; + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_rele(db, (void *)(uintptr_t)txg); + return; + } + + /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* + * If this buffer is in the middle of an immdiate write, + * wait for the synchronous IO to complete. + */ + while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + cv_wait(&db->db_changed, &db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); + } + + /* + * If this dbuf has already been written out via an immediate write, + * just complete the write by copying over the new block pointer and + * updating the accounting via the write-completion functions. + */ + if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + zio_t zio_fake; + + zio_fake.io_private = &db; + zio_fake.io_error = 0; + zio_fake.io_bp = db->db_blkptr; + zio_fake.io_bp_orig = *db->db_blkptr; + zio_fake.io_txg = txg; + zio_fake.io_flags = 0; + + *db->db_blkptr = dr->dt.dl.dr_overridden_by; + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + db->db_data_pending = dr; + dr->dr_zio = &zio_fake; + mutex_exit(&db->db_mtx); + + ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), + BP_IDENTITY(&zio_fake.io_bp_orig)) || + BP_IS_HOLE(zio_fake.io_bp)); + + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) + (void) dsl_dataset_block_kill(os->os_dsl_dataset, + &zio_fake.io_bp_orig, dn->dn_zio, tx); + + dbuf_write_ready(&zio_fake, db->db_buf, db); + dbuf_write_done(&zio_fake, db->db_buf, db); + + return; + } + + if (db->db_state != DB_NOFILL) { + blksz = arc_buf_size(*datap); + + if (dn->dn_object != DMU_META_DNODE_OBJECT) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ + if (refcount_count(&db->db_holds) > 1 && + *datap == db->db_buf) { + arc_buf_contents_t type = + DBUF_GET_BUFC_TYPE(db); + *datap = + arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); + } + } + + ASSERT(*datap != NULL); + } + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + dbuf_write(dr, *datap, tx); + + ASSERT(!list_link_active(&dr->dr_dirty_node)); + if (dn->dn_object == DMU_META_DNODE_OBJECT) + list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); + else + zio_nowait(dr->dr_zio); +} + +void +dbuf_sync_list(list_t *list, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + if (dr->dr_zio != NULL) { + /* + * If we find an already initialized zio then we + * are processing the meta-dnode, and we have finished. + * The dbufs for all dnodes are put back on the list + * during processing, so that we can zio_wait() + * these IOs after initiating all child IOs. + */ + ASSERT3U(dr->dr_dbuf->db.db_object, ==, + DMU_META_DNODE_OBJECT); + break; + } + list_remove(list, dr); + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } +} + +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + writeprops_t wp = { 0 }; + zio_t *zio; + + if (!BP_IS_HOLE(db->db_blkptr) && + (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(data, db); + } else if (db->db_state != DB_NOFILL) { + ASSERT(arc_released(data)); + /* XXX why do we need to thaw here? */ + arc_buf_thaw(data); + } + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + + wp.wp_type = dn->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dncompress = dn->dn_compress; + wp.wp_oscompress = os->os_compress; + wp.wp_dnchecksum = dn->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + + if (BP_IS_OLDER(db->db_blkptr, txg)) + (void) dsl_dataset_block_kill( + os->os_dsl_dataset, db->db_blkptr, zio, tx); + + if (db->db_state == DB_NOFILL) { + zio_prop_t zp = { 0 }; + + write_policy(os->os_spa, &wp, &zp); + dr->dr_zio = zio_write(zio, os->os_spa, + txg, db->db_blkptr, NULL, + db->db.db_size, &zp, dbuf_skip_write_ready, + dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED, &zb); + } else { + dr->dr_zio = arc_write(zio, os->os_spa, &wp, + DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, + data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } +} + +/* wrapper function for dbuf_write_ready bypassing ARC */ +static void +dbuf_skip_write_ready(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (!BP_IS_GANG(bp)) + zio_skip_write(zio); + + dbuf_write_ready(zio, NULL, zio->io_private); +} + +/* wrapper function for dbuf_write_done bypassing ARC */ +static void +dbuf_skip_write_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +/* ARGSUSED */ +static void +dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint64_t fill = 0; + int old_size, new_size, i; + + ASSERT(db->db_blkptr == bp); + + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); + + old_size = bp_get_dasize(os->os_spa, bp_orig); + new_size = bp_get_dasize(os->os_spa, bp); + + dnode_diduse_space(dn, new_size - old_size); + + if (BP_IS_HOLE(bp)) { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + if (bp_orig->blk_birth == tx->tx_txg) + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + ASSERT3U(bp->blk_fill, ==, 0); + return; + } + + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + + mutex_enter(&db->db_mtx); + + if (db->db_level == 0) { + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + if (dn->dn_type == DMU_OT_DNODE) { + dnode_phys_t *dnp = db->db.db_data; + for (i = db->db.db_size >> DNODE_SHIFT; i > 0; + i--, dnp++) { + if (dnp->dn_type != DMU_OT_NONE) + fill++; + } + } else { + fill = 1; + } + } else { + blkptr_t *ibp = db->db.db_data; + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) + continue; + ASSERT3U(BP_GET_LSIZE(ibp), ==, + db->db_level == 1 ? dn->dn_datablksz : + (1<<dn->dn_phys->dn_indblkshift)); + fill += ibp->blk_fill; + } + } + + bp->blk_fill = fill; + + mutex_exit(&db->db_mtx); + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + if (bp_orig->blk_birth == tx->tx_txg) + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + dsl_dataset_block_born(ds, bp, tx); + } +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + uint64_t txg = zio->io_txg; + dbuf_dirty_record_t **drp, *dr; + + ASSERT3U(zio->io_error, ==, 0); + + mutex_enter(&db->db_mtx); + + drp = &db->db_last_dirty; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; + + if (db->db_level == 0) { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + + if (db->db_state != DB_NOFILL) { + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + else if (!BP_IS_HOLE(db->db_blkptr)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); + else + ASSERT(arc_released(db->db_buf)); + } + } else { + dnode_t *dn = db->db_dnode; + + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = + dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + ASSERT3U(dn->dn_phys->dn_maxblkid + >> (db->db_level * epbs), >=, db->db_blkid); + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; + mutex_exit(&db->db_mtx); + + dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); + + dbuf_rele(db, (void *)(uintptr_t)txg); +} diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c new file mode 100644 index 000000000..b6205bd50 --- /dev/null +++ b/module/zfs/dmu.c @@ -0,0 +1,1227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_prop.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <sys/zfs_znode.h> +#endif + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + { byteswap_uint8_array, TRUE, "unallocated" }, + { zap_byteswap, TRUE, "object directory" }, + { byteswap_uint64_array, TRUE, "object array" }, + { byteswap_uint8_array, TRUE, "packed nvlist" }, + { byteswap_uint64_array, TRUE, "packed nvlist size" }, + { byteswap_uint64_array, TRUE, "bplist" }, + { byteswap_uint64_array, TRUE, "bplist header" }, + { byteswap_uint64_array, TRUE, "SPA space map header" }, + { byteswap_uint64_array, TRUE, "SPA space map" }, + { byteswap_uint64_array, TRUE, "ZIL intent log" }, + { dnode_buf_byteswap, TRUE, "DMU dnode" }, + { dmu_objset_byteswap, TRUE, "DMU objset" }, + { byteswap_uint64_array, TRUE, "DSL directory" }, + { zap_byteswap, TRUE, "DSL directory child map"}, + { zap_byteswap, TRUE, "DSL dataset snap map" }, + { zap_byteswap, TRUE, "DSL props" }, + { byteswap_uint64_array, TRUE, "DSL dataset" }, + { zfs_znode_byteswap, TRUE, "ZFS znode" }, + { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, + { byteswap_uint8_array, FALSE, "ZFS plain file" }, + { zap_byteswap, TRUE, "ZFS directory" }, + { zap_byteswap, TRUE, "ZFS master node" }, + { zap_byteswap, TRUE, "ZFS delete queue" }, + { byteswap_uint8_array, FALSE, "zvol object" }, + { zap_byteswap, TRUE, "zvol prop" }, + { byteswap_uint8_array, FALSE, "other uint8[]" }, + { byteswap_uint64_array, FALSE, "other uint64[]" }, + { zap_byteswap, TRUE, "other ZAP" }, + { zap_byteswap, TRUE, "persistent error log" }, + { byteswap_uint8_array, TRUE, "SPA history" }, + { byteswap_uint64_array, TRUE, "SPA history offsets" }, + { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "DSL permissions" }, + { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, + { byteswap_uint8_array, TRUE, "FUID table" }, + { byteswap_uint64_array, TRUE, "FUID table size" }, + { zap_byteswap, TRUE, "DSL dataset next clones"}, + { zap_byteswap, TRUE, "scrub work queue" }, +}; + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + blkid = dbuf_whichblock(dn, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) { + err = EIO; + } else { + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, tag); + db = NULL; + } + } + + dnode_rele(dn, FTAG); + *dbp = &db->db; + return (err); +} + +int +dmu_bonus_max(void) +{ + return (DN_MAX_BONUSLEN); +} + +int +dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + if (dn->dn_bonus != (dmu_buf_impl_t *)db) + return (EINVAL); + if (newsize < 0 || newsize > db->db_size) + return (EINVAL); + dnode_setbonuslen(dn, newsize, tx); + return (0); +} + +/* + * returns ENOENT, EIO, or 0. + */ +int +dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + dmu_buf_impl_t *db; + int error; + + error = dnode_hold(os->os, object, FTAG, &dn); + if (error) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus == NULL) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dbuf_create_bonus(dn); + } + db = dn->dn_bonus; + rw_exit(&dn->dn_struct_rwlock); + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) + VERIFY(dnode_add_ref(dn, db)); + + dnode_rele(dn, FTAG); + + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); + + *dbp = &db->db; + return (0); +} + +/* + * Note: longer-term, we should modify all of the dmu_buf_*() interfaces + * to take a held dnode rather than <os, object> -- the lookup is wasteful, + * and can induce severe lock contention when writing to several files + * whose dnodes are in the same block. + */ +static int +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dsl_pool_t *dp = NULL; + dmu_buf_t **dbp; + uint64_t blkid, nblks, i; + uint32_t flags; + int err; + zio_t *zio; + hrtime_t start; + + ASSERT(length <= DMU_MAX_ACCESS); + + flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; + if (length > zfetch_array_rd_sz) + flags |= DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - + P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + return (EIO); + } + nblks = 1; + } + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array(dbp, nblks, tag); + zio_nowait(zio); + return (EIO); + } + /* initiate async i/o */ + if (read) { + rw_exit(&dn->dn_struct_rwlock); + (void) dbuf_read(db, zio, flags); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + } + dbp[i] = &db->db; + } + rw_exit(&dn->dn_struct_rwlock); + + /* wait for async i/o */ + err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } + + *numbufsp = nblks; + *dbpp = dbp; + return (0); +} + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + int err; + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp); + + return (err); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); +} + +void +dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +{ + dnode_t *dn; + uint64_t blkid; + int nblks, i, err; + + if (zfs_prefetch_disable) + return; + + if (len == 0) { /* they're interested in the bonus buffer */ + dn = os->os->os_meta_dnode; + + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, blkid); + rw_exit(&dn->dn_struct_rwlock); + return; + } + + /* + * XXX - Note, if the dnode for the requested object is not + * already cached, we will do a *synchronous* read in the + * dnode_hold() call. The same is true for any indirects. + */ + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - + P2ALIGN(offset, 1<<blkshift)) >> blkshift; + } else { + nblks = (offset < dn->dn_datablksz); + } + + if (nblks != 0) { + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) + dbuf_prefetch(dn, blkid+i); + } + + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); +} + +static int +get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +{ + uint64_t len = *offset - limit; + uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; + uint64_t subchunk = + dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + + ASSERT(limit <= *offset); + + if (len <= chunk_len) { + *offset = limit; + return (0); + } + + ASSERT(ISP2(subchunk)); + + while (*offset > limit) { + uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); + uint64_t delta; + int err; + + /* skip over allocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + ASSERT3U(*offset, <=, initial_offset); + *offset = P2ALIGN(*offset, subchunk); + delta = initial_offset - *offset; + if (delta >= chunk_len) { + *offset += delta - chunk_len; + return (0); + } + chunk_len -= delta; + + /* skip over unallocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + if (*offset < limit) + *offset = limit; + ASSERT3U(*offset, <, initial_offset); + } + return (0); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, + uint64_t length, boolean_t free_dnode) +{ + dmu_tx_t *tx; + uint64_t object_size, start, end, len; + boolean_t trunc = (length == DMU_OBJECT_END); + int align, err; + + align = 1 << dn->dn_datablkshift; + ASSERT(align > 0); + object_size = align == 1 ? dn->dn_datablksz : + (dn->dn_maxblkid + 1) << dn->dn_datablkshift; + + if (trunc || (end = offset + length) > object_size) + end = object_size; + if (end <= offset) + return (0); + length = end - offset; + + while (length) { + start = end; + err = get_next_chunk(dn, &start, offset); + if (err) + return (err); + len = trunc ? DMU_OBJECT_END : end - start; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, start, len); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dnode_free_range(dn, start, trunc ? -1 : len, tx); + + if (start == 0 && free_dnode) { + ASSERT(trunc); + dnode_free(dn, tx); + } + + length -= end - start; + + dmu_tx_commit(tx); + end = start; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_object(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_tx_t *tx; + int err; + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err != 0) + return (err); + if (dn->dn_nlevels == 1) { + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } else { + err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + } + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + ASSERT(offset < UINT64_MAX); + ASSERT(size == -1ULL || size <= UINT64_MAX - offset); + dnode_free_range(dn, offset, size, tx); + dnode_rele(dn, FTAG); + return (0); +} + +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf) +{ + dnode_t *dn; + dmu_buf_t **dbp; + int numbufs, i, err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + + /* + * Deal with odd block sizes, where there can't be data past the first + * block. If we ever do the tail block optimization, we will need to + * handle that here as well. + */ + if (dn->dn_datablkshift == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)buf + newsz, size - newsz); + size = newsz; + } + + while (size > 0) { + uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, + TRUE, FTAG, &numbufs, &dbp); + if (err) + break; + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + bcopy(buf, (char *)db->db_data + bufoff, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; + + dmu_buf_will_not_fill(db, tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +#ifdef _KERNEL +int +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +{ + dmu_buf_t **dbp; + int numbufs, i, err; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, + &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err = 0; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + /* + * XXX uiomove could block forever (eg. nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. + */ + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + page_t *pp, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(pp, S_READ); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(pp, va); + pp = pp->p_next; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} +#endif + +typedef struct { + dbuf_dirty_record_t *dr; + dmu_sync_cb_t *done; + void *arg; +} dmu_sync_arg_t; + +/* ARGSUSED */ +static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + blkptr_t *bp = zio->io_bp; + + if (!BP_IS_HOLE(bp)) { + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } +} + +/* ARGSUSED */ +static void +dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + dmu_sync_cb_t *done = in->done; + + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); + + if (done) + done(&(db->db), in->arg); + + kmem_free(in, sizeof (dmu_sync_arg_t)); +} + +/* + * Intent log support: sync the block associated with db to disk. + * N.B. and XXX: the caller is responsible for making sure that the + * data isn't changing while dmu_sync() is writing it. + * + * Return values: + * + * EEXIST: this txg has already been synced, so there's nothing to to. + * The caller should not log the write. + * + * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. + * The caller should not log the write. + * + * EALREADY: this block is already in the process of being synced. + * The caller should track its progress (somehow). + * + * EINPROGRESS: the IO has been initiated. + * The caller should log this blkptr in the callback. + * + * 0: completed. Sets *bp to the blkptr just written. + * The caller should log this blkptr immediately. + */ +int +dmu_sync(zio_t *pio, dmu_buf_t *db_fake, + blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + objset_impl_t *os = db->db_objset; + dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; + tx_state_t *tx = &dp->dp_tx; + dbuf_dirty_record_t *dr; + dmu_sync_arg_t *in; + zbookmark_t zb; + writeprops_t wp = { 0 }; + zio_t *zio; + int err; + + ASSERT(BP_IS_HOLE(bp)); + ASSERT(txg != 0); + + dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", + txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); + + /* + * XXX - would be nice if we could do this without suspending... + */ + txg_suspend(dp); + + /* + * If this txg already synced, there's nothing to do. + */ + if (txg <= tx->tx_synced_txg) { + txg_resume(dp); + /* + * If we're running ziltest, we need the blkptr regardless. + */ + if (txg > spa_freeze_txg(dp->dp_spa)) { + /* if db_blkptr == NULL, this was an empty write */ + if (db->db_blkptr) + *bp = *db->db_blkptr; /* structure assignment */ + return (0); + } + return (EEXIST); + } + + mutex_enter(&db->db_mtx); + + if (txg == tx->tx_syncing_txg) { + while (db->db_data_pending) { + /* + * IO is in-progress. Wait for it to finish. + * XXX - would be nice to be able to somehow "attach" + * this zio to the parent zio passed in. + */ + cv_wait(&db->db_changed, &db->db_mtx); + if (!db->db_data_pending && + db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { + /* + * IO was compressed away + */ + *bp = *db->db_blkptr; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + ASSERT(db->db_data_pending || + (db->db_blkptr && db->db_blkptr->blk_birth == txg)); + } + + if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { + /* + * IO is already completed. + */ + *bp = *db->db_blkptr; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + } + + dr = db->db_last_dirty; + while (dr && dr->dr_txg > txg) + dr = dr->dr_next; + if (dr == NULL || dr->dr_txg < txg) { + /* + * This dbuf isn't dirty, must have been free_range'd. + * There's no need to log writes to freed blocks, so we're done. + */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (ENOENT); + } + + ASSERT(dr->dr_txg == txg); + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + /* + * We have already issued a sync write for this buffer. + */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (EALREADY); + } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * This buffer has already been synced. It could not + * have been dirtied since, or we would have cleared the state. + */ + *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + + dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + in->dr = dr; + in->done = done; + in->arg = arg; + mutex_exit(&db->db_mtx); + txg_resume(dp); + + zb.zb_objset = os->os_dsl_dataset->ds_object; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + + wp.wp_type = db->db_dnode->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dnchecksum = db->db_dnode->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + wp.wp_dncompress = db->db_dnode->dn_compress; + wp.wp_oscompress = os->os_compress; + + ASSERT(BP_IS_HOLE(bp)); + + zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), + txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + if (pio) { + zio_nowait(zio); + err = EINPROGRESS; + } else { + err = zio_wait(zio); + ASSERT(err == 0); + } + return (err); +} + +int +dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_blksz(dn, size, ibs, tx); + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); + ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + dn->dn_checksum = checksum; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +void +dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); + ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); + dn->dn_compress = compress; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +int +dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) +{ + dnode_t *dn; + int i, err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + /* + * Sync any current changes before + * we go trundling through the block pointers. + */ + for (i = 0; i < TXG_SIZE; i++) { + if (list_link_active(&dn->dn_dirty_link[i])) + break; + } + if (i != TXG_SIZE) { + dnode_rele(dn, FTAG); + txg_wait_synced(dmu_objset_pool(os), 0); + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + } + + err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); + dnode_rele(dn, FTAG); + + return (err); +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + doi->doi_data_block_size = dn->dn_datablksz; + doi->doi_metadata_block_size = dn->dn_indblkshift ? + 1ULL << dn->dn_indblkshift : 0; + doi->doi_indirection = dn->dn_nlevels; + doi->doi_checksum = dn->dn_checksum; + doi->doi_compress = dn->dn_compress; + doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; + doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; + doi->doi_type = dn->dn_type; + doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_bonus_type = dn->dn_bonustype; + + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); +} + +/* + * Get information on a DMU object. + * If doi is NULL, just indicates whether the object exists. + */ +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); + + if (err) + return (err); + + if (doi != NULL) + dmu_object_info_from_dnode(dn, doi); + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * As above, but faster; can be used when you have a held dbuf in hand. + */ +void +dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +{ + dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); +} + +/* + * Faster still when you only care about the size. + * This is specifically optimized for zfs_getattr(). + */ +void +dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + *blksize = dn->dn_datablksz; + /* add 1 for dnode space */ + *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> + SPA_MINBLOCKSHIFT) + 1; +} + +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + uint32_t *buf = vbuf; + size_t count = size >> 2; + int i; + + ASSERT((size & 3) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_32(buf[i]); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + uint16_t *buf = vbuf; + size_t count = size >> 1; + int i; + + ASSERT((size & 1) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_16(buf[i]); +} + +/* ARGSUSED */ +void +byteswap_uint8_array(void *vbuf, size_t size) +{ +} + +void +dmu_init(void) +{ + dbuf_init(); + dnode_init(); + arc_init(); + l2arc_init(); +} + +void +dmu_fini(void) +{ + arc_fini(); + dnode_fini(); + dbuf_fini(); + l2arc_fini(); +} diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c new file mode 100644 index 000000000..1b9247d66 --- /dev/null +++ b/module/zfs/dmu_object.c @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dnode.h> + +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + objset_impl_t *osi = os->os; + uint64_t object; + uint64_t L2_dnode_count = DNODES_PER_BLOCK << + (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + dnode_t *dn = NULL; + int restarted = B_FALSE; + + mutex_enter(&osi->os_obj_lock); + for (;;) { + object = osi->os_obj_next; + /* + * Each time we polish off an L2 bp worth of dnodes + * (2^13 objects), move to another L2 bp that's still + * reasonably sparse (at most 1/4 full). Look from the + * beginning once, but after that keep looking from here. + * If we can't find one, just keep going from here. + */ + if (P2PHASE(object, L2_dnode_count) == 0) { + uint64_t offset = restarted ? object << DNODE_SHIFT : 0; + int error = dnode_next_offset(osi->os_meta_dnode, + DNODE_FIND_HOLE, + &offset, 2, DNODES_PER_BLOCK >> 2, 0); + restarted = B_TRUE; + if (error == 0) + object = offset >> DNODE_SHIFT; + } + osi->os_obj_next = ++object; + + /* + * XXX We should check for an i/o error here and return + * up to our caller. Actually we should pre-read it in + * dmu_tx_assign(), but there is currently no mechanism + * to do so. + */ + (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + FTAG, &dn); + if (dn) + break; + + if (dmu_object_next(os, &object, B_TRUE, 0) == 0) + osi->os_obj_next = object - 1; + } + + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + mutex_exit(&osi->os_obj_lock); + + dmu_tx_add_new_object(tx, os, object); + return (object); +} + +int +dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (EBADF); + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + if (err) + return (err); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + dmu_tx_add_new_object(tx, os, object); + return (0); +} + +int +dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (EBADF); + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); + + ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) +{ + uint64_t offset = (*objectp + 1) << DNODE_SHIFT; + int error; + + error = dnode_next_offset(os->os->os_meta_dnode, + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); + + *objectp = offset >> DNODE_SHIFT; + + return (error); +} diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c new file mode 100644 index 000000000..7981e0682 --- /dev/null +++ b/module/zfs/dmu_objset.c @@ -0,0 +1,1228 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/cred.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/zvol.h> +#include <sys/dmu_tx.h> +#include <sys/zio_checksum.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/dmu_impl.h> +#include <sys/zfs_ioctl.h> + +spa_t * +dmu_objset_spa(objset_t *os) +{ + return (os->os->os_spa); +} + +zilog_t * +dmu_objset_zil(objset_t *os) +{ + return (os->os->os_zil); +} + +dsl_pool_t * +dmu_objset_pool(objset_t *os) +{ + dsl_dataset_t *ds; + + if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + return (ds->ds_dir->dd_pool); + else + return (spa_get_dsl(os->os->os_spa)); +} + +dsl_dataset_t * +dmu_objset_ds(objset_t *os) +{ + return (os->os->os_dsl_dataset); +} + +dmu_objset_type_t +dmu_objset_type(objset_t *os) +{ + return (os->os->os_phys->os_type); +} + +void +dmu_objset_name(objset_t *os, char *buf) +{ + dsl_dataset_name(os->os->os_dsl_dataset, buf); +} + +uint64_t +dmu_objset_id(objset_t *os) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + + return (ds ? ds->ds_object : 0); +} + +static void +checksum_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); +} + +static void +compression_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval != ZIO_COMPRESS_INHERIT); + + osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); +} + +static void +copies_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval > 0); + ASSERT(newval <= spa_max_replication(osi->os_spa)); + + osi->os_copies = newval; +} + +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_secondary_cache = newval; +} + +void +dmu_objset_byteswap(void *buf, size_t size) +{ + objset_phys_t *osp = buf; + + ASSERT(size == sizeof (objset_phys_t)); + dnode_byteswap(&osp->os_meta_dnode); + byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); + osp->os_type = BSWAP_64(osp->os_type); +} + +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_impl_t **osip) +{ + objset_impl_t *osi; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + + osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); + osi->os.os = osi; + osi->os_dsl_dataset = ds; + osi->os_spa = spa; + osi->os_rootbp = bp; + if (!BP_IS_HOLE(osi->os_rootbp)) { + uint32_t aflags = ARC_WAIT; + zbookmark_t zb; + zb.zb_objset = ds ? ds->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + if (DMU_OS_IS_L2CACHEABLE(osi)) + aflags |= ARC_L2CACHE; + + dprintf_bp(osi->os_rootbp, "reading %s", ""); + /* + * NB: when bprewrite scrub can change the bp, + * and this is called from dmu_objset_open_ds_os, the bp + * could change, and we'll need a lock. + */ + err = arc_read_nolock(NULL, spa, osi->os_rootbp, + arc_getbuf_func, &osi->os_phys_buf, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); + if (err) { + kmem_free(osi, sizeof (objset_impl_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; + return (err); + } + osi->os_phys = osi->os_phys_buf->b_data; + } else { + osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), + &osi->os_phys_buf, ARC_BUFC_METADATA); + osi->os_phys = osi->os_phys_buf->b_data; + bzero(osi->os_phys, sizeof (objset_phys_t)); + } + + /* + * Note: the changed_cb will be called once before the register + * func returns, thus changing the checksum/compression from the + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. + */ + if (ds) { + err = dsl_prop_register(ds, "primarycache", + primary_cache_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "secondarycache", + secondary_cache_changed_cb, osi); + if (!dsl_dataset_is_snapshot(ds)) { + if (err == 0) + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "copies", + copies_changed_cb, osi); + } + if (err) { + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, + &osi->os_phys_buf) == 1); + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } + } else if (ds == NULL) { + /* It's the meta-objset. */ + osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + osi->os_compress = ZIO_COMPRESS_LZJB; + osi->os_copies = spa_max_replication(spa); + osi->os_primary_cache = ZFS_CACHE_ALL; + osi->os_secondary_cache = ZFS_CACHE_ALL; + } + + osi->os_zil_header = osi->os_phys->os_zil_header; + osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); + + for (i = 0; i < TXG_SIZE; i++) { + list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + } + list_create(&osi->os_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_link)); + list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + osi->os_meta_dnode = dnode_special_open(osi, + &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + + /* + * We should be the only thread trying to do this because we + * have ds_opening_lock + */ + if (ds) { + VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, + dmu_objset_evict)); + } + + *osip = osi; + return (0); +} + +static int +dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) +{ + objset_impl_t *osi; + + mutex_enter(&ds->ds_opening_lock); + osi = dsl_dataset_get_user_ptr(ds); + if (osi == NULL) { + int err; + + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, &ds->ds_phys->ds_bp, &osi); + if (err) { + mutex_exit(&ds->ds_opening_lock); + return (err); + } + } + mutex_exit(&ds->ds_opening_lock); + + os->os = osi; + os->os_mode = DS_MODE_NOHOLD; + + if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) + return (EINVAL); + return (0); +} + +int +dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +{ + objset_t *os; + int err; + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + err = dmu_objset_open_ds_os(ds, os, type); + if (err) + kmem_free(os, sizeof (objset_t)); + else + *osp = os; + return (err); +} + +/* called from zpl */ +int +dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp) +{ + objset_t *os; + dsl_dataset_t *ds; + int err; + + ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || + DS_MODE_TYPE(mode) == DS_MODE_OWNER); + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + err = dsl_dataset_hold(name, os, &ds); + else + err = dsl_dataset_own(name, mode, os, &ds); + if (err) { + kmem_free(os, sizeof (objset_t)); + return (err); + } + + err = dmu_objset_open_ds_os(ds, os, type); + if (err) { + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + dsl_dataset_rele(ds, os); + else + dsl_dataset_disown(ds, os); + kmem_free(os, sizeof (objset_t)); + } else { + os->os_mode = mode; + *osp = os; + } + return (err); +} + +void +dmu_objset_close(objset_t *os) +{ + ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || + DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || + DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); + + if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) + dsl_dataset_rele(os->os->os_dsl_dataset, os); + else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) + dsl_dataset_disown(os->os->os_dsl_dataset, os); + kmem_free(os, sizeof (objset_t)); +} + +int +dmu_objset_evict_dbufs(objset_t *os) +{ + objset_impl_t *osi = os->os; + dnode_t *dn; + + mutex_enter(&osi->os_lock); + + /* process the mdn last, since the other dnodes have holds on it */ + list_remove(&osi->os_dnodes, osi->os_meta_dnode); + list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); + + /* + * Find the first dnode with holds. We have to do this dance + * because dnode_add_ref() only works if you already have a + * hold. If there are no holds then it has no dbufs so OK to + * skip. + */ + for (dn = list_head(&osi->os_dnodes); + dn && !dnode_add_ref(dn, FTAG); + dn = list_next(&osi->os_dnodes, dn)) + continue; + + while (dn) { + dnode_t *next_dn = dn; + + do { + next_dn = list_next(&osi->os_dnodes, next_dn); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); + + mutex_exit(&osi->os_lock); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + mutex_enter(&osi->os_lock); + dn = next_dn; + } + mutex_exit(&osi->os_lock); + return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); +} + +void +dmu_objset_evict(dsl_dataset_t *ds, void *arg) +{ + objset_impl_t *osi = arg; + objset_t os; + int i; + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); + ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); + } + + if (ds) { + if (!dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "compression", + compression_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "copies", + copies_changed_cb, osi)); + } + VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + primary_cache_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + secondary_cache_changed_cb, osi)); + } + + /* + * We should need only a single pass over the dnode list, since + * nothing can be added to the list at this point. + */ + os.os = osi; + (void) dmu_objset_evict_dbufs(&os); + + ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); + ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); + ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); + + dnode_special_close(osi->os_meta_dnode); + zil_free(osi->os_zil); + + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); + mutex_destroy(&osi->os_lock); + mutex_destroy(&osi->os_obj_lock); + mutex_destroy(&osi->os_user_ptr_lock); + kmem_free(osi, sizeof (objset_impl_t)); +} + +/* called from dsl for meta-objset */ +objset_impl_t * +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, dmu_tx_t *tx) +{ + objset_impl_t *osi; + dnode_t *mdn; + + ASSERT(dmu_tx_is_syncing(tx)); + if (ds) + mutex_enter(&ds->ds_opening_lock); + VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + if (ds) + mutex_exit(&ds->ds_opening_lock); + mdn = osi->os_meta_dnode; + + dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, + DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); + + /* + * We don't want to have to increase the meta-dnode's nlevels + * later, because then we could do it in quescing context while + * we are also accessing it in open context. + * + * This precaution is not necessary for the MOS (ds == NULL), + * because the MOS is only updated in syncing context. + * This is most fortunate: the MOS is the only objset that + * needs to be synced multiple times as spa_sync() iterates + * to convergence, so minimizing its dn_nlevels matters. + */ + if (ds != NULL) { + int levels = 1; + + /* + * Determine the number of levels necessary for the meta-dnode + * to contain DN_MAX_OBJECT dnodes. + */ + while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT * sizeof (dnode_phys_t)) + levels++; + + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = + mdn->dn_nlevels = levels; + } + + ASSERT(type != DMU_OST_NONE); + ASSERT(type != DMU_OST_ANY); + ASSERT(type < DMU_OST_NUMTYPES); + osi->os_phys->os_type = type; + + dsl_dataset_dirty(ds, tx); + + return (osi); +} + +struct oscarg { + void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); + void *userarg; + dsl_dataset_t *clone_parent; + const char *lastname; + dmu_objset_type_t type; + uint64_t flags; +}; + +/*ARGSUSED*/ +static int +dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct oscarg *oa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + int err; + uint64_t ddobj; + + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + oa->lastname, sizeof (uint64_t), 1, &ddobj); + if (err != ENOENT) + return (err ? err : EEXIST); + + if (oa->clone_parent != NULL) { + /* + * You can't clone across pools. + */ + if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + + /* + * You can only clone snapshots, not the head datasets. + */ + if (oa->clone_parent->ds_phys->ds_num_children == 0) + return (EINVAL); + } + + return (0); +} + +static void +dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct oscarg *oa = arg2; + dsl_dataset_t *ds; + blkptr_t *bp; + uint64_t dsobj; + + ASSERT(dmu_tx_is_syncing(tx)); + + dsobj = dsl_dataset_create_sync(dd, oa->lastname, + oa->clone_parent, oa->flags, cr, tx); + + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + if (BP_IS_HOLE(bp)) { + objset_impl_t *osi; + + /* This is an empty dmu_objset; not a clone. */ + osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), + ds, bp, oa->type, tx); + + if (oa->userfunc) + oa->userfunc(&osi->os, oa->userarg, cr, tx); + } + + spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dsobj); + + dsl_dataset_rele(ds, FTAG); +} + +int +dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) +{ + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; + + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); + } + + dprintf("name=%s\n", name); + + oa.userfunc = func; + oa.userarg = arg; + oa.lastname = tail; + oa.type = type; + oa.flags = flags; + + if (clone_parent != NULL) { + /* + * You can't clone to a different type. + */ + if (clone_parent->os->os_phys->os_type != type) { + dsl_dir_close(pdd, FTAG); + return (EINVAL); + } + oa.clone_parent = clone_parent->os->os_dsl_dataset; + } + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); +} + +int +dmu_objset_destroy(const char *name) +{ + objset_t *os; + int error; + + /* + * If it looks like we'll be able to destroy it, and there's + * an unplayed replay log sitting around, destroy the log. + * It would be nicer to do this in dsl_dataset_destroy_sync(), + * but the replay log objset is modified in open context. + */ + error = dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os); + if (error == 0) { + dsl_dataset_t *ds = os->os->os_dsl_dataset; + zil_destroy(dmu_objset_zil(os), B_FALSE); + + error = dsl_dataset_destroy(ds, os); + /* + * dsl_dataset_destroy() closes the ds. + */ + kmem_free(os, sizeof (objset_t)); + } + + return (error); +} + +/* + * This will close the objset. + */ +int +dmu_objset_rollback(objset_t *os) +{ + int err; + dsl_dataset_t *ds; + + ds = os->os->os_dsl_dataset; + + if (!dsl_dataset_tryown(ds, TRUE, os)) { + dmu_objset_close(os); + return (EBUSY); + } + + err = dsl_dataset_rollback(ds, os->os->os_phys->os_type); + + /* + * NB: we close the objset manually because the rollback + * actually implicitly called dmu_objset_evict(), thus freeing + * the objset_impl_t. + */ + dsl_dataset_disown(ds, os); + kmem_free(os, sizeof (objset_t)); + return (err); +} + +struct snaparg { + dsl_sync_task_group_t *dstg; + char *snapname; + char failed[MAXPATHLEN]; + boolean_t checkperms; + list_t objsets; +}; + +struct osnode { + list_node_t node; + objset_t *os; +}; + +static int +dmu_objset_snapshot_one(char *name, void *arg) +{ + struct snaparg *sn = arg; + objset_t *os; + int err; + + (void) strcpy(sn->failed, name); + + /* + * Check permissions only when requested. This only applies when + * doing a recursive snapshot. The permission checks for the starting + * dataset have already been performed in zfs_secpolicy_snapshot() + */ + if (sn->checkperms == B_TRUE && + (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + return (err); + + err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os); + if (err != 0) + return (err); + + /* If the objset is in an inconsistent state, return busy */ + if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { + dmu_objset_close(os); + return (EBUSY); + } + + /* + * NB: we need to wait for all in-flight changes to get to disk, + * so that we snapshot those changes. zil_suspend does this as + * a side effect. + */ + err = zil_suspend(dmu_objset_zil(os)); + if (err == 0) { + struct osnode *osn; + dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, + sn->snapname, 3); + osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); + osn->os = os; + list_insert_tail(&sn->objsets, osn); + } else { + dmu_objset_close(os); + } + + return (err); +} + +int +dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +{ + dsl_sync_task_t *dst; + struct osnode *osn; + struct snaparg sn = { 0 }; + spa_t *spa; + int err; + + (void) strcpy(sn.failed, fsname); + + err = spa_open(fsname, &spa, FTAG); + if (err) + return (err); + + sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + sn.snapname = snapname; + list_create(&sn.objsets, sizeof (struct osnode), + offsetof(struct osnode, node)); + + if (recursive) { + sn.checkperms = B_TRUE; + err = dmu_objset_find(fsname, + dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); + } else { + sn.checkperms = B_FALSE; + err = dmu_objset_snapshot_one(fsname, &sn); + } + + if (err) + goto out; + + err = dsl_sync_task_group_wait(sn.dstg); + + for (dst = list_head(&sn.dstg->dstg_tasks); dst; + dst = list_next(&sn.dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + if (dst->dst_err) + dsl_dataset_name(ds, sn.failed); + } + +out: + while (osn = list_head(&sn.objsets)) { + list_remove(&sn.objsets, osn); + zil_resume(dmu_objset_zil(osn->os)); + dmu_objset_close(osn->os); + kmem_free(osn, sizeof (struct osnode)); + } + list_destroy(&sn.objsets); + + if (err) + (void) strcpy(fsname, sn.failed); + dsl_sync_task_group_destroy(sn.dstg); + spa_close(spa, FTAG); + return (err); +} + +static void +dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +{ + dnode_t *dn; + + while (dn = list_head(list)) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(dn->dn_dbuf->db_data_pending); + /* + * Initialize dn_zio outside dnode_sync() + * to accomodate meta-dnode + */ + dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; + ASSERT(dn->dn_zio); + + ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); + list_remove(list, dn); + dnode_sync(dn, tx); + } +} + +/* ARGSUSED */ +static void +ready(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_impl_t *os = arg; + dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + + ASSERT(bp == os->os_rootbp); + ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); + ASSERT(BP_GET_LEVEL(bp) == 0); + + /* + * Update rootbp fill count. + */ + bp->blk_fill = 1; /* count the meta-dnode */ + for (int i = 0; i < dnp->dn_nblkptr; i++) + bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { + if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) + (void) dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, zio, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + } +} + +/* called from dsl */ +void +dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) +{ + int txgoff; + zbookmark_t zb; + writeprops_t wp = { 0 }; + zio_t *zio; + list_t *list; + dbuf_dirty_record_t *dr; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + + ASSERT(dmu_tx_is_syncing(tx)); + /* XXX the write_done callback should really give us the tx... */ + os->os_synctx = tx; + + if (os->os_dsl_dataset == NULL) { + /* + * This is the MOS. If we have upgraded, + * spa_max_replication() could change, so reset + * os_copies here. + */ + os->os_copies = spa_max_replication(os->os_spa); + } + + /* + * Create the root block IO + */ + zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ + zb.zb_blkid = 0; + + wp.wp_type = DMU_OT_OBJSET; + wp.wp_level = 0; /* on-disk BP level; see above */ + wp.wp_copies = os->os_copies; + wp.wp_oschecksum = os->os_checksum; + wp.wp_oscompress = os->os_compress; + + if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { + (void) dsl_dataset_block_kill(os->os_dsl_dataset, + os->os_rootbp, pio, tx); + } + + arc_release(os->os_phys_buf, &os->os_phys_buf); + zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), + tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + + /* + * Sync meta-dnode - the parent IO for the sync is the root block + */ + os->os_meta_dnode->dn_zio = zio; + dnode_sync(os->os_meta_dnode, tx); + + txgoff = tx->tx_txg & TXG_MASK; + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + + list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + while (dr = list_head(list)) { + ASSERT(dr->dr_dbuf->db_level == 0); + list_remove(list, dr); + if (dr->dr_zio) + zio_nowait(dr->dr_zio); + } + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(zio); +} + +void +dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, + usedobjsp, availobjsp); +} + +uint64_t +dmu_objset_fsid_guid(objset_t *os) +{ + return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); +} + +void +dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) +{ + stat->dds_type = os->os->os_phys->os_type; + if (os->os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); +} + +void +dmu_objset_stats(objset_t *os, nvlist_t *nv) +{ + ASSERT(os->os->os_dsl_dataset || + os->os->os_phys->os_type == DMU_OST_META); + + if (os->os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os->os_dsl_dataset, nv); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, + os->os->os_phys->os_type); +} + +int +dmu_objset_is_snapshot(objset_t *os) +{ + if (os->os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + else + return (B_FALSE); +} + +int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + uint64_t ignored; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, + real, maxlen, conflict)); +} + +int +dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + zap_cursor_t cursor; + zap_attribute_t attr; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (ENOENT); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (ENAMETOOLONG); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +int +dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp) +{ + dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; + zap_cursor_t cursor; + zap_attribute_t attr; + + /* there is no next dir on a snapshot! */ + if (os->os->os_dsl_dataset->ds_object != + dd->dd_phys->dd_head_dataset_obj) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + dd->dd_pool->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (ENOENT); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (ENAMETOOLONG); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +struct findarg { + int (*func)(char *, void *); + void *arg; +}; + +/* ARGSUSED */ +static int +findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct findarg *fa = arg; + return (fa->func((char *)dsname, fa->arg)); +} + +/* + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * Perhaps change all callers to use dmu_objset_find_spa()? + */ +int +dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) +{ + struct findarg fa; + fa.func = func; + fa.arg = arg; + return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); +} + +/* + * Find all objsets under name, call func on each + */ +int +dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + char *child; + uint64_t thisobj; + int err; + + if (name == NULL) + name = spa_name(spa); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); + if (err) + return (err); + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_close(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + dp = dd->dd_pool; + + /* + * Iterate over all children. + */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strcpy(child, name); + (void) strcat(child, "/"); + (void) strcat(child, attr->za_name); + err = dmu_objset_find_spa(spa, child, func, arg, flags); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); + + if (err) { + dsl_dir_close(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + return (err); + } + } + + /* + * Iterate over all snapshots. + */ + if (flags & DS_FIND_SNAPSHOTS) { + if (!dsl_pool_sync_context(dp)) + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (!dsl_pool_sync_context(dp)) + rw_exit(&dp->dp_config_rwlock); + + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == + sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strcpy(child, name); + (void) strcat(child, "@"); + (void) strcat(child, attr->za_name); + err = func(spa, attr->za_first_integer, + child, arg); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_close(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + + if (err) + return (err); + + /* + * Apply to self if appropriate. + */ + err = func(spa, thisobj, name, arg); + return (err); +} + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + os->os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + return (os->os->os_user_ptr); +} diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c new file mode 100644 index 000000000..857b9a343 --- /dev/null +++ b/module/zfs/dmu_send.c @@ -0,0 +1,1181 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> + +static char *dmu_recv_tag = "dmu_recv_tag"; + +struct backuparg { + dmu_replay_record_t *drr; + vnode_t *vp; + offset_t *off; + objset_t *os; + zio_cksum_t zc; + int err; +}; + +static int +dump_bytes(struct backuparg *ba, void *buf, int len) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + ASSERT3U(len % 8, ==, 0); + + fletcher_4_incremental_native(buf, len, &ba->zc); + ba->err = vn_rdwr(UIO_WRITE, ba->vp, + (caddr_t)buf, len, + 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); + *ba->off += len; + return (ba->err); +} + +static int +dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, + uint64_t length) +{ + /* write a FREE record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREE; + ba->drr->drr_u.drr_free.drr_object = object; + ba->drr->drr_u.drr_free.drr_offset = offset; + ba->drr->drr_u.drr_free.drr_length = length; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_data(struct backuparg *ba, dmu_object_type_t type, + uint64_t object, uint64_t offset, int blksz, void *data) +{ + /* write a DATA record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_WRITE; + ba->drr->drr_u.drr_write.drr_object = object; + ba->drr->drr_u.drr_write.drr_type = type; + ba->drr->drr_u.drr_write.drr_offset = offset; + ba->drr->drr_u.drr_write.drr_length = blksz; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + if (dump_bytes(ba, data, blksz)) + return (EINTR); + return (0); +} + +static int +dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +{ + /* write a FREEOBJECTS record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREEOBJECTS; + ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; + ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +{ + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(ba, object, 1)); + + /* write an OBJECT record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_OBJECT; + ba->drr->drr_u.drr_object.drr_object = object; + ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; + ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; + ba->drr->drr_u.drr_object.drr_blksz = + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; + ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; + ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + return (EINTR); + + /* free anything past the end of the file */ + if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + return (EINTR); + if (ba->err) + return (EINTR); + return (0); +} + +#define BP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +static int +backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) +{ + struct backuparg *ba = arg; + dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + if (bp == NULL && zb->zb_object == 0) { + uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); + } else if (bp == NULL) { + uint64_t span = BP_SPAN(dnp, zb->zb_level); + err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); + } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { + return (0); + } else if (type == DMU_OT_DNODE) { + dnode_phys_t *blk; + int i; + int blksz = BP_GET_LSIZE(bp); + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = dump_dnode(ba, dnobj, blk+i); + if (err) + break; + } + (void) arc_buf_remove_ref(abuf, &abuf); + } else { /* it's a level-0 block of a regular object */ + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + int blksz = BP_GET_LSIZE(bp); + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, + blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); + } + + ASSERT(err == 0 || err == EINTR); + return (err); +} + +int +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + vnode_t *vp, offset_t *off) +{ + dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; + dmu_replay_record_t *drr; + struct backuparg ba; + int err; + uint64_t fromtxg = 0; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + if (fromorigin) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; + drr->drr_u.drr_begin.drr_creation_time = + ds->ds_phys->ds_creation_time; + drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + if (fromorigin) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; + drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + + if (fromds) + drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; + dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + + if (fromds) + fromtxg = fromds->ds_phys->ds_creation_txg; + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + + ba.drr = drr; + ba.vp = vp; + ba.os = tosnap; + ba.off = off; + ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, + backup_cb, &ba); + + if (err) { + if (err == EINTR && ba.err) + err = ba.err; + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (err); + } + + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = ba.zc; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + kmem_free(drr, sizeof (dmu_replay_record_t)); + + return (0); +} + +struct recvbeginsyncarg { + const char *tofs; + const char *tosnap; + dsl_dataset_t *origin; + uint64_t fromguid; + dmu_objset_type_t type; + void *tag; + boolean_t force; + uint64_t dsflags; + char clonelastname[MAXNAMELEN]; + dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ +}; + +static dsl_dataset_t * +recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, + cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + + /* This should always work, since we just created it */ + /* XXX - create should return an owned ds */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); + + if (type != DMU_OST_NONE) { + (void) dmu_objset_create_impl(dp->dp_spa, + ds, &ds->ds_phys->ds_bp, type, tx); + } + + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + + return (ds); +} + +/* ARGSUSED */ +static int +recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t val; + int err; + + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); + + if (err != ENOENT) + return (err ? err : EEXIST); + + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + + return (0); +} + +static void +recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, + rbsa->origin, flags, cr, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); +} + +static int +recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + + /* must be a head ds */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* must not be a clone ds */ + if (dsl_dir_is_clone(ds->ds_dir)) + return (EINVAL); + + err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + if (err) + return (err); + + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + + return (0); +} + +static void +recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_dir_t *dd = ds->ds_dir; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* + * NB: caller must provide an extra hold on the dsl_dir_t, so it + * won't go away when dsl_dataset_destroy_sync() closes the + * dataset. + */ + dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + + dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); +} + +/* ARGSUSED */ +static int +recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + uint64_t val; + + /* must not have any changes since most recent snapshot */ + if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + return (ENODEV); + + /* most recent snapshot must match fromguid */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + + /* temporary clone name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + return (0); +} + +/* ARGSUSED */ +static void +recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ohds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp = ohds->ds_dir->dd_pool; + dsl_dataset_t *ods, *cds; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* create the temporary clone */ + VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, + FTAG, &ods)); + dsobj = dsl_dataset_create_sync(ohds->ds_dir, + rbsa->clonelastname, ods, flags, cr, tx); + dsl_dataset_rele(ods, FTAG); + + /* open the temporary clone */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); + + /* copy the refquota from the target fs to the clone */ + if (ohds->ds_quota > 0) + dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); + + rbsa->ds = cds; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); +} + +/* ARGSUSED */ +static void +recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_object); +} + +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. + */ +int +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) +{ + int err = 0; + boolean_t byteswap; + struct recvbeginsyncarg rbsa; + uint64_t version; + int flags; + dsl_dataset_t *ds; + + if (drrb->drr_magic == DMU_BACKUP_MAGIC) + byteswap = FALSE; + else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + byteswap = TRUE; + else + return (EINVAL); + + rbsa.tofs = tofs; + rbsa.tosnap = tosnap; + rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL; + rbsa.fromguid = drrb->drr_fromguid; + rbsa.type = drrb->drr_type; + rbsa.tag = FTAG; + rbsa.dsflags = 0; + version = drrb->drr_version; + flags = drrb->drr_flags; + + if (byteswap) { + rbsa.type = BSWAP_32(rbsa.type); + rbsa.fromguid = BSWAP_64(rbsa.fromguid); + version = BSWAP_64(version); + flags = BSWAP_32(flags); + } + + if (version != DMU_BACKUP_STREAM_VERSION || + rbsa.type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && origin == NULL)) + return (EINVAL); + + if (flags & DRR_FLAG_CI_DATA) + rbsa.dsflags = DS_FLAG_CI_DATASET; + + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drrb = drrb; + drc->drc_tosnap = tosnap; + drc->drc_force = force; + + /* + * Process the begin in syncing context. + */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { + /* offline incremental receive */ + err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); + if (err) + return (err); + + /* + * Only do the rollback if the most recent snapshot + * matches the incremental source + */ + if (force) { + if (ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_guid != + rbsa.fromguid) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (ENODEV); + } + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + rbsa.force = B_FALSE; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_offline_incremental_sync, ds, &rbsa, 1); + if (err) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = drc->drc_real_ds = ds; + } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* online incremental receive */ + + /* tmp clone name is: tofs/%tosnap" */ + (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), + "%%%s", tosnap); + + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err) + return (err); + + rbsa.force = force; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_online_incremental_sync, ds, &rbsa, 5); + if (err) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = ds; + drc->drc_real_ds = rbsa.ds; + } else { + /* create new fs -- full backup or clone */ + dsl_dir_t *dd = NULL; + const char *tail; + + err = dsl_dir_open(tofs, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + if (!force) { + dsl_dir_close(dd, FTAG); + return (EEXIST); + } + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dataset_own_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + DS_MODE_INCONSISTENT, FTAG, &ds); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, FTAG); + return (err); + } + + dsl_dataset_make_exclusive(ds, FTAG); + err = dsl_sync_task_do(dd->dd_pool, + recv_full_existing_check, + recv_full_existing_sync, ds, &rbsa, 5); + dsl_dataset_disown(ds, FTAG); + } else { + err = dsl_sync_task_do(dd->dd_pool, recv_full_check, + recv_full_sync, dd, &rbsa, 5); + } + dsl_dir_close(dd, FTAG); + if (err) + return (err); + drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; + drc->drc_newfs = B_TRUE; + } + + return (0); +} + +struct restorearg { + int err; + int byteswap; + vnode_t *vp; + char *buf; + uint64_t voff; + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t cksum; +}; + +static void * +restore_read(struct restorearg *ra, int len) +{ + void *rv; + int done = 0; + + /* some things will require 8-byte alignment, so everything must */ + ASSERT3U(len % 8, ==, 0); + + while (done < len) { + ssize_t resid; + + ra->err = vn_rdwr(UIO_READ, ra->vp, + (caddr_t)ra->buf + done, len - done, + ra->voff, UIO_SYSSPACE, FAPPEND, + RLIM64_INFINITY, CRED(), &resid); + + if (resid == len - done) + ra->err = EINVAL; + ra->voff += len - done - resid; + done = len - resid; + if (ra->err) + return (NULL); + } + + ASSERT3U(done, ==, len); + rv = ra->buf; + if (ra->byteswap) + fletcher_4_incremental_byteswap(rv, len, &ra->cksum); + else + fletcher_4_incremental_native(rv, len, &ra->cksum); + return (rv); +} + +static void +backup_byteswap(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_version); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + /* DO64(drr_object.drr_allocation_txg); */ + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_length); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + break; + case DRR_END: + DO64(drr_end.drr_checksum.zc_word[0]); + DO64(drr_end.drr_checksum.zc_word[1]); + DO64(drr_end.drr_checksum.zc_word[2]); + DO64(drr_end.drr_checksum.zc_word[3]); + break; + } +#undef DO64 +#undef DO32 +} + +static int +restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +{ + int err; + dmu_tx_t *tx; + void *data = NULL; + + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + + if (drro->drr_type == DMU_OT_NONE || + drro->drr_type >= DMU_OT_NUMTYPES || + drro->drr_bonustype >= DMU_OT_NUMTYPES || + drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_bonuslen > DN_MAX_BONUSLEN) { + return (EINVAL); + } + + if (drro->drr_bonuslen) { + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); + if (ra->err) + return (ra->err); + } + + tx = dmu_tx_create(os); + + if (err == ENOENT) { + /* currently free, want to be allocated */ + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_claim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } else { + /* currently allocated, want to be allocated */ + dmu_tx_hold_bonus(tx, drro->drr_object); + /* + * We may change blocksize, so need to + * hold_write + */ + dmu_tx_hold_write(tx, drro->drr_object, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + err = dmu_object_reclaim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } + if (err) { + dmu_tx_commit(tx); + return (EINVAL); + } + + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + + if (data != NULL) { + dmu_buf_t *db; + + VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + bcopy(data, db->db_data, drro->drr_bonuslen); + if (ra->byteswap) { + dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + drro->drr_bonuslen); + } + dmu_buf_rele(db, FTAG); + } + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_freeobjects(struct restorearg *ra, objset_t *os, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (EINVAL); + + for (obj = drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs; + (void) dmu_object_next(os, &obj, FALSE, 0)) { + int err; + + if (dmu_object_info(os, obj, NULL) != 0) + continue; + + err = dmu_free_object(os, obj); + if (err) + return (err); + } + return (0); +} + +static int +restore_write(struct restorearg *ra, objset_t *os, + struct drr_write *drrw) +{ + dmu_tx_t *tx; + void *data; + int err; + + if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + drrw->drr_type >= DMU_OT_NUMTYPES) + return (EINVAL); + + data = restore_read(ra, drrw->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrw->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + if (ra->byteswap) + dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + dmu_write(os, drrw->drr_object, + drrw->drr_offset, drrw->drr_length, data, tx); + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_free(struct restorearg *ra, objset_t *os, + struct drr_free *drrf) +{ + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (EINVAL); + + if (dmu_object_info(os, drrf->drr_object, NULL) != 0) + return (EINVAL); + + err = dmu_free_long_range(os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + return (err); +} + +void +dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) +{ + if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { + /* + * online incremental or new fs: destroy the fs (which + * may be a clone) that we created + */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (drc->drc_real_ds != drc->drc_logical_ds) + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } else { + /* + * offline incremental: rollback to most recent snapshot. + */ + (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); + dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); + } +} + +/* + * NB: callers *must* call dmu_recv_end() if this succeeds. + */ +int +dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) +{ + struct restorearg ra = { 0 }; + dmu_replay_record_t *drr; + objset_t *os; + zio_cksum_t pcksum; + + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + ra.byteswap = TRUE; + + { + /* compute checksum of drr_begin record */ + dmu_replay_record_t *drr; + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); + } + + if (ra.byteswap) { + struct drr_begin *drrb = drc->drc_drrb; + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + ra.vp = vp; + ra.voff = *voffp; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + + /* these were verified in dmu_recv_begin */ + ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); + + /* + * Open the objset we are modifying. + */ + VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); + + ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + + /* + * Read records and process them. + */ + pcksum = ra.cksum; + while (ra.err == 0 && + NULL != (drr = restore_read(&ra, sizeof (*drr)))) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + ra.err = EINTR; + goto out; + } + + if (ra.byteswap) + backup_byteswap(drr); + + switch (drr->drr_type) { + case DRR_OBJECT: + { + /* + * We need to make a copy of the record header, + * because restore_{object,write} may need to + * restore_read(), which will invalidate drr. + */ + struct drr_object drro = drr->drr_u.drr_object; + ra.err = restore_object(&ra, os, &drro); + break; + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects drrfo = + drr->drr_u.drr_freeobjects; + ra.err = restore_freeobjects(&ra, os, &drrfo); + break; + } + case DRR_WRITE: + { + struct drr_write drrw = drr->drr_u.drr_write; + ra.err = restore_write(&ra, os, &drrw); + break; + } + case DRR_FREE: + { + struct drr_free drrf = drr->drr_u.drr_free; + ra.err = restore_free(&ra, os, &drrf); + break; + } + case DRR_END: + { + struct drr_end drre = drr->drr_u.drr_end; + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) + ra.err = ECKSUM; + goto out; + } + default: + ra.err = EINVAL; + goto out; + } + pcksum = ra.cksum; + } + ASSERT(ra.err != 0); + +out: + dmu_objset_close(os); + + if (ra.err != 0) { + /* + * rollback or destroy what we created, so we don't + * leave it in the restoring state. + */ + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + dmu_recv_abort_cleanup(drc); + } + + kmem_free(ra.buf, ra.bufsize); + *voffp = ra.voff; + return (ra.err); +} + +struct recvendsyncarg { + char *tosnap; + uint64_t creation_time; + uint64_t toguid; +}; + +static int +recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); +} + +static void +recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; + ds->ds_prev->ds_phys->ds_guid = resa->toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and + * dsl_pool_zil_clean() expects it to have a ds_user_ptr + * (and zil), but clone_swap() can close it. + */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + if (ds != drc->drc_real_ds) { + /* we are doing an online recv */ + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); + if (err) + dsl_dataset_disown(ds, dmu_recv_tag); + } else { + err = EBUSY; + dsl_dataset_rele(ds, dmu_recv_tag); + } + /* dsl_dataset_destroy() will disown the ds */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (err) + return (err); + } + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + if (drc->drc_newfs) { + ASSERT(ds == drc->drc_real_ds); + (void) dsl_dataset_destroy(ds, dmu_recv_tag); + return (err); + } else { + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + } + + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); +} diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c new file mode 100644 index 000000000..512401470 --- /dev/null +++ b/module/zfs/dmu_traverse.c @@ -0,0 +1,406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_impl.h> +#include <sys/callb.h> + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +struct prefetch_data { + kmutex_t pd_mtx; + kcondvar_t pd_cv; + int pd_blks_max; + int pd_blks_fetched; + int pd_flags; + boolean_t pd_cancel; + boolean_t pd_exited; +}; + +struct traverse_data { + spa_t *td_spa; + uint64_t td_objset; + blkptr_t *td_rootbp; + uint64_t td_min_txg; + int td_flags; + struct prefetch_data *td_pfd; + blkptr_cb_t *td_func; + void *td_arg; +}; + +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + struct traverse_data *td = arg; + zbookmark_t zb; + + if (bp->blk_birth == 0) + return; + + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) + return; + + zb.zb_objset = td->td_objset; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + struct traverse_data *td = arg; + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth == 0) + return; + + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return; + + zb.zb_objset = td->td_objset; + zb.zb_object = lr->lr_foid; + zb.zb_level = BP_GET_LEVEL(bp); + zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + } +} + +static void +traverse_zil(struct traverse_data *td, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && (spa_mode & FWRITE)) + return; + + zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, + claim_txg); + + zil_free(zilog); +} + +static int +traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) +{ + zbookmark_t czb; + int err = 0; + arc_buf_t *buf = NULL; + struct prefetch_data *pd = td->td_pfd; + + if (bp->blk_birth == 0) { + err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg); + return (err); + } + + if (bp->blk_birth <= td->td_min_txg) + return (0); + + if (pd && !pd->pd_exited && + ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { + mutex_enter(&pd->pd_mtx); + ASSERT(pd->pd_blks_fetched >= 0); + while (pd->pd_blks_fetched == 0 && !pd->pd_exited) + cv_wait(&pd->pd_cv, &pd->pd_mtx); + pd->pd_blks_fetched--; + cv_broadcast(&pd->pd_cv); + mutex_exit(&pd->pd_mtx); + } + + if (td->td_flags & TRAVERSE_PRE) { + err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + if (err) + return (err); + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, td->td_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visitbp() blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = traverse_visitbp(td, dnp, buf, cbp, &czb); + if (err) + break; + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, td->td_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visitbp() blocks below this */ + dnp = buf->b_data; + for (i = 0; i < epb && err == 0; i++, dnp++) { + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, zb->zb_objset, + zb->zb_blkid * epb + i, + dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) + break; + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + int j; + + err = arc_read_nolock(NULL, td->td_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + osp = buf->b_data; + /* + * traverse_zil is just here for zdb's leak checking. + * For other consumers, there will be no ZIL blocks. + */ + traverse_zil(td, &osp->os_zil_header); + + for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { + SET_BOOKMARK(&czb, zb->zb_objset, 0, + osp->os_meta_dnode.dn_nlevels - 1, j); + err = traverse_visitbp(td, &osp->os_meta_dnode, buf, + (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j], + &czb); + if (err) + break; + } + } + + if (buf) + (void) arc_buf_remove_ref(buf, &buf); + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) + err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + + return (err); +} + +/* ARGSUSED */ +static int +traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) +{ + struct prefetch_data *pfd = arg; + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + + ASSERT(pfd->pd_blks_fetched >= 0); + if (pfd->pd_cancel) + return (EINTR); + + if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) + return (0); + + mutex_enter(&pfd->pd_mtx); + while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) + cv_wait(&pfd->pd_cv, &pfd->pd_mtx); + pfd->pd_blks_fetched++; + cv_broadcast(&pfd->pd_cv); + mutex_exit(&pfd->pd_mtx); + + (void) arc_read_nolock(NULL, spa, bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, zb); + + return (0); +} + +static void +traverse_prefetch_thread(void *arg) +{ + struct traverse_data *td_main = arg; + struct traverse_data td = *td_main; + zbookmark_t czb; + + td.td_func = traverse_prefetcher; + td.td_arg = td_main->td_pfd; + td.td_pfd = NULL; + + SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0); + (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); + + mutex_enter(&td_main->td_pfd->pd_mtx); + td_main->td_pfd->pd_exited = B_TRUE; + cv_broadcast(&td_main->td_pfd->pd_cv); + mutex_exit(&td_main->td_pfd->pd_mtx); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +static int +traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +{ + struct traverse_data td; + struct prefetch_data pd = { 0 }; + zbookmark_t czb; + int err; + + td.td_spa = spa; + td.td_objset = objset; + td.td_rootbp = rootbp; + td.td_min_txg = txg_start; + td.td_func = func; + td.td_arg = arg; + td.td_pfd = &pd; + td.td_flags = flags; + + pd.pd_blks_max = 100; + pd.pd_flags = flags; + mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + + if (!(flags & TRAVERSE_PREFETCH) || + 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, + &td, TQ_NOQUEUE)) + pd.pd_exited = B_TRUE; + + SET_BOOKMARK(&czb, objset, 0, -1, 0); + err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); + + mutex_enter(&pd.pd_mtx); + pd.pd_cancel = B_TRUE; + cv_broadcast(&pd.pd_cv); + while (!pd.pd_exited) + cv_wait(&pd.pd_cv, &pd.pd_mtx); + mutex_exit(&pd.pd_mtx); + + mutex_destroy(&pd.pd_mtx); + cv_destroy(&pd.pd_cv); + + return (err); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object, + &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); +} + +/* + * NB: pool must not be changing on-disk (eg, from zdb or sync context). + */ +int +traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg) +{ + int err; + uint64_t obj; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + + /* visit the MOS */ + err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), + 0, TRAVERSE_PRE, func, arg); + if (err) + return (err); + + /* visit each dataset */ + for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) { + dmu_object_info_t doi; + + err = dmu_object_info(mos, obj, &doi); + if (err) + return (err); + + if (doi.doi_type == DMU_OT_DSL_DATASET) { + dsl_dataset_t *ds; + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + err = traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE, + func, arg); + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + } + } + if (err == ESRCH) + err = 0; + return (err); +} diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c new file mode 100644 index 000000000..bf560e565 --- /dev/null +++ b/module/zfs/dmu_tx.c @@ -0,0 +1,1068 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ +#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ +#include <sys/dsl_pool.h> +#include <sys/zap_impl.h> /* for fzap_default_block_shift */ +#include <sys/spa.h> +#include <sys/zfs_context.h> + +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + + +dmu_tx_t * +dmu_tx_create_dd(dsl_dir_t *dd) +{ + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + tx->tx_dir = dd; + if (dd) + tx->tx_pool = dd->dd_pool; + list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), + offsetof(dmu_tx_hold_t, txh_node)); +#ifdef ZFS_DEBUG + refcount_create(&tx->tx_space_written); + refcount_create(&tx->tx_space_freed); +#endif + return (tx); +} + +dmu_tx_t * +dmu_tx_create(objset_t *os) +{ + dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + tx->tx_objset = os; + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + return (tx); +} + +dmu_tx_t * +dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) +{ + dmu_tx_t *tx = dmu_tx_create_dd(NULL); + + ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); + tx->tx_pool = dp; + tx->tx_txg = txg; + tx->tx_anyobj = TRUE; + + return (tx); +} + +int +dmu_tx_is_syncing(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +int +dmu_tx_private_ok(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dmu_tx_hold_t *txh; + dnode_t *dn = NULL; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os->os, object, tx, &dn); + if (err) { + tx->tx_err = err; + return (NULL); + } + + if (err == 0 && tx->tx_txg != 0) { + mutex_enter(&dn->dn_mtx); + /* + * dn->dn_assigned_txg == tx->tx_txg doesn't pose a + * problem, but there's no way for it to happen (for + * now, at least). + */ + ASSERT(dn->dn_assigned_txg == 0); + dn->dn_assigned_txg = tx->tx_txg; + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + } + + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh->txh_tx = tx; + txh->txh_dnode = dn; +#ifdef ZFS_DEBUG + txh->txh_type = type; + txh->txh_arg1 = arg1; + txh->txh_arg2 = arg2; +#endif + list_insert_tail(&tx->tx_holds, txh); + + return (txh); +} + +void +dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) +{ + /* + * If we're syncing, they can manipulate any object anyhow, and + * the hold on the dnode_t can cause problems. + */ + if (!dmu_tx_is_syncing(tx)) { + (void) dmu_tx_hold_object_impl(tx, os, + object, THT_NEWOBJECT, 0, 0); + } +} + +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); + dbuf_rele(db, FTAG); + return (err); +} + +/* ARGSUSED */ +static void +dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + uint64_t start, end, i; + int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; + int err = 0; + + if (len == 0) + return; + + min_bs = SPA_MINBLOCKSHIFT; + max_bs = SPA_MAXBLOCKSHIFT; + min_ibs = DN_MIN_INDBLKSHIFT; + max_ibs = DN_MAX_INDBLKSHIFT; + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ + + if (dn) { + if (dn->dn_maxblkid == 0) { + if ((off > 0 || len < dn->dn_datablksz) && + off < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) + goto out; + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || + len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err) + goto out; + } + + /* last level-0 block */ + end = (off+len-1) >> dn->dn_datablkshift; + if (end != start && end <= dn->dn_maxblkid && + P2PHASE(off+len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err) + goto out; + } + + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = start+1; i < end; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) + goto out; + } + } + + err = zio_wait(zio); + if (err) + goto out; + } + } + + /* + * If there's more than one block, the blocksize can't change, + * so we can make a more precise estimate. Alternatively, + * if the dnode's ibs is larger than max_ibs, always use that. + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on existing pools. + */ + if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { + min_ibs = max_ibs = dn->dn_indblkshift; + if (dn->dn_datablkshift != 0) + min_bs = max_bs = dn->dn_datablkshift; + } + + /* + * 'end' is the last thing we will access, not one past. + * This way we won't overflow when accessing the last byte. + */ + start = P2ALIGN(off, 1ULL << max_bs); + end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; + txh->txh_space_towrite += end - start + 1; + + start >>= min_bs; + end >>= min_bs; + + epbs = min_ibs - SPA_BLKPTRSHIFT; + + /* + * The object contains at most 2^(64 - min_bs) blocks, + * and each indirect level maps 2^epbs. + */ + for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { + start >>= epbs; + end >>= epbs; + /* + * If we increase the number of levels of indirection, + * we'll need new blkid=0 indirect blocks. If start == 0, + * we're already accounting for that blocks; and if end == 0, + * we can't increase the number of levels beyond that. + */ + if (start != 0 && end != 0) + txh->txh_space_towrite += 1ULL << max_ibs; + txh->txh_space_towrite += (end - start + 1) << max_ibs; + } + + ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); + +out: + if (err) + txh->txh_tx->tx_err = err; +} + +static void +dmu_tx_count_dnode(dmu_tx_hold_t *txh) +{ + dnode_t *dn = txh->txh_dnode; + dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + uint64_t space = mdn->dn_datablksz + + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); + + if (dn && dn->dn_dbuf->db_blkptr && + dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_dbuf->db_blkptr->blk_birth)) { + txh->txh_space_tooverwrite += space; + } else { + txh->txh_space_towrite += space; + if (dn && dn->dn_dbuf->db_blkptr) + txh->txh_space_tounref += space; + } +} + +void +dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + ASSERT(len < DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, off, len); + if (txh == NULL) + return; + + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); +} + +static void +dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + uint64_t blkid, nblks, lastblk; + uint64_t space = 0, unref = 0, skipped = 0; + dnode_t *dn = txh->txh_dnode; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + spa_t *spa = txh->txh_tx->tx_pool->dp_spa; + int epbs; + + if (dn->dn_nlevels == 0) + return; + + /* + * The struct_rwlock protects us against dn_nlevels + * changing, in case (against all odds) we manage to dirty & + * sync out the changes after we check for being dirty. + * Also, dbuf_hold_level() wants us to have the struct_rwlock. + */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_maxblkid == 0) { + if (off == 0 && len >= dn->dn_datablksz) { + blkid = 0; + nblks = 1; + } else { + rw_exit(&dn->dn_struct_rwlock); + return; + } + } else { + blkid = off >> dn->dn_datablkshift; + nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; + + if (blkid >= dn->dn_maxblkid) { + rw_exit(&dn->dn_struct_rwlock); + return; + } + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; + + } + if (dn->dn_nlevels == 1) { + int i; + for (i = 0; i < nblks; i++) { + blkptr_t *bp = dn->dn_phys->dn_blkptr; + ASSERT3U(blkid + i, <, dn->dn_nblkptr); + bp += blkid + i; + if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { + dprintf_bp(bp, "can free old%s", ""); + space += bp_get_dasize(spa, bp); + } + unref += BP_GET_ASIZE(bp); + } + nblks = 0; + } + + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels. + */ + { + uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); + int level = (dn->dn_nlevels > 1) ? 2 : 1; + + while (level++ < DN_MAX_LEVELS) { + txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + ASSERT(blkcnt <= dn->dn_nblkptr); + } + + lastblk = blkid + nblks - 1; + while (nblks) { + dmu_buf_impl_t *dbuf; + uint64_t ibyte, new_blkid; + int epb = 1 << epbs; + int err, i, blkoff, tochk; + blkptr_t *bp; + + ibyte = blkid << dn->dn_datablkshift; + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); + new_blkid = ibyte >> dn->dn_datablkshift; + if (err == ESRCH) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + if (err) { + txh->txh_tx->tx_err = err; + break; + } + if (new_blkid > lastblk) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + + if (new_blkid > blkid) { + ASSERT((new_blkid >> epbs) > (blkid >> epbs)); + skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; + nblks -= new_blkid - blkid; + blkid = new_blkid; + } + blkoff = P2PHASE(blkid, epb); + tochk = MIN(epb - blkoff, nblks); + + dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); + + txh->txh_memory_tohold += dbuf->db.db_size; + if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { + txh->txh_tx->tx_err = E2BIG; + dbuf_rele(dbuf, FTAG); + break; + } + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { + txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); + break; + } + + bp = dbuf->db.db_data; + bp += blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + dprintf_bp(&bp[i], "can free old%s", ""); + space += bp_get_dasize(spa, &bp[i]); + } + unref += BP_GET_ASIZE(bp); + } + dbuf_rele(dbuf, FTAG); + + blkid += tochk; + nblks -= tochk; + } + rw_exit(&dn->dn_struct_rwlock); + + /* account for new level 1 indirect blocks that might show up */ + if (skipped > 0) { + txh->txh_fudge += skipped << dn->dn_indblkshift; + skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); + txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + } + txh->txh_space_tofree += space; + txh->txh_space_tounref += unref; +} + +void +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + dnode_t *dn; + uint64_t start, end, i; + int err, shift; + zio_t *zio; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_FREE, off, len); + if (txh == NULL) + return; + dn = txh->txh_dnode; + + /* first block */ + if (off != 0) + dmu_tx_count_write(txh, off, 1); + /* last block */ + if (len != DMU_OBJECT_END) + dmu_tx_count_write(txh, off+len, 1); + + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) + return; + if (len == DMU_OBJECT_END) + len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + + /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. The above count_write's + * have already taken care of the level-0 blocks. + */ + if (dn->dn_nlevels > 1) { + shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + start = off >> shift; + end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; + + zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + for (i = start; i <= end; i++) { + uint64_t ibyte = i << shift; + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (err == ESRCH) + break; + if (err) { + tx->tx_err = err; + return; + } + + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + } + + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); +} + +void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +{ + dmu_tx_hold_t *txh; + dnode_t *dn; + uint64_t nblocks; + int epbs, err; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_ZAP, add, (uintptr_t)name); + if (txh == NULL) + return; + dn = txh->txh_dnode; + + dmu_tx_count_dnode(txh); + + if (dn == NULL) { + /* + * We will be able to fit a new object's entries into one leaf + * block. So there will be at most 2 blocks total, + * including the header block. + */ + dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); + return; + } + + ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + + if (dn->dn_maxblkid == 0 && !add) { + /* + * If there is only one block (i.e. this is a micro-zap) + * and we are not adding anything, the accounting is simple. + */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + + /* + * Use max block size here, since we don't know how much + * the size will change between now and the dbuf dirty call. + */ + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_phys->dn_blkptr[0].blk_birth)) { + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + } else { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += + BP_GET_ASIZE(dn->dn_phys->dn_blkptr); + } + return; + } + + if (dn->dn_maxblkid > 0 && name) { + /* + * access the name in this fat-zap so that we'll check + * for i/o errors to the leaf blocks, etc. + */ + err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + 8, 0, NULL); + if (err == EIO) { + tx->tx_err = err; + return; + } + } + + /* + * 3 blocks overwritten: target leaf, ptrtbl block, header block + * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks + */ + dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, + (3 + (add ? 3 : 0)) << dn->dn_datablkshift); + + /* + * If the modified blocks are scattered to the four winds, + * we'll have to modify an indirect twig for each. + */ + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; +} + +void +dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) +{ + dmu_tx_hold_t *txh; + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + DMU_NEW_OBJECT, THT_SPACE, space, 0); + + txh->txh_space_towrite += space; +} + +int +dmu_tx_holds(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + int holds = 0; + + /* + * By asserting that the tx is assigned, we're counting the + * number of dn_tx_holds, which is the same as the number of + * dn_holds. Otherwise, we'd be counting dn_holds, but + * dn_tx_holds could be 0. + */ + ASSERT(tx->tx_txg != 0); + + /* if (tx->tx_anyobj == TRUE) */ + /* return (0); */ + + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + if (txh->txh_dnode && txh->txh_dnode->dn_object == object) + holds++; + } + + return (holds); +} + +#ifdef ZFS_DEBUG +void +dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) +{ + dmu_tx_hold_t *txh; + int match_object = FALSE, match_offset = FALSE; + dnode_t *dn = db->db_dnode; + + ASSERT(tx->tx_txg != 0); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT3U(dn->dn_object, ==, db->db.db_object); + + if (tx->tx_anyobj) + return; + + /* XXX No checking on the meta dnode for now */ + if (db->db.db_object == DMU_META_DNODE_OBJECT) + return; + + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); + if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) + match_object = TRUE; + if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { + int datablkshift = dn->dn_datablkshift ? + dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int shift = datablkshift + epbs * db->db_level; + uint64_t beginblk = shift >= 64 ? 0 : + (txh->txh_arg1 >> shift); + uint64_t endblk = shift >= 64 ? 0 : + ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); + uint64_t blkid = db->db_blkid; + + /* XXX txh_arg2 better not be zero... */ + + dprintf("found txh type %x beginblk=%llx endblk=%llx\n", + txh->txh_type, beginblk, endblk); + + switch (txh->txh_type) { + case THT_WRITE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + /* + * We will let this hold work for the bonus + * buffer so that we don't need to hold it + * when creating a new object. + */ + if (blkid == DB_BONUS_BLKID) + match_offset = TRUE; + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; + case THT_FREE: + /* + * We will dirty all the level 1 blocks in + * the free range and perhaps the first and + * last level 0 block. + */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + break; + case THT_BONUS: + if (blkid == DB_BONUS_BLKID) + match_offset = TRUE; + break; + case THT_ZAP: + match_offset = TRUE; + break; + case THT_NEWOBJECT: + match_object = TRUE; + break; + default: + ASSERT(!"bad txh_type"); + } + } + if (match_object && match_offset) + return; + } + panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", + (u_longlong_t)db->db.db_object, db->db_level, + (u_longlong_t)db->db_blkid); +} +#endif + +static int +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + dmu_tx_hold_t *txh; + spa_t *spa = tx->tx_pool->dp_spa; + uint64_t memory, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; + + ASSERT3U(tx->tx_txg, ==, 0); + + if (tx->tx_err) + return (tx->tx_err); + + if (spa_suspended(spa)) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. + */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); + tx->tx_needassign_txh = NULL; + + /* + * NB: No error returns are allowed after txg_hold_open, but + * before processing the dnode holds, due to the + * dmu_tx_unassign() logic. + */ + + towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + if (dn != NULL) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_assigned_txg == tx->tx_txg - 1) { + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = txh; + return (ERESTART); + } + if (dn->dn_assigned_txg == 0) + dn->dn_assigned_txg = tx->tx_txg; + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + towrite += txh->txh_space_towrite; + tofree += txh->txh_space_tofree; + tooverwrite += txh->txh_space_tooverwrite; + tounref += txh->txh_space_tounref; + tohold += txh->txh_memory_tohold; + fudge += txh->txh_fudge; + } + + /* + * NB: This check must be after we've held the dnodes, so that + * the dmu_tx_unassign() logic will work properly + */ + if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) + return (ERESTART); + + /* + * If a snapshot has been taken since we made our estimates, + * assume that we won't be able to free or overwrite anything. + */ + if (tx->tx_objset && + dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + tx->tx_lastsnap_txg) { + towrite += tooverwrite; + tooverwrite = tofree = 0; + } + + /* needed allocation: worst-case estimate of write space */ + asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); + /* freed space estimate: worst-case overwrite + free estimate */ + fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; + /* convert unrefd space to worst-case estimate */ + usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + /* calculate memory footprint estimate */ + memory = towrite + tooverwrite + tohold; + +#ifdef ZFS_DEBUG + /* + * Add in 'tohold' to account for our dirty holds on this memory + * XXX - the "fudge" factor is to account for skipped blocks that + * we missed because dnode_next_offset() misses in-core-only blocks. + */ + tx->tx_space_towrite = asize + + spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); + tx->tx_space_tofree = tofree; + tx->tx_space_tooverwrite = tooverwrite; + tx->tx_space_tounref = tounref; +#endif + + if (tx->tx_dir && asize != 0) { + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); + if (err) + return (err); + } + + return (0); +} + +static void +dmu_tx_unassign(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + if (tx->tx_txg == 0) + return; + + txg_rele_to_quiesce(&tx->tx_txgh); + + for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_lasttried_txg = tx->tx_txg; + tx->tx_txg = 0; +} + +/* + * Assign tx to a transaction group. txg_how can be one of: + * + * (1) TXG_WAIT. If the current open txg is full, waits until there's + * a new one. This should be used when you're not holding locks. + * If will only fail if we're truly out of space (or over quota). + * + * (2) TXG_NOWAIT. If we can't assign into the current open txg without + * blocking, returns immediately with ERESTART. This should be used + * whenever you're holding locks. On an ERESTART error, the caller + * should drop locks, do a dmu_tx_wait(tx), and try again. + * + * (3) A specific txg. Use this if you need to ensure that multiple + * transactions all sync in the same txg. Like TXG_NOWAIT, it + * returns ERESTART if it can't assign you into the requested txg. + */ +int +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + int err; + + ASSERT(tx->tx_txg == 0); + ASSERT(txg_how != 0); + ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { + dmu_tx_unassign(tx); + + if (err != ERESTART || txg_how != TXG_WAIT) + return (err); + + dmu_tx_wait(tx); + } + + txg_rele_to_quiesce(&tx->tx_txgh); + + return (0); +} + +void +dmu_tx_wait(dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + + ASSERT(tx->tx_txg == 0); + + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then his + * tx_lasttried_txg would not have been assigned. + */ + if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { + dnode_t *dn = tx->tx_needassign_txh->txh_dnode; + + mutex_enter(&dn->dn_mtx); + while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) + cv_wait(&dn->dn_notxholds, &dn->dn_mtx); + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = NULL; + } else { + txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + } +} + +void +dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) +{ +#ifdef ZFS_DEBUG + if (tx->tx_dir == NULL || delta == 0) + return; + + if (delta > 0) { + ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, + tx->tx_space_towrite); + (void) refcount_add_many(&tx->tx_space_written, delta, NULL); + } else { + (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); + } +#endif +} + +void +dmu_tx_commit(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg != 0); + + while (txh = list_head(&tx->tx_holds)) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + dnode_rele(dn, tx); + } + + if (tx->tx_tempreserve_cookie) + dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + + if (tx->tx_anyobj == FALSE) + txg_rele_to_sync(&tx->tx_txgh); + list_destroy(&tx->tx_holds); +#ifdef ZFS_DEBUG + dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", + tx->tx_space_towrite, refcount_count(&tx->tx_space_written), + tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +void +dmu_tx_abort(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + while (txh = list_head(&tx->tx_holds)) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn != NULL) + dnode_rele(dn, tx); + } + list_destroy(&tx->tx_holds); +#ifdef ZFS_DEBUG + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +uint64_t +dmu_tx_get_txg(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + return (tx->tx_txg); +} diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c new file mode 100644 index 000000000..4d79fe98e --- /dev/null +++ b/module/zfs/dmu_zfetch.c @@ -0,0 +1,651 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_zfetch.h> +#include <sys/dmu.h> +#include <sys/dbuf.h> + +/* + * I'm against tune-ables, but these should probably exist as tweakable globals + * until we can get this working the way we want it to. + */ + +int zfs_prefetch_disable = 0; + +/* max # of streams per zfetch */ +uint32_t zfetch_max_streams = 8; +/* min time before stream reclaim */ +uint32_t zfetch_min_sec_reap = 2; +/* max number of blocks to fetch at a time */ +uint32_t zfetch_block_cap = 256; +/* number of bytes in a array_read at which we stop prefetching (1Mb) */ +uint64_t zfetch_array_rd_sz = 1024 * 1024; + +/* forward decls for static routines */ +static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); +static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); +static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); +static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); +static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); +static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); +static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); +static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); + +/* + * Given a zfetch structure and a zstream structure, determine whether the + * blocks to be read are part of a co-linear pair of existing prefetch + * streams. If a set is found, coalesce the streams, removing one, and + * configure the prefetch so it looks for a strided access pattern. + * + * In other words: if we find two sequential access streams that are + * the same length and distance N appart, and this read is N from the + * last stream, then we are probably in a strided access pattern. So + * combine the two sequential streams into a single strided stream. + * + * If no co-linear streams are found, return NULL. + */ +static int +dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) +{ + zstream_t *z_walk; + zstream_t *z_comp; + + if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) + return (0); + + if (zh == NULL) { + rw_exit(&zf->zf_rwlock); + return (0); + } + + for (z_walk = list_head(&zf->zf_stream); z_walk; + z_walk = list_next(&zf->zf_stream, z_walk)) { + for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; + z_comp = list_next(&zf->zf_stream, z_comp)) { + int64_t diff; + + if (z_walk->zst_len != z_walk->zst_stride || + z_comp->zst_len != z_comp->zst_stride) { + continue; + } + + diff = z_comp->zst_offset - z_walk->zst_offset; + if (z_comp->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + + diff = z_walk->zst_offset - z_comp->zst_offset; + if (z_walk->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + } + } + + rw_exit(&zf->zf_rwlock); + return (0); +} + +/* + * Given a zstream_t, determine the bounds of the prefetch. Then call the + * routine that actually prefetches the individual blocks. + */ +static void +dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) +{ + uint64_t prefetch_tail; + uint64_t prefetch_limit; + uint64_t prefetch_ofst; + uint64_t prefetch_len; + uint64_t blocks_fetched; + + zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); + zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); + + prefetch_tail = MAX((int64_t)zs->zst_ph_offset, + (int64_t)(zs->zst_offset + zs->zst_stride)); + /* + * XXX: use a faster division method? + */ + prefetch_limit = zs->zst_offset + zs->zst_len + + (zs->zst_cap * zs->zst_stride) / zs->zst_len; + + while (prefetch_tail < prefetch_limit) { + prefetch_ofst = zs->zst_offset + zs->zst_direction * + (prefetch_tail - zs->zst_offset); + + prefetch_len = zs->zst_len; + + /* + * Don't prefetch beyond the end of the file, if working + * backwards. + */ + if ((zs->zst_direction == ZFETCH_BACKWARD) && + (prefetch_ofst > prefetch_tail)) { + prefetch_len += prefetch_ofst; + prefetch_ofst = 0; + } + + /* don't prefetch more than we're supposed to */ + if (prefetch_len > zs->zst_len) + break; + + blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, + prefetch_ofst, zs->zst_len); + + prefetch_tail += zs->zst_stride; + /* stop if we've run out of stuff to prefetch */ + if (blocks_fetched < zs->zst_len) + break; + } + zs->zst_ph_offset = prefetch_tail; + zs->zst_last = lbolt; +} + +/* + * This takes a pointer to a zfetch structure and a dnode. It performs the + * necessary setup for the zfetch structure, grokking data from the + * associated dnode. + */ +void +dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) +{ + if (zf == NULL) { + return; + } + + zf->zf_dnode = dno; + zf->zf_stream_cnt = 0; + zf->zf_alloc_fail = 0; + + list_create(&zf->zf_stream, sizeof (zstream_t), + offsetof(zstream_t, zst_node)); + + rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); +} + +/* + * This function computes the actual size, in blocks, that can be prefetched, + * and fetches it. + */ +static uint64_t +dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + uint64_t i; + + fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); + + for (i = 0; i < fetchsz; i++) { + dbuf_prefetch(dn, blkid + i); + } + + return (fetchsz); +} + +/* + * this function returns the number of blocks that would be prefetched, based + * upon the supplied dnode, blockid, and nblks. This is used so that we can + * update streams in place, and then prefetch with their old value after the + * fact. This way, we can delay the prefetch, but subsequent accesses to the + * stream won't result in the same data being prefetched multiple times. + */ +static uint64_t +dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + + if (blkid > dn->dn_maxblkid) { + return (0); + } + + /* compute fetch size */ + if (blkid + nblks + 1 > dn->dn_maxblkid) { + fetchsz = (dn->dn_maxblkid - blkid) + 1; + ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); + } else { + fetchsz = nblks; + } + + + return (fetchsz); +} + +/* + * given a zfetch and a zsearch structure, see if there is an associated zstream + * for this block read. If so, it starts a prefetch for the stream it + * located and returns true, otherwise it returns false + */ +static int +dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) +{ + zstream_t *zs; + int64_t diff; + int reset = !prefetched; + int rc = 0; + + if (zh == NULL) + return (0); + + /* + * XXX: This locking strategy is a bit coarse; however, it's impact has + * yet to be tested. If this turns out to be an issue, it can be + * modified in a number of different ways. + */ + + rw_enter(&zf->zf_rwlock, RW_READER); +top: + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + /* + * XXX - should this be an assert? + */ + if (zs->zst_len == 0) { + /* bogus stream */ + continue; + } + + /* + * We hit this case when we are in a strided prefetch stream: + * we will read "len" blocks before "striding". + */ + if (zh->zst_offset >= zs->zst_offset && + zh->zst_offset < zs->zst_offset + zs->zst_len) { + /* already fetched */ + rc = 1; + goto out; + } + + /* + * This is the forward sequential read case: we increment + * len by one each time we hit here, so we will enter this + * case on every read. + */ + if (zh->zst_offset == zs->zst_offset + zs->zst_len) { + + reset = !prefetched && zs->zst_len > 1; + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset + zs->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + zs->zst_len += zh->zst_len; + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_offset += diff; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : 0; + } + zs->zst_direction = ZFETCH_FORWARD; + + break; + + /* + * Same as above, but reading backwards through the file. + */ + } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { + /* backwards sequential access */ + + reset = !prefetched && zs->zst_len > 1; + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset - zh->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zh->zst_len ? + zs->zst_offset - zh->zst_len : 0; + zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? + zs->zst_ph_offset - zh->zst_len : 0; + zs->zst_len += zh->zst_len; + + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_ph_offset = zs->zst_ph_offset > diff ? + zs->zst_ph_offset - diff : 0; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : zs->zst_len; + } + zs->zst_direction = ZFETCH_BACKWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided forward access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset += zs->zst_stride; + zs->zst_direction = ZFETCH_FORWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided reverse access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zs->zst_stride ? + zs->zst_offset - zs->zst_stride : 0; + zs->zst_ph_offset = (zs->zst_ph_offset > + (2 * zs->zst_stride)) ? + (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; + zs->zst_direction = ZFETCH_BACKWARD; + + break; + } + } + + if (zs) { + if (reset) { + zstream_t *remove = zs; + + rc = 0; + mutex_exit(&zs->zst_lock); + rw_exit(&zf->zf_rwlock); + rw_enter(&zf->zf_rwlock, RW_WRITER); + /* + * Relocate the stream, in case someone removes + * it while we were acquiring the WRITER lock. + */ + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + if (zs == remove) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + break; + } + } + } else { + rc = 1; + dmu_zfetch_dofetch(zf, zs); + mutex_exit(&zs->zst_lock); + } + } +out: + rw_exit(&zf->zf_rwlock); + return (rc); +} + +/* + * Clean-up state associated with a zfetch structure. This frees allocated + * structure members, empties the zf_stream tree, and generally makes things + * nice. This doesn't free the zfetch_t itself, that's left to the caller. + */ +void +dmu_zfetch_rele(zfetch_t *zf) +{ + zstream_t *zs; + zstream_t *zs_next; + + ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); + + for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + } + list_destroy(&zf->zf_stream); + rw_destroy(&zf->zf_rwlock); + + zf->zf_dnode = NULL; +} + +/* + * Given a zfetch and zstream structure, insert the zstream structure into the + * AVL tree contained within the zfetch structure. Peform the appropriate + * book-keeping. It is possible that another thread has inserted a stream which + * matches one that we are about to insert, so we must be sure to check for this + * case. If one is found, return failure, and let the caller cleanup the + * duplicates. + */ +static int +dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) +{ + zstream_t *zs_walk; + zstream_t *zs_next; + + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { + zs_next = list_next(&zf->zf_stream, zs_walk); + + if (dmu_zfetch_streams_equal(zs_walk, zs)) { + return (0); + } + } + + list_insert_head(&zf->zf_stream, zs); + zf->zf_stream_cnt++; + + return (1); +} + + +/* + * Walk the list of zstreams in the given zfetch, find an old one (by time), and + * reclaim it for use by the caller. + */ +static zstream_t * +dmu_zfetch_stream_reclaim(zfetch_t *zf) +{ + zstream_t *zs; + + if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) + return (0); + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap) + break; + } + + if (zs) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + bzero(zs, sizeof (zstream_t)); + } else { + zf->zf_alloc_fail++; + } + rw_exit(&zf->zf_rwlock); + + return (zs); +} + +/* + * Given a zfetch and zstream structure, remove the zstream structure from its + * container in the zfetch structure. Perform the appropriate book-keeping. + */ +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + list_remove(&zf->zf_stream, zs); + zf->zf_stream_cnt--; +} + +static int +dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) +{ + if (zs1->zst_offset != zs2->zst_offset) + return (0); + + if (zs1->zst_len != zs2->zst_len) + return (0); + + if (zs1->zst_stride != zs2->zst_stride) + return (0); + + if (zs1->zst_ph_offset != zs2->zst_ph_offset) + return (0); + + if (zs1->zst_cap != zs2->zst_cap) + return (0); + + if (zs1->zst_direction != zs2->zst_direction) + return (0); + + return (1); +} + +/* + * This is the prefetch entry point. It calls all of the other dmu_zfetch + * routines to create, delete, find, or operate upon prefetch streams. + */ +void +dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) +{ + zstream_t zst; + zstream_t *newstream; + int fetched; + int inserted; + unsigned int blkshft; + uint64_t blksz; + + if (zfs_prefetch_disable) + return; + + /* files that aren't ln2 blocksz are only one block -- nothing to do */ + if (!zf->zf_dnode->dn_datablkshift) + return; + + /* convert offset and size, into blockid and nblocks */ + blkshft = zf->zf_dnode->dn_datablkshift; + blksz = (1 << blkshft); + + bzero(&zst, sizeof (zstream_t)); + zst.zst_offset = offset >> blkshft; + zst.zst_len = (P2ROUNDUP(offset + size, blksz) - + P2ALIGN(offset, blksz)) >> blkshft; + + fetched = dmu_zfetch_find(zf, &zst, prefetched); + if (!fetched) { + fetched = dmu_zfetch_colinear(zf, &zst); + } + + if (!fetched) { + newstream = dmu_zfetch_stream_reclaim(zf); + + /* + * we still couldn't find a stream, drop the lock, and allocate + * one if possible. Otherwise, give up and go home. + */ + if (newstream == NULL) { + uint64_t maxblocks; + uint32_t max_streams; + uint32_t cur_streams; + + cur_streams = zf->zf_stream_cnt; + maxblocks = zf->zf_dnode->dn_maxblkid; + + max_streams = MIN(zfetch_max_streams, + (maxblocks / zfetch_block_cap)); + if (max_streams == 0) { + max_streams++; + } + + if (cur_streams >= max_streams) { + return; + } + + newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); + } + + newstream->zst_offset = zst.zst_offset; + newstream->zst_len = zst.zst_len; + newstream->zst_stride = zst.zst_len; + newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; + newstream->zst_cap = zst.zst_len; + newstream->zst_direction = ZFETCH_FORWARD; + newstream->zst_last = lbolt; + + mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); + + rw_enter(&zf->zf_rwlock, RW_WRITER); + inserted = dmu_zfetch_stream_insert(zf, newstream); + rw_exit(&zf->zf_rwlock); + + if (!inserted) { + mutex_destroy(&newstream->zst_lock); + kmem_free(newstream, sizeof (zstream_t)); + } + } +} diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c new file mode 100644 index 000000000..e77834d60 --- /dev/null +++ b/module/zfs/dnode.c @@ -0,0 +1,1443 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static int free_range_compar(const void *node1, const void *node2); + +static kmem_cache_t *dnode_cache; + +static dnode_phys_t dnode_phys_zero; + +int zfs_default_bs = SPA_MINBLOCKSHIFT; +int zfs_default_ibs = DN_MAX_INDBLKSHIFT; + +/* ARGSUSED */ +static int +dnode_cons(void *arg, void *unused, int kmflag) +{ + int i; + dnode_t *dn = arg; + bzero(dn, sizeof (dnode_t)); + + rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + refcount_create(&dn->dn_holds); + refcount_create(&dn->dn_tx_holds); + + for (i = 0; i < TXG_SIZE; i++) { + avl_create(&dn->dn_ranges[i], free_range_compar, + sizeof (free_range_t), + offsetof(struct free_range, fr_node)); + list_create(&dn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + + list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + return (0); +} + +/* ARGSUSED */ +static void +dnode_dest(void *arg, void *unused) +{ + int i; + dnode_t *dn = arg; + + rw_destroy(&dn->dn_struct_rwlock); + mutex_destroy(&dn->dn_mtx); + mutex_destroy(&dn->dn_dbufs_mtx); + refcount_destroy(&dn->dn_holds); + refcount_destroy(&dn->dn_tx_holds); + + for (i = 0; i < TXG_SIZE; i++) { + avl_destroy(&dn->dn_ranges[i]); + list_destroy(&dn->dn_dirty_records[i]); + } + + list_destroy(&dn->dn_dbufs); +} + +void +dnode_init(void) +{ + dnode_cache = kmem_cache_create("dnode_t", + sizeof (dnode_t), + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); +} + +void +dnode_fini(void) +{ + kmem_cache_destroy(dnode_cache); +} + + +#ifdef ZFS_DEBUG +void +dnode_verify(dnode_t *dn) +{ + int drop_struct_lock = FALSE; + + ASSERT(dn->dn_phys); + ASSERT(dn->dn_objset); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + + if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) + return; + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { + int i; + ASSERT3U(dn->dn_indblkshift, >=, 0); + ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); + if (dn->dn_datablkshift) { + ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); + ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); + } + ASSERT3U(dn->dn_nlevels, <=, 30); + ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT3U(dn->dn_nblkptr, >=, 1); + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_datablksz, ==, + dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); + ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); + } + } + if (dn->dn_phys->dn_type != DMU_OT_NONE) + ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); + if (dn->dn_dbuf != NULL) { + ASSERT3P(dn->dn_phys, ==, + (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +} +#endif + +void +dnode_byteswap(dnode_phys_t *dnp) +{ + uint64_t *buf64 = (void*)&dnp->dn_blkptr; + int i; + + if (dnp->dn_type == DMU_OT_NONE) { + bzero(dnp, sizeof (dnode_phys_t)); + return; + } + + dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); + dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); + dnp->dn_used = BSWAP_64(dnp->dn_used); + + /* + * dn_nblkptr is only one byte, so it's OK to read it in either + * byte order. We can't read dn_bouslen. + */ + ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); + ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); + for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) + buf64[i] = BSWAP_64(buf64[i]); + + /* + * OK to check dn_bonuslen for zero, because it won't matter if + * we have the wrong byte order. This is necessary because the + * dnode dnode is smaller than a regular dnode. + */ + if (dnp->dn_bonuslen != 0) { + /* + * Note that the bonus length calculated here may be + * longer than the actual bonus buffer. This is because + * we always put the bonus buffer after the last block + * pointer (instead of packing it against the end of the + * dnode buffer). + */ + int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); + size_t len = DN_MAX_BONUSLEN - off; + ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); + dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + } +} + +void +dnode_buf_byteswap(void *vbuf, size_t size) +{ + dnode_phys_t *buf = vbuf; + int i; + + ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); + ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); + + size >>= DNODE_SHIFT; + for (i = 0; i < size; i++) { + dnode_byteswap(buf); + buf++; + } +} + +static int +free_range_compar(const void *node1, const void *node2) +{ + const free_range_t *rp1 = node1; + const free_range_t *rp2 = node2; + + if (rp1->fr_blkid < rp2->fr_blkid) + return (-1); + else if (rp1->fr_blkid > rp2->fr_blkid) + return (1); + else return (0); +} + +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + +static void +dnode_setdblksz(dnode_t *dn, int size) +{ + ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, >=, SPA_MINBLOCKSIZE); + ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, + 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); + dn->dn_datablksz = size; + dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; + dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0; +} + +static dnode_t * +dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, + uint64_t object) +{ + dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + (void) dnode_cons(dn, NULL, 0); /* XXX */ + + dn->dn_objset = os; + dn->dn_object = object; + dn->dn_dbuf = db; + dn->dn_phys = dnp; + + if (dnp->dn_datablkszsec) + dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_indblkshift = dnp->dn_indblkshift; + dn->dn_nlevels = dnp->dn_nlevels; + dn->dn_type = dnp->dn_type; + dn->dn_nblkptr = dnp->dn_nblkptr; + dn->dn_checksum = dnp->dn_checksum; + dn->dn_compress = dnp->dn_compress; + dn->dn_bonustype = dnp->dn_bonustype; + dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_maxblkid = dnp->dn_maxblkid; + + dmu_zfetch_init(&dn->dn_zfetch, dn); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + mutex_enter(&os->os_lock); + list_insert_head(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + arc_space_consume(sizeof (dnode_t)); + return (dn); +} + +static void +dnode_destroy(dnode_t *dn) +{ + objset_impl_t *os = dn->dn_objset; + +#ifdef ZFS_DEBUG + int i; + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); + ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); + } + ASSERT(NULL == list_head(&dn->dn_dbufs)); +#endif + + mutex_enter(&os->os_lock); + list_remove(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + dmu_zfetch_rele(&dn->dn_zfetch); + if (dn->dn_bonus) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t)); +} + +void +dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int i; + + if (blocksize == 0) + blocksize = 1 << zfs_default_bs; + else if (blocksize > SPA_MAXBLOCKSIZE) + blocksize = SPA_MAXBLOCKSIZE; + else + blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); + + if (ibs == 0) + ibs = zfs_default_ibs; + + ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); + + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, + dn->dn_object, tx->tx_txg, blocksize, ibs); + + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); + ASSERT(ot != DMU_OT_NONE); + ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT3U(dn->dn_maxblkid, ==, 0); + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); + ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); + } + + dn->dn_type = ot; + dnode_setdblksz(dn, blocksize); + dn->dn_indblkshift = ibs; + dn->dn_nlevels = 1; + dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + dn->dn_dirtyctx = 0; + + dn->dn_free_txg = 0; + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + + dn->dn_allocated_txg = tx->tx_txg; + + dnode_setdirty(dn, tx); + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; +} + +void +dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int i, old_nblkptr; + dmu_buf_impl_t *db = NULL; + + ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); + ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + ASSERT(tx->tx_txg != 0); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + + for (i = 0; i < TXG_SIZE; i++) + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + + /* clean up any unreferenced dbufs */ + dnode_evict_dbufs(dn); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* + * XXX I should really have a generation number to tell if we + * need to do this... + */ + if (blocksize != dn->dn_datablksz || + dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { + /* free all old data */ + dnode_free_range(dn, 0, -1ULL, tx); + } + + /* change blocksize */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (blocksize != dn->dn_datablksz && + (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + list_head(&dn->dn_dbufs) != NULL)) { + db = dbuf_hold(dn, 0, FTAG); + dbuf_new_size(db, blocksize, tx); + } + dnode_setdblksz(dn, blocksize); + dnode_setdirty(dn, tx); + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + rw_exit(&dn->dn_struct_rwlock); + if (db) + dbuf_rele(db, FTAG); + + /* change type */ + dn->dn_type = ot; + + /* change bonus size and type */ + mutex_enter(&dn->dn_mtx); + old_nblkptr = dn->dn_nblkptr; + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + + /* XXX - for now, we can't make nblkptr smaller */ + ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); + + /* fix up the bonus db_size if dn_nblkptr has changed */ + if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + dn->dn_bonus->db.db_size = + DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } + + dn->dn_allocated_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); +} + +void +dnode_special_close(dnode_t *dn) +{ + /* + * Wait for final references to the dnode to clear. This can + * only happen if the arc is asyncronously evicting state that + * has a hold on this dnode while we are trying to evict this + * dnode. + */ + while (refcount_count(&dn->dn_holds) > 0) + delay(1); + dnode_destroy(dn); +} + +dnode_t * +dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) +{ + dnode_t *dn = dnode_create(os, dnp, NULL, object); + DNODE_VERIFY(dn); + return (dn); +} + +static void +dnode_buf_pageout(dmu_buf_t *db, void *arg) +{ + dnode_t **children_dnodes = arg; + int i; + int epb = db->db_size >> DNODE_SHIFT; + + for (i = 0; i < epb; i++) { + dnode_t *dn = children_dnodes[i]; + int n; + + if (dn == NULL) + continue; +#ifdef ZFS_DEBUG + /* + * If there are holds on this dnode, then there should + * be holds on the dnode's containing dbuf as well; thus + * it wouldn't be eligable for eviction and this function + * would not have been called. + */ + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(list_head(&dn->dn_dbufs) == NULL); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + + for (n = 0; n < TXG_SIZE; n++) + ASSERT(!list_link_active(&dn->dn_dirty_link[n])); +#endif + children_dnodes[i] = NULL; + dnode_destroy(dn); + } + kmem_free(children_dnodes, epb * sizeof (dnode_t *)); +} + +/* + * errors: + * EINVAL - invalid object number. + * EIO - i/o error. + * succeeds even for free dnodes. + */ +int +dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, + void *tag, dnode_t **dnp) +{ + int epb, idx, err; + int drop_struct_lock = FALSE; + int type; + uint64_t blk; + dnode_t *mdn, *dn; + dmu_buf_impl_t *db; + dnode_t **children_dnodes; + + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything*. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + + if (object == 0 || object >= DN_MAX_OBJECT) + return (EINVAL); + + mdn = os->os_meta_dnode; + + DNODE_VERIFY(mdn); + + if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + + db = dbuf_hold(mdn, blk, FTAG); + if (drop_struct_lock) + rw_exit(&mdn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, FTAG); + return (err); + } + + ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); + epb = db->db.db_size >> DNODE_SHIFT; + + idx = object & (epb-1); + + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + dnode_t **winner; + children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), + KM_SLEEP); + if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, + dnode_buf_pageout)) { + kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + children_dnodes = winner; + } + } + + if ((dn = children_dnodes[idx]) == NULL) { + dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; + dnode_t *winner; + + dn = dnode_create(os, dnp, db, object); + winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); + if (winner != NULL) { + dnode_destroy(dn); + dn = winner; + } + } + + mutex_enter(&dn->dn_mtx); + type = dn->dn_type; + if (dn->dn_free_txg || + ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { + mutex_exit(&dn->dn_mtx); + dbuf_rele(db, FTAG); + return (type == DMU_OT_NONE ? ENOENT : EEXIST); + } + mutex_exit(&dn->dn_mtx); + + if (refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dn); + + DNODE_VERIFY(dn); + ASSERT3P(dn->dn_dbuf, ==, db); + ASSERT3U(dn->dn_object, ==, object); + dbuf_rele(db, FTAG); + + *dnp = dn; + return (0); +} + +/* + * Return held dnode if the object is allocated, NULL if not. + */ +int +dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); +} + +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. + */ +boolean_t +dnode_add_ref(dnode_t *dn, void *tag) +{ + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); +} + +void +dnode_rele(dnode_t *dn, void *tag) +{ + uint64_t refs; + + mutex_enter(&dn->dn_mtx); + refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ + if (refs == 0 && dn->dn_dbuf) + dbuf_rele(dn->dn_dbuf, dn); +} + +void +dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + + if (dn->dn_object == DMU_META_DNODE_OBJECT) + return; + + DNODE_VERIFY(dn); + +#ifdef ZFS_DEBUG + mutex_enter(&dn->dn_mtx); + ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); + /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + mutex_exit(&dn->dn_mtx); +#endif + + mutex_enter(&os->os_lock); + + /* + * If we are already marked dirty, we're done. + */ + if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + mutex_exit(&os->os_lock); + return; + } + + ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); + ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); + + dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", + dn->dn_object, txg); + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { + list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); + } else { + list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); + } + + mutex_exit(&os->os_lock); + + /* + * The dnode maintains a hold on its containing dbuf as + * long as there are holds on it. Each instantiated child + * dbuf maintaines a hold on the dnode. When the last child + * drops its hold, the dnode will drop its hold on the + * containing dbuf. We add a "dirty hold" here so that the + * dnode will hang around after we finish processing its + * children. + */ + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + + (void) dbuf_dirty(dn->dn_dbuf, tx); + + dsl_dataset_dirty(os->os_dsl_dataset, tx); +} + +void +dnode_free(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); + + /* we should be the only holder... hopefully */ + /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { + mutex_exit(&dn->dn_mtx); + return; + } + dn->dn_free_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + /* + * If the dnode is already dirty, it needs to be moved from + * the dirty list to the free list. + */ + mutex_enter(&dn->dn_objset->os_lock); + if (list_link_active(&dn->dn_dirty_link[txgoff])) { + list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); + list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); + mutex_exit(&dn->dn_objset->os_lock); + } else { + mutex_exit(&dn->dn_objset->os_lock); + dnode_setdirty(dn, tx); + } +} + +/* + * Try to change the block size for the indicated dnode. This can only + * succeed if there are no blocks allocated or dirty beyond first block + */ +int +dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + int err; + + if (size == 0) + size = SPA_MINBLOCKSIZE; + if (size > SPA_MAXBLOCKSIZE) + size = SPA_MAXBLOCKSIZE; + else + size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); + + if (ibs == dn->dn_indblkshift) + ibs = 0; + + if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + return (0); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* Check for any allocated blocks beyond the first */ + if (dn->dn_phys->dn_maxblkid != 0) + goto fail; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + + if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { + mutex_exit(&dn->dn_dbufs_mtx); + goto fail; + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + if (ibs && dn->dn_nlevels != 1) + goto fail; + + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + if (err == 0) + dbuf_new_size(db, size, tx); + else if (err != ENOENT) + goto fail; + + dnode_setdblksz(dn, size); + dnode_setdirty(dn, tx); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (ibs) { + dn->dn_indblkshift = ibs; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + } + /* rele after we have fixed the blocksize in the dnode */ + if (db) + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); + return (0); + +fail: + rw_exit(&dn->dn_struct_rwlock); + return (ENOTSUP); +} + +/* read-holding callers must not rely on the lock being continuously held */ +void +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int epbs, new_nlevels; + uint64_t sz; + + ASSERT(blkid != DB_BONUS_BLKID); + + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. + */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } + } + + if (blkid <= dn->dn_maxblkid) + goto out; + + dn->dn_maxblkid = blkid; + + /* + * Compute the number of levels necessary to support the new maxblkid. + */ + new_nlevels = 1; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (sz = dn->dn_nblkptr; + sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) + new_nlevels++; + + if (new_nlevels > dn->dn_nlevels) { + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + new = dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); + + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); + } + +out: + if (have_read) + rw_downgrade(&dn->dn_struct_rwlock); +} + +void +dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; + avl_index_t where; + free_range_t *rp; + free_range_t rp_tofind; + uint64_t endblk = blkid + nblks; + + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */ + + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + rp_tofind.fr_blkid = blkid; + rp = avl_find(tree, &rp_tofind, &where); + if (rp == NULL) + rp = avl_nearest(tree, where, AVL_BEFORE); + if (rp == NULL) + rp = avl_nearest(tree, where, AVL_AFTER); + + while (rp && (rp->fr_blkid <= blkid + nblks)) { + uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks; + free_range_t *nrp = AVL_NEXT(tree, rp); + + if (blkid <= rp->fr_blkid && endblk >= fr_endblk) { + /* clear this entire range */ + avl_remove(tree, rp); + kmem_free(rp, sizeof (free_range_t)); + } else if (blkid <= rp->fr_blkid && + endblk > rp->fr_blkid && endblk < fr_endblk) { + /* clear the beginning of this range */ + rp->fr_blkid = endblk; + rp->fr_nblks = fr_endblk - endblk; + } else if (blkid > rp->fr_blkid && blkid < fr_endblk && + endblk >= fr_endblk) { + /* clear the end of this range */ + rp->fr_nblks = blkid - rp->fr_blkid; + } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { + /* clear a chunk out of this range */ + free_range_t *new_rp = + kmem_alloc(sizeof (free_range_t), KM_SLEEP); + + new_rp->fr_blkid = endblk; + new_rp->fr_nblks = fr_endblk - endblk; + avl_insert_here(tree, new_rp, rp, AVL_AFTER); + rp->fr_nblks = blkid - rp->fr_blkid; + } + /* there may be no overlap */ + rp = nrp; + } +} + +void +dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + uint64_t blkoff, blkid, nblks; + int blksz, blkshift, head, tail; + int trunc = FALSE; + int epbs; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + if (len == -1ULL) { + len = UINT64_MAX - off; + trunc = TRUE; + } + + /* + * First, block align the region to free: + */ + if (ISP2(blksz)) { + head = P2NPHASE(off, blksz); + blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; + } else { + ASSERT(dn->dn_maxblkid == 0); + if (off == 0 && len >= blksz) { + /* Freeing the whole block; fast-track this request */ + blkid = 0; + nblks = 1; + goto done; + } else if (off >= blksz) { + /* Freeing past end-of-data */ + goto out; + } else { + /* Freeing part of the block. */ + head = blksz - off; + ASSERT3U(head, >, 0); + } + blkoff = off; + } + /* zero out any partial block data at the start of the range */ + if (head) { + ASSERT3U(blkoff + head, ==, blksz); + if (len < head) + head = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, + FTAG, &db) == 0) { + caddr_t data; + + /* don't dirty if it isn't on disk and isn't dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + data = db->db.db_data; + bzero(data + blkoff, head); + } + dbuf_rele(db, FTAG); + } + off += head; + len -= head; + } + + /* If the range was less than one block, we're done */ + if (len == 0) + goto out; + + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; + + ASSERT(ISP2(blksz)); + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); + } + dbuf_rele(db, FTAG); + } + len -= tail; + } + + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; + + /* + * Read in and mark all the level-1 indirects dirty, + * so that they will stay in memory until syncing phase. + * Always dirty the first and last indirect to make sure + * we dirty all the partial indirects. + */ + if (dn->dn_nlevels > 1) { + uint64_t i, first, last; + int shift = epbs + dn->dn_datablkshift; + + first = blkid >> epbs; + if (db = dbuf_hold_level(dn, 1, first, FTAG)) { + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + if (trunc) + last = dn->dn_maxblkid >> epbs; + else + last = (blkid + nblks - 1) >> epbs; + if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + for (i = first + 1; i < last; i++) { + uint64_t ibyte = i << shift; + int err; + + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); + i = ibyte >> shift; + if (err == ESRCH || i >= last) + break; + ASSERT(err == 0); + db = dbuf_hold_level(dn, 1, i, FTAG); + if (db) { + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + } + } +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, blkid, nblks, tx); + { + free_range_t *rp, *found; + avl_index_t where; + avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; + + /* Add new range to dn_ranges */ + rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP); + rp->fr_blkid = blkid; + rp->fr_nblks = nblks; + found = avl_find(tree, rp, &where); + ASSERT(found == NULL); + avl_insert(tree, rp, where); + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + } + mutex_exit(&dn->dn_mtx); + + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); + dnode_setdirty(dn, tx); +out: + if (trunc && dn->dn_maxblkid >= (off >> blkshift)) + dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); + + rw_exit(&dn->dn_struct_rwlock); +} + +/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ +uint64_t +dnode_block_freed(dnode_t *dn, uint64_t blkid) +{ + free_range_t range_tofind; + void *dp = spa_get_dsl(dn->dn_objset->os_spa); + int i; + + if (blkid == DB_BONUS_BLKID) + return (FALSE); + + /* + * If we're in the process of opening the pool, dp will not be + * set yet, but there shouldn't be anything dirty. + */ + if (dp == NULL) + return (FALSE); + + if (dn->dn_free_txg) + return (TRUE); + + /* + * If dn_datablkshift is not set, then there's only a single + * block, in which case there will never be a free range so it + * won't matter. + */ + range_tofind.fr_blkid = blkid; + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + free_range_t *range_found; + avl_index_t idx; + + range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx); + if (range_found) { + ASSERT(range_found->fr_nblks > 0); + break; + } + range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE); + if (range_found && + range_found->fr_blkid + range_found->fr_nblks > blkid) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* call from syncing context when we actually write/free space for this dnode */ +void +dnode_diduse_space(dnode_t *dn, int64_t delta) +{ + uint64_t space; + dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", + dn, dn->dn_phys, + (u_longlong_t)dn->dn_phys->dn_used, + (longlong_t)delta); + + mutex_enter(&dn->dn_mtx); + space = DN_USED_BYTES(dn->dn_phys); + if (delta > 0) { + ASSERT3U(space + delta, >=, space); /* no overflow */ + } else { + ASSERT3U(space, >=, -delta); /* no underflow */ + } + space += delta; + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { + ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); + ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); + dn->dn_phys->dn_used = space >> DEV_BSHIFT; + } else { + dn->dn_phys->dn_used = space; + dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; + } + mutex_exit(&dn->dn_mtx); +} + +/* + * Call when we think we're going to write/free space in open context. + * Be conservative (ie. OK to write less than this or free more than + * this, but don't write more or free less). + */ +void +dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + + if (space > 0) + space = spa_get_asize(os->os_spa, space); + + if (ds) + dsl_dir_willuse_space(ds->ds_dir, space, tx); + + dmu_tx_willuse_space(tx, space); +} + +static int +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, + int lvl, uint64_t blkfill, uint64_t txg) +{ + dmu_buf_impl_t *db = NULL; + void *data = NULL; + uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t epb = 1ULL << epbs; + uint64_t minfill, maxfill; + boolean_t hole; + int i, inc, error, span; + + dprintf("probing object %llu offset %llx level %d of %u\n", + dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + + hole = flags & DNODE_FIND_HOLE; + inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; + ASSERT(txg == 0 || !hole); + + if (lvl == dn->dn_phys->dn_nlevels) { + error = 0; + epb = dn->dn_phys->dn_nblkptr; + data = dn->dn_phys->dn_blkptr; + } else { + uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + if (error) { + if (error != ENOENT) + return (error); + if (hole) + return (0); + /* + * This can only happen when we are searching up + * the block tree for data. We don't really need to + * adjust the offset, as we will just end up looking + * at the pointer to this block in its parent, and its + * going to be unallocated, so we will skip over it. + */ + return (ESRCH); + } + error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); + if (error) { + dbuf_rele(db, FTAG); + return (error); + } + data = db->db.db_data; + } + + if (db && txg && + (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + /* + * This can only happen when we are searching up the tree + * and these conditions mean that we need to keep climbing. + */ + error = ESRCH; + } else if (lvl == 0) { + dnode_phys_t *dnp = data; + span = DNODE_SHIFT; + ASSERT(dn->dn_type == DMU_OT_DNODE); + + for (i = (*offset >> span) & (blkfill - 1); + i >= 0 && i < blkfill; i += inc) { + boolean_t newcontents = B_TRUE; + if (txg) { + int j; + newcontents = B_FALSE; + for (j = 0; j < dnp[i].dn_nblkptr; j++) { + if (dnp[i].dn_blkptr[j].blk_birth > txg) + newcontents = B_TRUE; + } + } + if (!dnp[i].dn_type == hole && newcontents) + break; + *offset += (1ULL << span) * inc; + } + if (i < 0 || i == blkfill) + error = ESRCH; + } else { + blkptr_t *bp = data; + span = (lvl - 1) * epbs + dn->dn_datablkshift; + minfill = 0; + maxfill = blkfill << ((lvl - 1) * epbs); + + if (hole) + maxfill--; + else + minfill++; + + for (i = (*offset >> span) & ((1ULL << epbs) - 1); + i >= 0 && i < epb; i += inc) { + if (bp[i].blk_fill >= minfill && + bp[i].blk_fill <= maxfill && + (hole || bp[i].blk_birth > txg)) + break; + if (inc < 0 && *offset < (1ULL << span)) + *offset = 0; + else + *offset += (1ULL << span) * inc; + } + if (i < 0 || i == epb) + error = ESRCH; + } + + if (db) + dbuf_rele(db, FTAG); + + return (error); +} + +/* + * Find the next hole, data, or sparse region at or after *offset. + * The value 'blkfill' tells us how many items we expect to find + * in an L0 data block; this value is 1 for normal objects, + * DNODES_PER_BLOCK for the meta dnode, and some fraction of + * DNODES_PER_BLOCK when searching for sparse regions thereof. + * + * Examples: + * + * dnode_next_offset(dn, flags, offset, 1, 1, 0); + * Finds the next/previous hole/data in a file. + * Used in dmu_offset_next(). + * + * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); + * Finds the next free/allocated dnode an objset's meta-dnode. + * Only finds objects that have new contents since txg (ie. + * bonus buffer changes and content removal are ignored). + * Used in dmu_object_next(). + * + * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * Finds the next L2 meta-dnode bp that's at most 1/4 full. + * Used in dmu_object_alloc(). + */ +int +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, + int minlvl, uint64_t blkfill, uint64_t txg) +{ + uint64_t initial_offset = *offset; + int lvl, maxlvl; + int error = 0; + + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (dn->dn_phys->dn_nlevels == 0) { + error = ESRCH; + goto out; + } + + if (dn->dn_datablkshift == 0) { + if (*offset < dn->dn_datablksz) { + if (flags & DNODE_FIND_HOLE) + *offset = dn->dn_datablksz; + } else { + error = ESRCH; + } + goto out; + } + + maxlvl = dn->dn_phys->dn_nlevels; + + for (lvl = minlvl; lvl <= maxlvl; lvl++) { + error = dnode_next_offset_level(dn, + flags, offset, lvl, blkfill, txg); + if (error != ESRCH) + break; + } + + while (error == 0 && --lvl >= minlvl) { + error = dnode_next_offset_level(dn, + flags, offset, lvl, blkfill, txg); + } + + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? + initial_offset < *offset : initial_offset > *offset)) + error = ESRCH; +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); + + return (error); +} diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c new file mode 100644 index 000000000..779cfc96f --- /dev/null +++ b/module/zfs/dnode_sync.c @@ -0,0 +1,623 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> + +static void +dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int txgoff = tx->tx_txg & TXG_MASK; + int nblkptr = dn->dn_phys->dn_nblkptr; + int old_toplvl = dn->dn_phys->dn_nlevels - 1; + int new_level = dn->dn_next_nlevels[txgoff]; + int i; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* this dnode can't be paged out because it's dirty */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); + + db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + ASSERT(db != NULL); + + dn->dn_phys->dn_nlevels = new_level; + dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, + dn->dn_object, dn->dn_phys->dn_nlevels); + + /* check for existing blkptrs in the dnode */ + for (i = 0; i < nblkptr; i++) + if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) + break; + if (i != nblkptr) { + /* transfer dnode's block pointers to new indirect block */ + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + ASSERT(db->db.db_data); + ASSERT(arc_released(db->db_buf)); + ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * nblkptr); + arc_buf_freeze(db->db_buf); + } + + /* set dbuf's parent pointers to new indirect buf */ + for (i = 0; i < nblkptr; i++) { + dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i); + + if (child == NULL) + continue; + ASSERT3P(child->db_dnode, ==, dn); + if (child->db_parent && child->db_parent != dn->dn_dbuf) { + ASSERT(child->db_parent->db_level == db->db_level); + ASSERT(child->db_blkptr != + &dn->dn_phys->dn_blkptr[child->db_blkid]); + mutex_exit(&child->db_mtx); + continue; + } + ASSERT(child->db_parent == NULL || + child->db_parent == dn->dn_dbuf); + + child->db_parent = db; + dbuf_add_ref(db, child); + if (db->db.db_data) + child->db_blkptr = (blkptr_t *)db->db.db_data + i; + else + child->db_blkptr = NULL; + dprintf_dbuf_bp(child, child->db_blkptr, + "changed db_blkptr to new indirect %s", ""); + + mutex_exit(&child->db_mtx); + } + + bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); +} + +static int +free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint64_t bytesfreed = 0; + int i, blocks_freed = 0; + + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); + + for (i = 0; i < num; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + + bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); + ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + bzero(bp, sizeof (blkptr_t)); + blocks_freed += 1; + } + dnode_diduse_space(dn, -bytesfreed); + return (blocks_freed); +} + +#ifdef ZFS_DEBUG +static void +free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ + int off, num; + int i, err, epbs; + uint64_t txg = tx->tx_txg; + + epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + off = start - (db->db_blkid * 1<<epbs); + num = end - start + 1; + + ASSERT3U(off, >=, 0); + ASSERT3U(num, >=, 0); + ASSERT3U(db->db_level, >, 0); + ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); + ASSERT(db->db_blkptr != NULL); + + for (i = off; i < off+num; i++) { + uint64_t *buf; + dmu_buf_impl_t *child; + dbuf_dirty_record_t *dr; + int j; + + ASSERT(db->db_level == 1); + + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + rw_exit(&db->db_dnode->dn_struct_rwlock); + if (err == ENOENT) + continue; + ASSERT(err == 0); + ASSERT(child->db_level == 0); + dr = child->db_last_dirty; + while (dr && dr->dr_txg > txg) + dr = dr->dr_next; + ASSERT(dr == NULL || dr->dr_txg == txg); + + /* data_old better be zeroed */ + if (dr) { + buf = dr->dt.dl.dr_data->b_data; + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); + } + } + } + + /* + * db_data better be zeroed unless it's dirty in a + * future txg. + */ + mutex_enter(&child->db_mtx); + buf = child->db.db_data; + if (buf != NULL && child->db_state != DB_FILL && + child->db_last_dirty == NULL) { + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); + } + } + } + mutex_exit(&child->db_mtx); + + dbuf_rele(child, FTAG); + } +} +#endif + +#define ALL -1 + +static int +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, + dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + blkptr_t *bp; + dmu_buf_impl_t *subdb; + uint64_t start, end, dbstart, dbend, i; + int epbs, shift, err; + int all = TRUE; + int blocks_freed = 0; + + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if we didn't get a dirty hold (because this block had just + * finished being written -- and so had no holds), and then this + * block got evicted before we got here. + */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + + arc_release(db->db_buf, db); + bp = (blkptr_t *)db->db.db_data; + + epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + shift = (db->db_level - 1) * epbs; + dbstart = db->db_blkid << epbs; + start = blkid >> shift; + if (dbstart < start) { + bp += start - dbstart; + all = FALSE; + } else { + start = dbstart; + } + dbend = ((db->db_blkid + 1) << epbs) - 1; + end = (blkid + nblks - 1) >> shift; + if (dbend <= end) + end = dbend; + else if (all) + all = trunc; + ASSERT3U(start, <=, end); + + if (db->db_level == 1) { + FREE_VERIFY(db, start, end, tx); + blocks_freed = free_blocks(dn, bp, end-start+1, tx); + arc_buf_freeze(db->db_buf); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); + } + + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); + ASSERT3U(err, ==, 0); + rw_exit(&dn->dn_struct_rwlock); + + if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { + ASSERT3P(subdb->db_blkptr, ==, bp); + blocks_freed += free_blocks(dn, bp, 1, tx); + } else { + all = FALSE; + } + dbuf_rele(subdb, FTAG); + } + arc_buf_freeze(db->db_buf); +#ifdef ZFS_DEBUG + bp -= (end-start)+1; + for (i = start; i <= end; i++, bp++) { + if (i == start && blkid != 0) + continue; + else if (i == end && !trunc) + continue; + ASSERT3U(bp->blk_birth, ==, 0); + } +#endif + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); +} + +/* + * free_range: Traverse the indicated range of the provided file + * and "free" all the blocks contained there. + */ +static void +dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + blkptr_t *bp = dn->dn_phys->dn_blkptr; + dmu_buf_impl_t *db; + int trunc, start, end, shift, i, err; + int dnlevel = dn->dn_phys->dn_nlevels; + + if (blkid > dn->dn_phys->dn_maxblkid) + return; + + ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); + trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; + if (trunc) + nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + + /* There are no indirect blocks in the object */ + if (dnlevel == 1) { + if (blkid >= dn->dn_phys->dn_nblkptr) { + /* this range was never made persistent */ + return; + } + ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); + (void) free_blocks(dn, bp + blkid, nblks, tx); + if (trunc) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); + } + return; + } + + shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + start = blkid >> shift; + ASSERT(start < dn->dn_phys->dn_nblkptr); + end = (blkid + nblks - 1) >> shift; + bp += start; + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); + ASSERT3U(err, ==, 0); + rw_exit(&dn->dn_struct_rwlock); + + if (free_children(db, blkid, nblks, trunc, tx) == ALL) { + ASSERT3P(db->db_blkptr, ==, bp); + (void) free_blocks(dn, bp, 1, tx); + } + dbuf_rele(db, FTAG); + } + if (trunc) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); + } +} + +/* + * Try to kick all the dnodes dbufs out of the cache... + */ +void +dnode_evict_dbufs(dnode_t *dn) +{ + int progress; + int pass = 0; + + do { + dmu_buf_impl_t *db, marker; + int evicting = FALSE; + + progress = FALSE; + mutex_enter(&dn->dn_dbufs_mtx); + list_insert_tail(&dn->dn_dbufs, &marker); + db = list_head(&dn->dn_dbufs); + for (; db != ▮ db = list_head(&dn->dn_dbufs)) { + list_remove(&dn->dn_dbufs, db); + list_insert_tail(&dn->dn_dbufs, db); + ASSERT3P(db->db_dnode, ==, dn); + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_EVICTING) { + progress = TRUE; + evicting = TRUE; + mutex_exit(&db->db_mtx); + } else if (refcount_is_zero(&db->db_holds)) { + progress = TRUE; + dbuf_clear(db); /* exits db_mtx for us */ + } else { + mutex_exit(&db->db_mtx); + } + + } + list_remove(&dn->dn_dbufs, &marker); + /* + * NB: we need to drop dn_dbufs_mtx between passes so + * that any DB_EVICTING dbufs can make progress. + * Ideally, we would have some cv we could wait on, but + * since we don't, just wait a bit to give the other + * thread a chance to run. + */ + mutex_exit(&dn->dn_dbufs_mtx); + if (evicting) + delay(1); + pass++; + ASSERT(pass < 100); /* sanity check */ + } while (progress); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + rw_exit(&dn->dn_struct_rwlock); +} + +static void +dnode_undirty_dbufs(list_t *list) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + + mutex_enter(&db->db_mtx); + /* XXX - use dbuf_undirty()? */ + list_remove(list, dr); + ASSERT(db->db_last_dirty == dr); + db->db_last_dirty = NULL; + db->db_dirtycnt -= 1; + if (db->db_level == 0) { + ASSERT(db->db_blkid == DB_BONUS_BLKID || + dr->dt.dl.dr_data == db->db_buf); + dbuf_unoverride(dr); + mutex_exit(&db->db_mtx); + } else { + mutex_exit(&db->db_mtx); + dnode_undirty_dbufs(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + dbuf_rele(db, (void *)(uintptr_t)txg); + } +} + +static void +dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * Our contents should have been freed in dnode_sync() by the + * free range record inserted by the caller of dnode_free(). + */ + ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); + + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); + dnode_evict_dbufs(dn); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* + * XXX - It would be nice to assert this, but we may still + * have residual holds from async evictions from the arc... + * + * zfs_obj_to_path() also depends on this being + * commented out. + * + * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + */ + + /* Undirty next bits */ + dn->dn_next_nlevels[txgoff] = 0; + dn->dn_next_indblkshift[txgoff] = 0; + dn->dn_next_blksz[txgoff] = 0; + + /* ASSERT(blkptrs are zero); */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(dn->dn_type != DMU_OT_NONE); + + ASSERT(dn->dn_free_txg > 0); + if (dn->dn_allocated_txg != dn->dn_free_txg) + dbuf_will_dirty(dn->dn_dbuf, tx); + bzero(dn->dn_phys, sizeof (dnode_phys_t)); + + mutex_enter(&dn->dn_mtx); + dn->dn_type = DMU_OT_NONE; + dn->dn_maxblkid = 0; + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + mutex_exit(&dn->dn_mtx); + + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + /* + * Now that we've released our hold, the dnode may + * be evicted, so we musn't access it. + */ +} + +/* + * Write out the dnode's dirty buffers. + * + * NOTE: The dnode is kept in memory by being dirty. Once the + * dirty bit is cleared, it may be evicted. Beware of this! + */ +void +dnode_sync(dnode_t *dn, dmu_tx_t *tx) +{ + free_range_t *rp; + dnode_phys_t *dnp = dn->dn_phys; + int txgoff = tx->tx_txg & TXG_MASK; + list_t *list = &dn->dn_dirty_records[txgoff]; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + DNODE_VERIFY(dn); + + ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + + mutex_enter(&dn->dn_mtx); + if (dn->dn_allocated_txg == tx->tx_txg) { + /* The dnode is newly allocated or reallocated */ + if (dnp->dn_type == DMU_OT_NONE) { + /* this is a first alloc, not a realloc */ + /* XXX shouldn't the phys already be zeroed? */ + bzero(dnp, DNODE_CORE_SIZE); + dnp->dn_nlevels = 1; + } + + if (dn->dn_nblkptr > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_nblkptr - dnp->dn_nblkptr)); + } + dnp->dn_type = dn->dn_type; + dnp->dn_bonustype = dn->dn_bonustype; + dnp->dn_bonuslen = dn->dn_bonuslen; + dnp->dn_nblkptr = dn->dn_nblkptr; + } + + ASSERT(dnp->dn_nlevels > 1 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + + if (dn->dn_next_blksz[txgoff]) { + ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], + SPA_MINBLOCKSIZE) == 0); + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || + dn->dn_maxblkid == 0 || list_head(list) != NULL || + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == + dnp->dn_datablkszsec); + dnp->dn_datablkszsec = + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; + dn->dn_next_blksz[txgoff] = 0; + } + + if (dn->dn_next_bonuslen[txgoff]) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + dn->dn_next_bonuslen[txgoff] = 0; + } + + if (dn->dn_next_indblkshift[txgoff]) { + ASSERT(dnp->dn_nlevels == 1); + dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; + dn->dn_next_indblkshift[txgoff] = 0; + } + + /* + * Just take the live (open-context) values for checksum and compress. + * Strictly speaking it's a future leak, but nothing bad happens if we + * start using the new checksum or compress algorithm a little early. + */ + dnp->dn_checksum = dn->dn_checksum; + dnp->dn_compress = dn->dn_compress; + + mutex_exit(&dn->dn_mtx); + + /* process all the "freed" ranges in the file */ + while (rp = avl_last(&dn->dn_ranges[txgoff])) { + dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); + /* grab the mutex so we don't race with dnode_block_freed() */ + mutex_enter(&dn->dn_mtx); + avl_remove(&dn->dn_ranges[txgoff], rp); + mutex_exit(&dn->dn_mtx); + kmem_free(rp, sizeof (free_range_t)); + } + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { + dnode_sync_free(dn, tx); + return; + } + + if (dn->dn_next_nlevels[txgoff]) { + dnode_increase_indirection(dn, tx); + dn->dn_next_nlevels[txgoff] = 0; + } + + dbuf_sync_list(list, tx); + + if (dn->dn_object != DMU_META_DNODE_OBJECT) { + ASSERT3P(list_head(list), ==, NULL); + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + } + + /* + * Although we have dropped our reference to the dnode, it + * can't be evicted until its written, and we haven't yet + * initiated the IO for the dnode's dbuf. + */ +} diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c new file mode 100644 index 000000000..93ea8aa11 --- /dev/null +++ b/module/zfs/dsl_dataset.c @@ -0,0 +1,3103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/unique.h> +#include <sys/zfs_context.h> +#include <sys/zfs_ioctl.h> +#include <sys/spa.h> +#include <sys/zfs_znode.h> +#include <sys/sunddi.h> + +static char *dsl_reaper = "the grim reaper"; + +static dsl_checkfunc_t dsl_dataset_destroy_begin_check; +static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; +static dsl_checkfunc_t dsl_dataset_rollback_check; +static dsl_syncfunc_t dsl_dataset_rollback_sync; +static dsl_syncfunc_t dsl_dataset_set_reservation_sync; + +#define DS_REF_MAX (1ULL << 62) + +#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE + +#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) + + +/* + * Figure out how much of this delta should be propogated to the dsl_dir + * layer. If there's a refreservation, that space has already been + * partially accounted for in our ancestors. + */ +static int64_t +parent_delta(dsl_dataset_t *ds, int64_t delta) +{ + uint64_t old_bytes, new_bytes; + + if (ds->ds_reserved == 0) + return (delta); + + old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + + ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); + return (new_bytes - old_bytes); +} + +void +dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + int64_t delta; + + dprintf_bp(bp, "born, ds=%p\n", ds); + + ASSERT(dmu_tx_is_syncing(tx)); + /* It could have been compressed away to nothing */ + if (BP_IS_HOLE(bp)) + return; + ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); + ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + if (ds == NULL) { + /* + * Account for the meta-objset space in its placeholder + * dsl_dir. + */ + ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, + used, compressed, uncompressed, tx); + dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + return; + } + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + delta = parent_delta(ds, used); + ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_compressed_bytes += compressed; + ds->ds_phys->ds_uncompressed_bytes += uncompressed; + ds->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, + compressed, uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); +} + +int +dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx) +{ + int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + ASSERT(pio != NULL); + ASSERT(dmu_tx_is_syncing(tx)); + /* No block pointer => nothing to free */ + if (BP_IS_HOLE(bp)) + return (0); + + ASSERT(used > 0); + if (ds == NULL) { + int err; + /* + * Account for the meta-objset space in its placeholder + * dataset. + */ + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); + ASSERT(err == 0); + + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, + -used, -compressed, -uncompressed, tx); + dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + return (used); + } + ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + + ASSERT(!dsl_dataset_is_snapshot(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { + int err; + int64_t delta; + + dprintf_bp(bp, "freeing: %s", ""); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); + ASSERT(err == 0); + + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_phys->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); + ds->ds_phys->ds_unique_bytes -= used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); + } else { + dprintf_bp(bp, "putting on dead list: %s", ""); + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); + /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + if (ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object && bp->blk_birth > + ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + ds->ds_prev->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_prev->ds_lock); + } + if (bp->blk_birth > ds->ds_origin_txg) { + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + } + mutex_enter(&ds->ds_lock); + ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); + ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); + ds->ds_phys->ds_compressed_bytes -= compressed; + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); + ds->ds_phys->ds_uncompressed_bytes -= uncompressed; + mutex_exit(&ds->ds_lock); + + return (used); +} + +uint64_t +dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) +{ + uint64_t trysnap = 0; + + if (ds == NULL) + return (0); + /* + * The snapshot creation could fail, but that would cause an + * incorrect FALSE return, which would only result in an + * overestimation of the amount of space that an operation would + * consume, which is OK. + * + * There's also a small window where we could miss a pending + * snapshot, because we could set the sync task in the quiescing + * phase. So this should only be used as a guess. + */ + if (ds->ds_trysnap_txg > + spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) + trysnap = ds->ds_trysnap_txg; + return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); +} + +int +dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) +{ + return (blk_birth > dsl_dataset_prev_snap_txg(ds)); +} + +/* ARGSUSED */ +static void +dsl_dataset_evict(dmu_buf_t *db, void *dsv) +{ + dsl_dataset_t *ds = dsv; + + ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); + + dprintf_ds(ds, "evicting %s\n", ""); + + unique_remove(ds->ds_fsid_guid); + + if (ds->ds_user_ptr != NULL) + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + + if (ds->ds_prev) { + dsl_dataset_drop_ref(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + bplist_close(&ds->ds_deadlist); + if (ds->ds_dir) + dsl_dir_close(ds->ds_dir, ds); + + ASSERT(!list_link_active(&ds->ds_synced_link)); + + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); + + kmem_free(ds, sizeof (dsl_dataset_t)); +} + +static int +dsl_dataset_get_snapname(dsl_dataset_t *ds) +{ + dsl_dataset_phys_t *headphys; + int err; + dmu_buf_t *headdbuf; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + if (ds->ds_snapname[0]) + return (0); + if (ds->ds_phys->ds_next_snap_obj == 0) + return (0); + + err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err) + return (err); + headphys = headdbuf->db_data; + err = zap_value_search(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); + dmu_buf_rele(headdbuf, FTAG); + return (err); +} + +static int +dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_lookup_norm(mos, snapobj, name, 8, 1, + value, mt, NULL, 0, NULL); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_lookup(mos, snapobj, name, 8, 1, value); + return (err); +} + +static int +dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_remove_norm(mos, snapobj, name, mt, tx); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_remove(mos, snapobj, name, tx); + return (err); +} + +static int +dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_t *ds; + int err; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + + err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); + if (err) + return (err); + ds = dmu_buf_get_user(dbuf); + if (ds == NULL) { + dsl_dataset_t *winner; + + ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + ds->ds_dbuf = dbuf; + ds->ds_object = dsobj; + ds->ds_phys = dbuf->db_data; + + mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, + NULL); + rw_init(&ds->ds_rwlock, 0, 0, 0); + cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); + + err = bplist_open(&ds->ds_deadlist, + mos, ds->ds_phys->ds_deadlist_obj); + if (err == 0) { + err = dsl_dir_open_obj(dp, + ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); + } + if (err) { + /* + * we don't really need to close the blist if we + * just opened it. + */ + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + + if (!dsl_dataset_is_snapshot(ds)) { + ds->ds_snapname[0] = '\0'; + if (ds->ds_phys->ds_prev_snap_obj) { + err = dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds, &ds->ds_prev); + } + + if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *origin; + + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, + FTAG, &origin); + if (err == 0) { + ds->ds_origin_txg = + origin->ds_phys->ds_creation_txg; + dsl_dataset_rele(origin, FTAG); + } + } + } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { + err = dsl_dataset_get_snapname(ds); + } + + if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + /* + * In sync context, we're called with either no lock + * or with the write lock. If we're not syncing, + * we're always called with the read lock held. + */ + boolean_t need_lock = + !RW_WRITE_HELD(&dp->dp_config_rwlock) && + dsl_pool_sync_context(dp); + + if (need_lock) + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = dsl_prop_get_ds(ds, + "refreservation", sizeof (uint64_t), 1, + &ds->ds_reserved, NULL); + if (err == 0) { + err = dsl_prop_get_ds(ds, + "refquota", sizeof (uint64_t), 1, + &ds->ds_quota, NULL); + } + + if (need_lock) + rw_exit(&dp->dp_config_rwlock); + } else { + ds->ds_reserved = ds->ds_quota = 0; + } + + if (err == 0) { + winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, + dsl_dataset_evict); + } + if (err || winner) { + bplist_close(&ds->ds_deadlist); + if (ds->ds_prev) + dsl_dataset_drop_ref(ds->ds_prev, ds); + dsl_dir_close(ds->ds_dir, ds); + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); + kmem_free(ds, sizeof (dsl_dataset_t)); + if (err) { + dmu_buf_rele(dbuf, tag); + return (err); + } + ds = winner; + } else { + ds->ds_fsid_guid = + unique_insert(ds->ds_phys->ds_fsid_guid); + } + } + ASSERT3P(ds->ds_dbuf, ==, dbuf); + ASSERT3P(ds->ds_phys, ==, dbuf->db_data); + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || + spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || + dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); + mutex_enter(&ds->ds_lock); + if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { + mutex_exit(&ds->ds_lock); + dmu_buf_rele(ds->ds_dbuf, tag); + return (ENOENT); + } + mutex_exit(&ds->ds_lock); + *dsp = ds; + return (0); +} + +static int +dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* + * In syncing context we don't want the rwlock lock: there + * may be an existing writer waiting for sync phase to + * finish. We don't need to worry about such writers, since + * sync phase is single-threaded, so the writer can't be + * doing anything while we are active. + */ + if (dsl_pool_sync_context(dp)) { + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + return (0); + } + + /* + * Normal users will hold the ds_rwlock as a READER until they + * are finished (i.e., call dsl_dataset_rele()). "Owners" will + * drop their READER lock after they set the ds_owner field. + * + * If the dataset is being destroyed, the destroy thread will + * obtain a WRITER lock for exclusive access after it's done its + * open-context work and then change the ds_owner to + * dsl_reaper once destruction is assured. So threads + * may block here temporarily, until the "destructability" of + * the dataset is determined. + */ + ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); + mutex_enter(&ds->ds_lock); + while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { + rw_exit(&dp->dp_config_rwlock); + cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); + if (DSL_DATASET_IS_DESTROYED(ds)) { + mutex_exit(&ds->ds_lock); + dsl_dataset_drop_ref(ds, tag); + rw_enter(&dp->dp_config_rwlock, RW_READER); + return (ENOENT); + } + rw_enter(&dp->dp_config_rwlock, RW_READER); + } + mutex_exit(&ds->ds_lock); + return (0); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); + + if (err) + return (err); + return (dsl_dataset_hold_ref(*dsp, tag)); +} + +int +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); + + ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); + + if (err) + return (err); + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); +} + +int +dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + const char *snapname; + uint64_t obj; + int err = 0; + + err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); + if (err) + return (err); + + dp = dd->dd_pool; + obj = dd->dd_phys->dd_head_dataset_obj; + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (obj) + err = dsl_dataset_get_ref(dp, obj, tag, dsp); + else + err = ENOENT; + if (err) + goto out; + + err = dsl_dataset_hold_ref(*dsp, tag); + + /* we may be looking for a snapshot */ + if (err == 0 && snapname != NULL) { + dsl_dataset_t *ds = NULL; + + if (*snapname++ != '@') { + dsl_dataset_rele(*dsp, tag); + err = ENOENT; + goto out; + } + + dprintf("looking for snapshot '%s'\n", snapname); + err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + if (err == 0) + err = dsl_dataset_get_ref(dp, obj, tag, &ds); + dsl_dataset_rele(*dsp, tag); + + ASSERT3U((err == 0), ==, (ds != NULL)); + + if (ds) { + mutex_enter(&ds->ds_lock); + if (ds->ds_snapname[0] == 0) + (void) strlcpy(ds->ds_snapname, snapname, + sizeof (ds->ds_snapname)); + mutex_exit(&ds->ds_lock); + err = dsl_dataset_hold_ref(ds, tag); + *dsp = err ? NULL : ds; + } + } +out: + rw_exit(&dp->dp_config_rwlock); + dsl_dir_close(dd, FTAG); + return (err); +} + +int +dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold(name, owner, dsp); + if (err) + return (err); + if ((*dsp)->ds_phys->ds_num_children > 0 && + !DS_MODE_IS_READONLY(flags)) { + dsl_dataset_rele(*dsp, owner); + return (EROFS); + } + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); +} + +void +dsl_dataset_name(dsl_dataset_t *ds, char *name) +{ + if (ds == NULL) { + (void) strcpy(name, "mos"); + } else { + dsl_dir_name(ds->ds_dir, name); + VERIFY(0 == dsl_dataset_get_snapname(ds)); + if (ds->ds_snapname[0]) { + (void) strcat(name, "@"); + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ + if (!MUTEX_HELD(&ds->ds_lock)) { + mutex_enter(&ds->ds_lock); + (void) strcat(name, ds->ds_snapname); + mutex_exit(&ds->ds_lock); + } else { + (void) strcat(name, ds->ds_snapname); + } + } + } +} + +static int +dsl_dataset_namelen(dsl_dataset_t *ds) +{ + int result; + + if (ds == NULL) { + result = 3; /* "mos" */ + } else { + result = dsl_dir_namelen(ds->ds_dir); + VERIFY(0 == dsl_dataset_get_snapname(ds)); + if (ds->ds_snapname[0]) { + ++result; /* adding one for the @-sign */ + if (!MUTEX_HELD(&ds->ds_lock)) { + mutex_enter(&ds->ds_lock); + result += strlen(ds->ds_snapname); + mutex_exit(&ds->ds_lock); + } else { + result += strlen(ds->ds_snapname); + } + } + } + + return (result); +} + +void +dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) +{ + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +{ + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { + rw_exit(&ds->ds_rwlock); + } + dsl_dataset_drop_ref(ds, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +{ + ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + + mutex_enter(&ds->ds_lock); + ds->ds_owner = NULL; + if (RW_WRITE_HELD(&ds->ds_rwlock)) { + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + } + mutex_exit(&ds->ds_lock); + if (ds->ds_dbuf) + dsl_dataset_drop_ref(ds, owner); + else + dsl_dataset_evict(ds->ds_dbuf, ds); +} + +boolean_t +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +{ + boolean_t gotit = FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_owner == NULL && + (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + ds->ds_owner = owner; + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) + rw_exit(&ds->ds_rwlock); + gotit = TRUE; + } + mutex_exit(&ds->ds_lock); + return (gotit); +} + +void +dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) +{ + ASSERT3P(owner, ==, ds->ds_owner); + if (!RW_WRITE_HELD(&ds->ds_rwlock)) + rw_enter(&ds->ds_rwlock, RW_WRITER); +} + +uint64_t +dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj; + objset_t *mos = dp->dp_meta_objset; + + if (origin == NULL) + origin = dp->dp_origin_snap; + + ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); + ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_flags = flags; + dsphys->ds_fsid_guid = unique_create(); + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_snapnames_zapobj = + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, + DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; + dsphys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + + if (origin) { + dsphys->ds_prev_snap_obj = origin->ds_object; + dsphys->ds_prev_snap_txg = + origin->ds_phys->ds_creation_txg; + dsphys->ds_used_bytes = + origin->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = + origin->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + origin->ds_phys->ds_uncompressed_bytes; + dsphys->ds_bp = origin->ds_phys->ds_bp; + dsphys->ds_flags |= origin->ds_phys->ds_flags; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + origin->ds_phys->ds_num_children++; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { + if (origin->ds_phys->ds_next_clones_obj == 0) { + origin->ds_phys->ds_next_clones_obj = + zap_create(mos, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(mos, + origin->ds_phys->ds_next_clones_obj, + dsobj, tx)); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_origin_obj = origin->ds_object; + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + dmu_buf_rele(dbuf, FTAG); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_head_dataset_obj = dsobj; + + return (dsobj); +} + +uint64_t +dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = pdd->dd_pool; + uint64_t dsobj, ddobj; + dsl_dir_t *dd; + + ASSERT(lastname[0] != '@'); + + ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); + VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); + + dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); + + dsl_deleg_set_create_perms(dd, tx, cr); + + dsl_dir_close(dd, FTAG); + + return (dsobj); +} + +struct destroyarg { + dsl_sync_task_group_t *dstg; + char *snapname; + char *failed; +}; + +static int +dsl_snapshot_destroy_one(char *name, void *arg) +{ + struct destroyarg *da = arg; + dsl_dataset_t *ds; + char *cp; + int err; + + (void) strcat(name, "@"); + (void) strcat(name, da->snapname); + err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, + da->dstg, &ds); + cp = strchr(name, '@'); + *cp = '\0'; + if (err == 0) { + dsl_dataset_make_exclusive(ds, da->dstg); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, da->dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { + (void) strcpy(da->failed, name); + } + return (err); +} + +/* + * Destroy 'snapname' in all descendants of 'fsname'. + */ +#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy +int +dsl_snapshots_destroy(char *fsname, char *snapname) +{ + int err; + struct destroyarg da; + dsl_sync_task_t *dst; + spa_t *spa; + + err = spa_open(fsname, &spa, FTAG); + if (err) + return (err); + da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + da.snapname = snapname; + da.failed = fsname; + + err = dmu_objset_find(fsname, + dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); + + if (err == 0) + err = dsl_sync_task_group_wait(da.dstg); + + for (dst = list_head(&da.dstg->dstg_tasks); dst; + dst = list_next(&da.dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + /* + * Return the file system name that triggered the error + */ + if (dst->dst_err) { + dsl_dataset_name(ds, fsname); + *strchr(fsname, '@') = '\0'; + } + dsl_dataset_disown(ds, da.dstg); + } + + dsl_sync_task_group_destroy(da.dstg); + spa_close(spa, FTAG); + return (err); +} + +/* + * ds must be opened as OWNER. On return (whether successful or not), + * ds will be closed and caller can no longer dereference it. + */ +int +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) +{ + int err; + dsl_sync_task_group_t *dstg; + objset_t *os; + dsl_dir_t *dd; + uint64_t obj; + + if (dsl_dataset_is_snapshot(ds)) { + /* Destroying a snapshot is simpler */ + dsl_dataset_make_exclusive(ds, tag); + + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_destroy_check, dsl_dataset_destroy_sync, + ds, tag, 0); + goto out; + } + + dd = ds->ds_dir; + + /* + * Check for errors and mark this ds as inconsistent, in + * case we crash while freeing the objects. + */ + err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, + dsl_dataset_destroy_begin_sync, ds, NULL, 0); + if (err) + goto out; + + err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + if (err) + goto out; + + /* + * remove the objects in open context, so that we won't + * have too much to do in syncing context. + */ + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, + ds->ds_phys->ds_prev_snap_txg)) { + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); + } + + dmu_objset_close(os); + if (err != ESRCH) + goto out; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); + rw_exit(&dd->dd_pool->dp_config_rwlock); + + if (err) + goto out; + + if (ds->ds_user_ptr) { + /* + * We need to sync out all in-flight IO before we try + * to evict (the dataset evict func is trying to clear + * the cached entries for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); + } + + /* + * Blow away the dsl_dir + head dataset. + */ + dsl_dataset_make_exclusive(ds, tag); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, tag, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, dd, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ + if (err) + dsl_dir_close(dd, FTAG); +out: + dsl_dataset_disown(ds, tag); + return (err); +} + +int +dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) +{ + int err; + + ASSERT(ds->ds_owner); + + dsl_dataset_make_exclusive(ds, ds->ds_owner); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_rollback_check, dsl_dataset_rollback_sync, + ds, &ost, 0); + /* drop exclusive access */ + mutex_enter(&ds->ds_lock); + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + return (err); +} + +void * +dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func) +{ + void *old; + + mutex_enter(&ds->ds_lock); + old = ds->ds_user_ptr; + if (old == NULL) { + ds->ds_user_ptr = p; + ds->ds_user_evict_func = func; + } + mutex_exit(&ds->ds_lock); + return (old); +} + +void * +dsl_dataset_get_user_ptr(dsl_dataset_t *ds) +{ + return (ds->ds_user_ptr); +} + + +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) +{ + return (&ds->ds_phys->ds_bp); +} + +void +dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + /* If it's the meta-objset, set dp_meta_rootbp */ + if (ds == NULL) { + tx->tx_pool->dp_meta_rootbp = *bp; + } else { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_bp = *bp; + } +} + +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) +{ + return (ds->ds_dir->dd_pool->dp_spa); +} + +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp; + + if (ds == NULL) /* this is the meta-objset */ + return; + + ASSERT(ds->ds_user_ptr != NULL); + + if (ds->ds_phys->ds_next_snap_obj != 0) + panic("dirtying snapshot!"); + + dp = ds->ds_dir->dd_pool; + + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + } +} + +/* + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. + */ +static void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + + if (ds->ds_phys->ds_prev_snap_obj != 0) + mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + else + mrs_used = 0; + + VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, + &dluncomp)); + + ASSERT3U(dlused, <=, mrs_used); + ds->ds_phys->ds_unique_bytes = + ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + + if (!DS_UNIQUE_IS_ACCURATE(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +static uint64_t +dsl_dataset_unique(dsl_dataset_t *ds) +{ + if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) + dsl_dataset_recalc_head_uniq(ds); + + return (ds->ds_phys->ds_unique_bytes); +} + +struct killarg { + dsl_dataset_t *ds; + zio_t *zio; + dmu_tx_t *tx; +}; + +/* ARGSUSED */ +static int +kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) +{ + struct killarg *ka = arg; + + if (bp == NULL) + return (0); + + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + + return (0); +} + +/* ARGSUSED */ +static int +dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; + + /* + * We can only roll back to emptyness if it is a ZPL objset. + */ + if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) + return (EINVAL); + + /* + * This must not be a snapshot. + */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* + * If we made changes this txg, traverse_dataset won't find + * them. Try again. + */ + if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) + return (EAGAIN); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + /* + * Before the roll back destroy the zil. + */ + if (ds->ds_user_ptr != NULL) { + zil_rollback_destroy( + ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); + + /* + * We need to make sure that the objset_impl_t is reopened after + * we do the rollback, otherwise it will have the wrong + * objset_phys_t. Normally this would happen when this + * dataset-open is closed, thus causing the + * dataset to be immediately evicted. But when doing "zfs recv + * -F", we reopen the objset before that, so that there is no + * window where the dataset is closed and inconsistent. + */ + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + + /* Transfer space that was freed since last snap back to the head. */ + { + uint64_t used; + + VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, + ds->ds_origin_txg, UINT64_MAX, &used)); + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_SNAP, DD_USED_HEAD, tx); + } + + /* Zero out the deadlist. */ + bplist_close(&ds->ds_deadlist); + bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + ds->ds_phys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); + + { + /* Free blkptrs that we gave birth to */ + zio_t *zio; + struct killarg ka; + + zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + ka.ds = ds; + ka.zio = zio; + ka.tx = tx; + (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + TRAVERSE_POST, kill_blkptr, &ka); + (void) zio_wait(zio); + } + + ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || + ds->ds_phys->ds_unique_bytes == 0); + + if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { + /* Change our contents to that of the prev snapshot */ + + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT3U(ds->ds_phys->ds_used_bytes, <=, + ds->ds_prev->ds_phys->ds_used_bytes); + + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; + ds->ds_phys->ds_used_bytes = + ds->ds_prev->ds_phys->ds_used_bytes; + ds->ds_phys->ds_compressed_bytes = + ds->ds_prev->ds_phys->ds_compressed_bytes; + ds->ds_phys->ds_uncompressed_bytes = + ds->ds_prev->ds_phys->ds_uncompressed_bytes; + ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; + + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_unique_bytes = 0; + } + } else { + objset_impl_t *osi; + + ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); + + bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); + ds->ds_phys->ds_flags = 0; + ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, + &ds->ds_phys->ds_bp, *ost, tx); +#ifdef _KERNEL + zfs_create_fs(&osi->os, kcred, NULL, tx); +#endif + } + + spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "dataset = %llu", ds->ds_object); +} + +/* ARGSUSED */ +static int +dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + return (EINVAL); + + /* + * This is really a dsl_dir thing, but check it here so that + * we'll be less likely to leave this dataset inconsistent & + * nearly destroyed. + */ + err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + if (err) + return (err); + if (count != 0) + return (EEXIST); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* Mark it as inconsistent on-disk, in case we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); +} + +/* ARGSUSED */ +int +dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + /* we have an owner hold, so noone else can destroy us */ + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + + /* Can't delete a branch point. */ + if (ds->ds_phys->ds_num_children > 1) + return (EEXIST); + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + return (EINVAL); + + /* + * If we made changes this txg, traverse_dsl_dataset won't find + * them. Try again. + */ + if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) + return (EAGAIN); + + /* XXX we should do some i/o error checking... */ + return (0); +} + +struct refsarg { + kmutex_t lock; + boolean_t gone; + kcondvar_t cv; +}; + +/* ARGSUSED */ +static void +dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +{ + struct refsarg *arg = argv; + + mutex_enter(&arg->lock); + arg->gone = TRUE; + cv_signal(&arg->cv); + mutex_exit(&arg->lock); +} + +static void +dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) +{ + struct refsarg arg; + + mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); + arg.gone = FALSE; + (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, + dsl_dataset_refs_gone); + dmu_buf_rele(ds->ds_dbuf, tag); + mutex_enter(&arg.lock); + while (!arg.gone) + cv_wait(&arg.cv, &arg.lock); + ASSERT(arg.gone); + mutex_exit(&arg.lock); + ds->ds_dbuf = NULL; + ds->ds_phys = NULL; + mutex_destroy(&arg.lock); + cv_destroy(&arg.cv); +} + +void +dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + zio_t *zio; + int err; + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj; + + ASSERT(ds->ds_owner); + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); + ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + + /* signal any waiters that this dataset is going away */ + mutex_enter(&ds->ds_lock); + ds->ds_owner = dsl_reaper; + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + uint64_t val = 0; + dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + ASSERT3U(ds->ds_reserved, ==, 0); + } + + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + + dsl_pool_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (ds->ds_prev) { + ds_prev = ds->ds_prev; + } else { + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); + } + after_branch_point = + (ds_prev->ds_phys->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + ds_prev->ds_phys->ds_next_clones_obj != 0) { + VERIFY(0 == zap_remove_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(0 == zap_add_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, + ds->ds_phys->ds_next_snap_obj, tx)); + } + } + if (after_branch_point && + ds->ds_phys->ds_next_snap_obj == 0) { + /* This clone is toast. */ + ASSERT(ds_prev->ds_phys->ds_num_children > 1); + ds_prev->ds_phys->ds_num_children--; + } else if (!after_branch_point) { + ds_prev->ds_phys->ds_next_snap_obj = + ds->ds_phys->ds_next_snap_obj; + } + } + + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + + if (ds->ds_phys->ds_next_snap_obj != 0) { + blkptr_t bp; + dsl_dataset_t *ds_next; + uint64_t itor = 0; + uint64_t old_unique; + int64_t used = 0, compressed = 0, uncompressed = 0; + + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + + old_unique = dsl_dataset_unique(ds_next); + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + ds_next->ds_phys->ds_prev_snap_obj = + ds->ds_phys->ds_prev_snap_obj; + ds_next->ds_phys->ds_prev_snap_txg = + ds->ds_phys->ds_prev_snap_txg; + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); + + /* + * Transfer to our deadlist (which will become next's + * new deadlist) any entries from next's current + * deadlist which were born before prev, and free the + * other entries. + * + * XXX we're doing this long task with the config lock held + */ + while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { + if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, + &bp, tx)); + if (ds_prev && !after_branch_point && + bp.blk_birth > + ds_prev->ds_phys->ds_prev_snap_txg) { + ds_prev->ds_phys->ds_unique_bytes += + bp_get_dasize(dp->dp_spa, &bp); + } + } else { + used += bp_get_dasize(dp->dp_spa, &bp); + compressed += BP_GET_PSIZE(&bp); + uncompressed += BP_GET_UCSIZE(&bp); + /* XXX check return value? */ + (void) dsl_free(zio, dp, tx->tx_txg, + &bp, NULL, NULL, ARC_NOWAIT); + } + } + + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -compressed, -uncompressed, tx); + + /* free next's deadlist */ + bplist_close(&ds_next->ds_deadlist); + bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); + + /* set next's deadlist to our deadlist */ + bplist_close(&ds->ds_deadlist); + ds_next->ds_phys->ds_deadlist_obj = + ds->ds_phys->ds_deadlist_obj; + VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj)); + ds->ds_phys->ds_deadlist_obj = 0; + + if (ds_next->ds_phys->ds_next_snap_obj != 0) { + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + * + * XXX we're doing this long task with the + * config lock held + */ + dsl_dataset_t *ds_after_next; + uint64_t space; + + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, + FTAG, &ds_after_next)); + + VERIFY(0 == + bplist_space_birthrange(&ds_after_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, &space)); + ds_next->ds_phys->ds_unique_bytes += space; + + dsl_dataset_rele(ds_after_next, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + } else { + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; + if (ds_prev) { + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); + } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsmed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); + } else { + /* + * There's no next snapshot, so this is a head dataset. + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty. (If it's a clone, it's + * safe to ignore the deadlist contents.) + */ + struct killarg ka; + + ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); + bplist_close(&ds->ds_deadlist); + bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.zio = zio; + ka.tx = tx; + err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + TRAVERSE_POST, kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ds->ds_phys->ds_unique_bytes == 0); + } + + err = zio_wait(zio); + ASSERT3U(err, ==, 0); + + if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { + /* Erase the link in the dir */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; + ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); + err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); + ASSERT(err == 0); + } else { + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); + VERIFY(0 == dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); + ASSERT3U(err, ==, 0); + ASSERT3U(val, ==, obj); + } +#endif + err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); + ASSERT(err == 0); + dsl_dataset_rele(ds_head, FTAG); + } + + if (ds_prev && ds->ds_prev != ds_prev) + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT(0 == zap_count(mos, + ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + VERIFY(0 == dmu_object_free(mos, + ds->ds_phys->ds_next_clones_obj, tx)); + } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + dsl_dir_close(ds->ds_dir, ds); + ds->ds_dir = NULL; + dsl_dataset_drain_refs(ds, tag); + VERIFY(0 == dmu_object_free(mos, obj, tx)); +} + +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + return (ENOSPC); + + /* + * Propogate any reserved space for this snapshot to other + * snapshot checks in this sync group. + */ + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); + + return (0); +} + +/* ARGSUSED */ +int +dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + const char *snapname = arg2; + int err; + uint64_t value; + + /* + * We don't allow multiple snapshots of the same txg. If there + * is already one, try again. + */ + if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) + return (EAGAIN); + + /* + * Check for conflicting name snapshot name. + */ + err = dsl_dataset_snap_lookup(ds, snapname, &value); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + /* + * Check that the dataset's name is not too long. Name consists + * of the dataset's length + 1 for the @-sign + snapshot name's length + */ + if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) + return (ENAMETOOLONG); + + err = dsl_dataset_snapshot_reserve_space(ds, tx); + if (err) + return (err); + + ds->ds_trysnap_txg = tx->tx_txg; + return (0); +} + +void +dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + const char *snapname = arg2; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj, crtxg; + objset_t *mos = dp->dp_meta_objset; + int err; + + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* + * The origin's ds_creation_txg has to be < TXG_INITIAL + */ + if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) + crtxg = 1; + else + crtxg = tx->tx_txg; + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); + dsphys->ds_dir_obj = ds->ds_dir->dd_object; + dsphys->ds_fsid_guid = unique_create(); + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + dsphys->ds_next_snap_obj = ds->ds_object; + dsphys->ds_num_children = 1; + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = crtxg; + dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; + dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; + dsphys->ds_flags = ds->ds_phys->ds_flags; + dsphys->ds_bp = ds->ds_phys->ds_bp; + dmu_buf_rele(dbuf, FTAG); + + ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); + if (ds->ds_prev) { + uint64_t next_clones_obj = + ds->ds_prev->ds_phys->ds_next_clones_obj; + ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object || + ds->ds_prev->ds_phys->ds_num_children > 1); + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds->ds_prev->ds_phys->ds_creation_txg); + ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + } else if (next_clones_obj != 0) { + VERIFY3U(0, ==, zap_remove_int(mos, + next_clones_obj, dsphys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(mos, + next_clones_obj, dsobj, tx)); + } + } + + /* + * If we have a reference-reservation on this dataset, we will + * need to increase the amount of refreservation being charged + * since our unique space is going to zero. + */ + if (ds->ds_reserved) { + int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, + add, 0, 0, tx); + } + + bplist_close(&ds->ds_deadlist); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); + ds->ds_phys->ds_prev_snap_obj = dsobj; + ds->ds_phys->ds_prev_snap_txg = crtxg; + ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + ds->ds_phys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); + + dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); + err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx); + ASSERT(err == 0); + + if (ds->ds_prev) + dsl_dataset_drop_ref(ds->ds_prev, ds); + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + + dsl_pool_ds_snapshotted(ds, tx); + + spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + "dataset = %llu", dsobj); +} + +void +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + + dsl_dir_dirty(ds->ds_dir, tx); + dmu_objset_sync(ds->ds_user_ptr, zio, tx); +} + +void +dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t refd, avail, uobjs, aobjs; + + dsl_dir_stats(ds->ds_dir, nv); + + dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, + ds->ds_phys->ds_creation_time); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, + ds->ds_phys->ds_creation_txg); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + ds->ds_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + ds->ds_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, + ds->ds_phys->ds_guid); + + if (ds->ds_phys->ds_next_snap_obj) { + /* + * This is a snapshot; override the dd's space used with + * our unique space and compression ratio. + */ + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + ds->ds_phys->ds_unique_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, + ds->ds_phys->ds_compressed_bytes == 0 ? 100 : + (ds->ds_phys->ds_uncompressed_bytes * 100 / + ds->ds_phys->ds_compressed_bytes)); + } +} + +void +dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) +{ + stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; + stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; + stat->dds_guid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_next_snap_obj) { + stat->dds_is_snapshot = B_TRUE; + stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } + + /* clone origin is really a dsl_dir thing... */ + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *ods; + + VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); + dsl_dataset_name(ods, stat->dds_origin); + dsl_dataset_drop_ref(ods, FTAG); + } + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); +} + +uint64_t +dsl_dataset_fsid_guid(dsl_dataset_t *ds) +{ + return (ds->ds_fsid_guid); +} + +void +dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + *refdbytesp = ds->ds_phys->ds_used_bytes; + *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) + *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } + *usedobjsp = ds->ds_phys->ds_bp.blk_fill; + *availobjsp = DN_MAX_OBJECT - *usedobjsp; +} + +boolean_t +dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + if (ds->ds_prev == NULL) + return (B_FALSE); + if (ds->ds_phys->ds_bp.blk_birth > + ds->ds_prev->ds_phys->ds_creation_txg) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *newsnapname = arg2; + dsl_dir_t *dd = ds->ds_dir; + dsl_dataset_t *hds; + uint64_t val; + int err; + + err = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); + if (err) + return (err); + + /* new name better not be in use */ + err = dsl_dataset_snap_lookup(hds, newsnapname, &val); + dsl_dataset_rele(hds, FTAG); + + if (err == 0) + err = EEXIST; + else if (err == ENOENT) + err = 0; + + /* dataset name + 1 for the "@" + the new snapshot name must fit */ + if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) + err = ENAMETOOLONG; + + return (err); +} + +static void +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + const char *newsnapname = arg2; + dsl_dir_t *dd = ds->ds_dir; + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_t *hds; + int err; + + ASSERT(ds->ds_phys->ds_next_snap_obj != 0); + + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + + VERIFY(0 == dsl_dataset_get_snapname(ds)); + err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); + ASSERT3U(err, ==, 0); + mutex_enter(&ds->ds_lock); + (void) strcpy(ds->ds_snapname, newsnapname); + mutex_exit(&ds->ds_lock); + err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx); + ASSERT3U(err, ==, 0); + + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + dsl_dataset_rele(hds, FTAG); +} + +struct renamesnaparg { + dsl_sync_task_group_t *dstg; + char failed[MAXPATHLEN]; + char *oldsnap; + char *newsnap; +}; + +static int +dsl_snapshot_rename_one(char *name, void *arg) +{ + struct renamesnaparg *ra = arg; + dsl_dataset_t *ds = NULL; + char *cp; + int err; + + cp = name + strlen(name); + *cp = '@'; + (void) strcpy(cp + 1, ra->oldsnap); + + /* + * For recursive snapshot renames the parent won't be changing + * so we just pass name for both the to/from argument. + */ + err = zfs_secpolicy_rename_perms(name, name, CRED()); + if (err == ENOENT) { + return (0); + } else if (err) { + (void) strcpy(ra->failed, name); + return (err); + } + +#ifdef _KERNEL + /* + * For all filesystems undergoing rename, we'll need to unmount it. + */ + (void) zfs_unmount_snap(name, NULL); +#endif + err = dsl_dataset_hold(name, ra->dstg, &ds); + *cp = '\0'; + if (err == ENOENT) { + return (0); + } else if (err) { + (void) strcpy(ra->failed, name); + return (err); + } + + dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, + dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); + + return (0); +} + +static int +dsl_recursive_rename(char *oldname, const char *newname) +{ + int err; + struct renamesnaparg *ra; + dsl_sync_task_t *dst; + spa_t *spa; + char *cp, *fsname = spa_strdup(oldname); + int len = strlen(oldname); + + /* truncate the snapshot name to get the fsname */ + cp = strchr(fsname, '@'); + *cp = '\0'; + + err = spa_open(fsname, &spa, FTAG); + if (err) { + kmem_free(fsname, len + 1); + return (err); + } + ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); + ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + + ra->oldsnap = strchr(oldname, '@') + 1; + ra->newsnap = strchr(newname, '@') + 1; + *ra->failed = '\0'; + + err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, + DS_FIND_CHILDREN); + kmem_free(fsname, len + 1); + + if (err == 0) { + err = dsl_sync_task_group_wait(ra->dstg); + } + + for (dst = list_head(&ra->dstg->dstg_tasks); dst; + dst = list_next(&ra->dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + if (dst->dst_err) { + dsl_dir_name(ds->ds_dir, ra->failed); + (void) strcat(ra->failed, "@"); + (void) strcat(ra->failed, ra->newsnap); + } + dsl_dataset_rele(ds, ra->dstg); + } + + if (err) + (void) strcpy(oldname, ra->failed); + + dsl_sync_task_group_destroy(ra->dstg); + kmem_free(ra, sizeof (struct renamesnaparg)); + spa_close(spa, FTAG); + return (err); +} + +static int +dsl_valid_rename(char *oldname, void *arg) +{ + int delta = *(int *)arg; + + if (strlen(oldname) + delta >= MAXNAMELEN) + return (ENAMETOOLONG); + + return (0); +} + +#pragma weak dmu_objset_rename = dsl_dataset_rename +int +dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) +{ + dsl_dir_t *dd; + dsl_dataset_t *ds; + const char *tail; + int err; + + err = dsl_dir_open(oldname, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + int delta = strlen(newname) - strlen(oldname); + + /* if we're growing, validate child name lengths */ + if (delta > 0) + err = dmu_objset_find(oldname, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + + if (!err) + err = dsl_dir_rename(dd, newname); + dsl_dir_close(dd, FTAG); + return (err); + } + if (tail[0] != '@') { + /* the name ended in a nonexistant component */ + dsl_dir_close(dd, FTAG); + return (ENOENT); + } + + dsl_dir_close(dd, FTAG); + + /* new name must be snapshot in same filesystem */ + tail = strchr(newname, '@'); + if (tail == NULL) + return (EINVAL); + tail++; + if (strncmp(oldname, newname, tail - newname) != 0) + return (EXDEV); + + if (recursive) { + err = dsl_recursive_rename(oldname, newname); + } else { + err = dsl_dataset_hold(oldname, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_snapshot_rename_check, + dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); + + dsl_dataset_rele(ds, FTAG); + } + + return (err); +} + +struct promotenode { + list_node_t link; + dsl_dataset_t *ds; +}; + +struct promotearg { + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin, *origin_head; + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; +}; + +static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); + +/* ARGSUSED */ +static int +dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *hds = arg1; + struct promotearg *pa = arg2; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; + int err; + + /* Check that it is a real clone */ + if (!dsl_dir_is_clone(hds->ds_dir)) + return (EINVAL); + + /* Since this is so expensive, don't do the preliminary check */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) + return (EXDEV); + + /* compute origin's new unique space */ + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + err = bplist_space_birthrange(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); + if (err) + return (err); + + /* + * Walk the snapshots that we are moving + * + * Compute space to transfer. Consider the incremental changes + * to used for each snapshot: + * (my used) = (prev's used) + (blocks born) - (blocks killed) + * So each snapshot gave birth to: + * (blocks born) = (my used) - (prev's used) + (blocks killed) + * So a sequence would look like: + * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) + * Which simplifies to: + * uN + kN + kN-1 + ... + k1 + k0 + * Note however, if we stop before we reach the ORIGIN we get: + * uN + kN + kN-1 + ... + kM - uM-1 + */ + pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->comp = origin_ds->ds_phys->ds_compressed_bytes; + pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { + uint64_t val, dlused, dlcomp, dluncomp; + dsl_dataset_t *ds = snap->ds; + + /* Check that the snapshot name does not conflict */ + VERIFY(0 == dsl_dataset_get_snapname(ds)); + err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + /* The very first snapshot does not have a deadlist */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + continue; + + if (err = bplist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp)) + return (err); + pa->used += dlused; + pa->comp += dlcomp; + pa->uncomp += dluncomp; + } + + /* + * If we are a clone of a clone then we never reached ORIGIN, + * so we need to subtract out the clone origin's used space. + */ + if (pa->origin_origin) { + pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; + pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; + } + + /* Check that there is enough space here */ + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + pa->used); + if (err) + return (err); + + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> bplist_space_birthrange() + * calls will be fast because they do not have to + * iterate over all bps. + */ + snap = list_head(&pa->origin_snaps); + err = snaplist_space(&pa->shared_snaps, + snap->ds->ds_origin_txg, &pa->cloneusedsnap); + if (err) + return (err); + + err = snaplist_space(&pa->clone_snaps, + snap->ds->ds_origin_txg, &space); + if (err) + return (err); + pa->cloneusedsnap += space; + } + if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&pa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); + if (err) + return (err); + } + + return (0); +} + +static void +dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *hds = arg1; + struct promotearg *pa = arg2; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_t *origin_head; + dsl_dir_t *dd = hds->ds_dir; + dsl_pool_t *dp = hds->ds_dir->dd_pool; + dsl_dir_t *odd = NULL; + uint64_t oldnext_obj; + int64_t delta; + + ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); + + snap = list_head(&pa->origin_snaps); + origin_head = snap->ds; + + /* + * We need to explicitly open odd, since origin_ds's dd will be + * changing. + */ + VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + NULL, FTAG, &odd)); + + /* change origin's next snap */ + dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); + oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; + + /* change the origin's next clone */ + if (origin_ds->ds_phys->ds_next_clones_obj) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + origin_ds->ds_phys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + oldnext_obj, tx)); + } + + /* change origin */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); + dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; + hds->ds_origin_txg = origin_head->ds_origin_txg; + dmu_buf_will_dirty(odd->dd_dbuf, tx); + odd->dd_phys->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + + /* move snapshots to this dir */ + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { + dsl_dataset_t *ds = snap->ds; + + /* unregister props as dsl_dir is changing */ + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + /* move snap name entry */ + VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY(0 == dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, + hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, + 8, 1, &ds->ds_object, tx)); + /* change containing dsl_dir */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); + ds->ds_phys->ds_dir_obj = dd->dd_object; + ASSERT3P(ds->ds_dir, ==, odd); + dsl_dir_close(ds->ds_dir, ds); + VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, + NULL, ds, &ds->ds_dir)); + + ASSERT3U(dsl_prop_numcb(ds), ==, 0); + } + + /* + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. + */ + + delta = pa->cloneusedsnap - + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(pa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + pa->used - delta, pa->comp, pa->uncomp, tx); + + delta = pa->originusedsnap - + odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(pa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -pa->used - delta, -pa->comp, -pa->uncomp, tx); + + origin_ds->ds_phys->ds_unique_bytes = pa->unique; + + /* log history record */ + spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", hds->ds_object); + + dsl_dir_close(odd, FTAG); +} + +static char *snaplist_tag = "snaplist"; +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. + */ +static int +snaplist_make(dsl_pool_t *dp, boolean_t own, + uint64_t first_obj, uint64_t last_obj, list_t *l) +{ + uint64_t obj = last_obj; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); + + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; + + if (own) { + err = dsl_dataset_own_obj(dp, obj, + 0, snaplist_tag, &ds); + if (err == 0) + dsl_dataset_make_exclusive(ds, snaplist_tag); + } else { + err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); + } + if (err == ENOENT) { + /* lost race with snapshot destroy */ + struct promotenode *last = list_tail(l); + ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); + obj = last->ds->ds_phys->ds_prev_snap_obj; + continue; + } else if (err) { + return (err); + } + + if (first_obj == 0) + first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + + snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = ds->ds_phys->ds_prev_snap_obj; + } + + return (0); +} + +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; + + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used; + int err = bplist_space_birthrange(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used); + if (err) + return (err); + *spacep += used; + } + return (0); +} + +static void +snaplist_destroy(list_t *l, boolean_t own) +{ + struct promotenode *snap; + + if (!list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + if (own) + dsl_dataset_disown(snap->ds, snaplist_tag); + else + dsl_dataset_rele(snap->ds, snaplist_tag); + kmem_free(snap, sizeof (struct promotenode)); + } + list_destroy(l); +} + +/* + * Promote a clone. Nomenclature note: + * "clone" or "cds": the original clone which is being promoted + * "origin" or "ods": the snapshot which is originally clone's origin + * "origin head" or "ohds": the dataset which is the head + * (filesystem/volume) for the origin + * "origin origin": the origin of the origin's filesystem (typically + * NULL, indicating that the clone is not a clone of a clone). + */ +int +dsl_dataset_promote(const char *name) +{ + dsl_dataset_t *ds; + dsl_dir_t *dd; + dsl_pool_t *dp; + dmu_object_info_t doi; + struct promotearg pa = { 0 }; + struct promotenode *snap; + int err; + + err = dsl_dataset_hold(name, FTAG, &ds); + if (err) + return (err); + dd = ds->ds_dir; + dp = dd->dd_pool; + + err = dmu_object_info(dp->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, &doi); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* + * We are going to inherit all the snapshots taken before our + * origin (i.e., our new origin will be our parent's origin). + * Take ownership of them so that we can rename them into our + * namespace. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, + &pa.shared_snaps); + if (err != 0) + goto out; + + err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); + if (err != 0) + goto out; + + snap = list_head(&pa.shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); + err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); + if (err != 0) + goto out; + + if (dsl_dir_is_clone(snap->ds->ds_dir)) { + err = dsl_dataset_own_obj(dp, + snap->ds->ds_dir->dd_phys->dd_origin_obj, + 0, FTAG, &pa.origin_origin); + if (err != 0) + goto out; + } + +out: + rw_exit(&dp->dp_config_rwlock); + + /* + * Add in 128x the snapnames zapobj size, since we will be moving + * a bunch of snapnames to the promoted ds, and dirtying their + * bonus buffers. + */ + if (err == 0) { + err = dsl_sync_task_do(dp, dsl_dataset_promote_check, + dsl_dataset_promote_sync, ds, &pa, + 2 + 2 * doi.doi_physical_blks); + } + + snaplist_destroy(&pa.shared_snaps, B_TRUE); + snaplist_destroy(&pa.clone_snaps, B_FALSE); + snaplist_destroy(&pa.origin_snaps, B_FALSE); + if (pa.origin_origin) + dsl_dataset_disown(pa.origin_origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (err); +} + +struct cloneswaparg { + dsl_dataset_t *cds; /* clone dataset */ + dsl_dataset_t *ohds; /* origin's head dataset */ + boolean_t force; + int64_t unused_refres_delta; /* change in unconsumed refreservation */ +}; + +/* ARGSUSED */ +static int +dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + + /* they should both be heads */ + if (dsl_dataset_is_snapshot(csa->cds) || + dsl_dataset_is_snapshot(csa->ohds)) + return (EINVAL); + + /* the branch point should be just before them */ + if (csa->cds->ds_prev != csa->ohds->ds_prev) + return (EINVAL); + + /* cds should be the clone */ + if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != + csa->ohds->ds_object) + return (EINVAL); + + /* the clone should be a child of the origin */ + if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) + return (EINVAL); + + /* ohds shouldn't be modified unless 'force' */ + if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) + return (ETXTBSY); + + /* adjust amount of any unconsumed refreservation */ + csa->unused_refres_delta = + (int64_t)MIN(csa->ohds->ds_reserved, + csa->ohds->ds_phys->ds_unique_bytes) - + (int64_t)MIN(csa->ohds->ds_reserved, + csa->cds->ds_phys->ds_unique_bytes); + + if (csa->unused_refres_delta > 0 && + csa->unused_refres_delta > + dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; + + ASSERT(csa->cds->ds_reserved == 0); + ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + + dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); + + if (csa->cds->ds_user_ptr != NULL) { + csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); + csa->cds->ds_user_ptr = NULL; + } + + if (csa->ohds->ds_user_ptr != NULL) { + csa->ohds->ds_user_evict_func(csa->ohds, + csa->ohds->ds_user_ptr); + csa->ohds->ds_user_ptr = NULL; + } + + /* reset origin's unique bytes */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + + /* swap blkptrs */ + { + blkptr_t tmp; + tmp = csa->ohds->ds_phys->ds_bp; + csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; + csa->cds->ds_phys->ds_bp = tmp; + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(csa->cds->ds_dir->dd_phys-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, + &cdl_comp, &cdl_uncomp)); + VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, + &odl_comp, &odl_uncomp)); + + dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - + (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - + (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + + cdl_uncomp - + (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); + + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). + */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + + /* swap ds_*_bytes */ + SWITCH64(csa->ohds->ds_phys->ds_used_bytes, + csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, + csa->cds->ds_phys->ds_compressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, + csa->cds->ds_phys->ds_uncompressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, + csa->cds->ds_phys->ds_unique_bytes); + + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, + csa->unused_refres_delta, 0, 0, tx); + + /* swap deadlists */ + bplist_close(&csa->cds->ds_deadlist); + bplist_close(&csa->ohds->ds_deadlist); + SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, + csa->cds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, + csa->cds->ds_phys->ds_deadlist_obj)); + VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, + csa->ohds->ds_phys->ds_deadlist_obj)); + + dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); +} + +/* + * Swap 'clone' with its origin head file system. Used at the end + * of "online recv" to swizzle the file system to the new version. + */ +int +dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force) +{ + struct cloneswaparg csa; + int error; + + ASSERT(clone->ds_owner); + ASSERT(origin_head->ds_owner); +retry: + /* Need exclusive access for the swap */ + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + rw_exit(&clone->ds_rwlock); + rw_enter(&origin_head->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { + rw_exit(&origin_head->ds_rwlock); + goto retry; + } + } + csa.cds = clone; + csa.ohds = origin_head; + csa.force = force; + error = dsl_sync_task_do(clone->ds_dir->dd_pool, + dsl_dataset_clone_swap_check, + dsl_dataset_clone_swap_sync, &csa, NULL, 9); + return (error); +} + +/* + * Given a pool name and a dataset object number in that pool, + * return the name of that dataset. + */ +int +dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) +{ + spa_t *spa; + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + + if ((error = spa_open(pname, &spa, FTAG)) != 0) + return (error); + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); + } + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + + return (error); +} + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. + */ + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + error = ERESTART; + else + error = EDQUOT; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +/* ARGSUSED */ +static int +dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) + return (ENOTSUP); + + if (new_quota == 0) + return (0); + + if (new_quota < ds->ds_phys->ds_used_bytes || + new_quota < ds->ds_reserved) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +void +dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + ds->ds_quota = new_quota; + + dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + + spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu ", + (longlong_t)new_quota, ds->ds_object); +} + +int +dsl_dataset_set_quota(const char *dsname, uint64_t quota) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + if (quota != ds->ds_quota) { + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. + */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, "a, 0); + } + dsl_dataset_rele(ds, FTAG); + return (err); +} + +static int +dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + int64_t delta; + uint64_t unique; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_REFRESERVATION) + return (ENOTSUP); + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); + mutex_exit(&ds->ds_lock); + + if (delta > 0 && + delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (delta > 0 && ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t unique; + int64_t delta; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(0, (int64_t)(new_reservation - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = new_reservation; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + new_reservation, cr, tx); + + spa_history_internal_log(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + (longlong_t)new_reservation, ds->ds_object); +} + +int +dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_reservation_check, + dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_rele(ds, FTAG); + return (err); +} diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c new file mode 100644 index 000000000..da5d15787 --- /dev/null +++ b/module/zfs/dsl_deleg.c @@ -0,0 +1,735 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DSL permissions are stored in a two level zap attribute + * mechanism. The first level identifies the "class" of + * entry. The class is identified by the first 2 letters of + * the attribute. The second letter "l" or "d" identifies whether + * it is a local or descendent permission. The first letter + * identifies the type of entry. + * + * ul$<id> identifies permissions granted locally for this userid. + * ud$<id> identifies permissions granted on descendent datasets for + * this userid. + * Ul$<id> identifies permission sets granted locally for this userid. + * Ud$<id> identifies permission sets granted on descendent datasets for + * this userid. + * gl$<id> identifies permissions granted locally for this groupid. + * gd$<id> identifies permissions granted on descendent datasets for + * this groupid. + * Gl$<id> identifies permission sets granted locally for this groupid. + * Gd$<id> identifies permission sets granted on descendent datasets for + * this groupid. + * el$ identifies permissions granted locally for everyone. + * ed$ identifies permissions granted on descendent datasets + * for everyone. + * El$ identifies permission sets granted locally for everyone. + * Ed$ identifies permission sets granted to descendent datasets for + * everyone. + * c-$ identifies permission to create at dataset creation time. + * C-$ identifies permission sets to grant locally at dataset creation + * time. + * s-$@<name> permissions defined in specified set @<name> + * S-$@<name> Sets defined in named set @<name> + * + * Each of the above entities points to another zap attribute that contains one + * attribute for each allowed permission, such as create, destroy,... + * All of the "upper" case class types will specify permission set names + * rather than permissions. + * + * Basically it looks something like this: + * ul$12 -> ZAP OBJ -> permissions... + * + * The ZAP OBJ is referred to as the jump object. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio_checksum.h> /* for the default checksum value */ +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/cred.h> +#include <sys/sunddi.h> + +#include "zfs_deleg.h" + +/* + * Validate that user is allowed to delegate specified permissions. + * + * In order to delegate "create" you must have "create" + * and "allow". + */ +int +dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + nvlist_t *perms; + nvpair_t *permpair = NULL; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + + if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) + return (EPERM); + + if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) + return (error); + } + } + return (0); +} + +/* + * Validate that user is allowed to unallow specified permissions. They + * must have the 'allow' permission, and even then can only unallow + * perms for their uid. + */ +int +dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + char idstr[32]; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + (void) snprintf(idstr, sizeof (idstr), "%lld", + (longlong_t)crgetuid(cr)); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; + + if (type != ZFS_DELEG_USER && + type != ZFS_DELEG_USER_SETS) + return (EPERM); + + if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) + return (EPERM); + } + return (0); +} + +static void +dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, + DMU_OT_NONE, 0, tx); + VERIFY(zap_update(mos, zapobj, + whokey, 8, 1, &jumpobj, tx) == 0); + } + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + VERIFY(zap_update(mos, jumpobj, + perm, 8, 1, &n, tx) == 0); + spa_history_internal_log(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +static void +dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + return; + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + if (nvpair_value_nvlist(whopair, &perms) != 0) { + if (zap_lookup(mos, zapobj, whokey, 8, + 1, &jumpobj) == 0) { + (void) zap_remove(mos, zapobj, whokey, tx); + VERIFY(0 == zap_destroy(mos, jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s dataset = %llu", whokey, + dd->dd_phys->dd_head_dataset_obj); + continue; + } + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) + continue; + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + (void) zap_remove(mos, jumpobj, perm, tx); + if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { + (void) zap_remove(mos, zapobj, + whokey, tx); + VERIFY(0 == zap_destroy(mos, + jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_dir_t *dd; + int error; + nvpair_t *whopair = NULL; + int blocks_modified = 0; + + error = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (error) + return (error); + + if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dir_close(dd, FTAG); + return (ENOTSUP); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) + blocks_modified++; + + error = dsl_sync_task_do(dd->dd_pool, NULL, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + dd, nvp, blocks_modified); + dsl_dir_close(dd, FTAG); + + return (error); +} + +/* + * Find all 'allow' permissions from a given point and then continue + * traversing up to the root. + * + * This function constructs an nvlist of nvlists. + * each setpoint is an nvlist composed of an nvlist of an nvlist + * of the individual * users/groups/everyone/create + * permissions. + * + * The nvlist will look like this. + * + * { source fsname -> { whokeys { permissions,...}, ...}} + * + * The fsname nvpairs will be arranged in a bottom up order. For example, + * if we have the following structure a/b/c then the nvpairs for the fsnames + * will be ordered a/b/c, a/b, a. + */ +int +dsl_deleg_get(const char *ddname, nvlist_t **nvp) +{ + dsl_dir_t *dd, *startdd; + dsl_pool_t *dp; + int error; + objset_t *mos; + + error = dsl_dir_open(ddname, FTAG, &startdd, NULL); + if (error) + return (error); + + dp = startdd->dd_pool; + mos = dp->dp_meta_objset; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = startdd; dd != NULL; dd = dd->dd_parent) { + zap_cursor_t basezc; + zap_attribute_t baseza; + nvlist_t *sp_nvp; + uint64_t n; + char source[MAXNAMELEN]; + + if (dd->dd_phys->dd_deleg_zapobj && + (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, + &n) == 0) && n) { + VERIFY(nvlist_alloc(&sp_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + } else { + continue; + } + + for (zap_cursor_init(&basezc, mos, + dd->dd_phys->dd_deleg_zapobj); + zap_cursor_retrieve(&basezc, &baseza) == 0; + zap_cursor_advance(&basezc)) { + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *perms_nvp; + + ASSERT(baseza.za_integer_length == 8); + ASSERT(baseza.za_num_integers == 1); + + VERIFY(nvlist_alloc(&perms_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + for (zap_cursor_init(&zc, mos, baseza.za_first_integer); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(nvlist_add_boolean(perms_nvp, + za.za_name) == 0); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, + perms_nvp) == 0); + nvlist_free(perms_nvp); + } + + zap_cursor_fini(&basezc); + + dsl_dir_name(dd, source); + VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + nvlist_free(sp_nvp); + } + rw_exit(&dp->dp_config_rwlock); + + dsl_dir_close(startdd, FTAG); + return (0); +} + +/* + * Routines for dsl_deleg_access() -- access checking. + */ +typedef struct perm_set { + avl_node_t p_node; + boolean_t p_matched; + char p_setname[ZFS_MAX_DELEG_NAME]; +} perm_set_t; + +static int +perm_set_compare(const void *arg1, const void *arg2) +{ + const perm_set_t *node1 = arg1; + const perm_set_t *node2 = arg2; + int val; + + val = strcmp(node1->p_setname, node2->p_setname); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * Determine whether a specified permission exists. + * + * First the base attribute has to be retrieved. i.e. ul$12 + * Once the base object has been retrieved the actual permission + * is lookup up in the zap object the base object points to. + * + * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if + * there is no perm in that jumpobj. + */ +static int +dsl_check_access(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, const char *perm) +{ + int error; + uint64_t jumpobj, zero; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error == 0) { + error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); + if (error == ENOENT) + error = EPERM; + } + return (error); +} + +/* + * check a specified user/group for a requested permission + */ +static int +dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, + int checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids; + int i; + uint64_t id; + + /* check for user */ + id = crgetuid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_USER, checkflag, &id, perm) == 0) + return (0); + + /* check for users primary group */ + id = crgetgid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + + /* check for everyone entry */ + id = -1; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) + return (0); + + /* check each supplemental group user is a member of */ + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + } + + return (EPERM); +} + +/* + * Iterate over the sets specified in the specified zapobj + * and load them into the permsets avl tree. + */ +static int +dsl_load_sets(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, avl_tree_t *avl) +{ + zap_cursor_t zc; + zap_attribute_t za; + perm_set_t *permnode; + avl_index_t idx; + uint64_t jumpobj; + int error; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error != 0) + return (error); + + for (zap_cursor_init(&zc, mos, jumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); + (void) strlcpy(permnode->p_setname, za.za_name, + sizeof (permnode->p_setname)); + permnode->p_matched = B_FALSE; + + if (avl_find(avl, permnode, &idx) == NULL) { + avl_insert(avl, permnode, idx); + } else { + kmem_free(permnode, sizeof (perm_set_t)); + } + } + zap_cursor_fini(&zc); + return (0); +} + +/* + * Load all permissions user based on cred belongs to. + */ +static void +dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, + char checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids, i; + uint64_t id; + + id = crgetuid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_USER_SETS, checkflag, &id, avl); + + id = crgetgid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); + + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + } +} + +/* + * Check if user has requested permission. + */ +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_dataset_t *ds; + dsl_dir_t *dd; + dsl_pool_t *dp; + void *cookie; + int error; + char checkflag = ZFS_DELEG_LOCAL; + objset_t *mos; + avl_tree_t permsets; + perm_set_t *setnode; + + error = dsl_dataset_hold(dsname, FTAG, &ds); + if (error) + return (error); + + dp = ds->ds_dir->dd_pool; + mos = dp->dp_meta_objset; + + if (dsl_delegation_on(mos) == B_FALSE) { + dsl_dataset_rele(ds, FTAG); + return (ECANCELED); + } + + if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dataset_rele(ds, FTAG); + return (EPERM); + } + + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), + offsetof(perm_set_t, p_node)); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, + checkflag = ZFS_DELEG_DESCENDENT) { + uint64_t zapobj; + boolean_t expanded; + + /* + * If not in global zone then make sure + * the zoned property is set + */ + if (!INGLOBALZONE(curproc)) { + uint64_t zoned; + + if (dsl_prop_get_dd(dd, + zfs_prop_to_name(ZFS_PROP_ZONED), + 8, 1, &zoned, NULL) != 0) + break; + if (!zoned) + break; + } + zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + continue; + + dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); +again: + expanded = B_FALSE; + for (setnode = avl_first(&permsets); setnode; + setnode = AVL_NEXT(&permsets, setnode)) { + if (setnode->p_matched == B_TRUE) + continue; + + /* See if this set directly grants this permission */ + error = dsl_check_access(mos, zapobj, + ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); + if (error == 0) + goto success; + if (error == EPERM) + setnode->p_matched = B_TRUE; + + /* See if this set includes other sets */ + error = dsl_load_sets(mos, zapobj, + ZFS_DELEG_NAMED_SET_SETS, 0, + setnode->p_setname, &permsets); + if (error == 0) + setnode->p_matched = expanded = B_TRUE; + } + /* + * If we expanded any sets, that will define more sets, + * which we need to check. + */ + if (expanded) + goto again; + + error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); + if (error == 0) + goto success; + } + error = EPERM; +success: + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + cookie = NULL; + while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) + kmem_free(setnode, sizeof (perm_set_t)); + + return (error); +} + +/* + * Other routines. + */ + +static void +copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, + boolean_t dosets, uint64_t uid, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t jumpobj, pjumpobj; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + zap_cursor_t zc; + zap_attribute_t za; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, + ZFS_DELEG_LOCAL, NULL); + if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) + return; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, + ZFS_DELEG_LOCAL, &uid); + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + } + + for (zap_cursor_init(&zc, mos, pjumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zero = 0; + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + + VERIFY(zap_update(mos, jumpobj, za.za_name, + 8, 1, &zero, tx) == 0); + } + zap_cursor_fini(&zc); +} + +/* + * set all create time permission on new dataset. + */ +void +dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) +{ + dsl_dir_t *dd; + uint64_t uid = crgetuid(cr); + + if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return; + + for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { + uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; + + if (pzapobj == 0) + continue; + + copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); + copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); + } +} + +int +dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); + VERIFY(0 == zap_destroy(mos, zapobj, tx)); + return (0); +} + +boolean_t +dsl_delegation_on(objset_t *os) +{ + return (os->os->os_spa->spa_delegation); +} diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c new file mode 100644 index 000000000..48d87f97f --- /dev/null +++ b/module/zfs/dsl_dir.c @@ -0,0 +1,1331 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/sunddi.h> +#include "zfs_namecheck.h" + +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx); + + +/* ARGSUSED */ +static void +dsl_dir_evict(dmu_buf_t *db, void *arg) +{ + dsl_dir_t *dd = arg; + dsl_pool_t *dp = dd->dd_pool; + int t; + + for (t = 0; t < TXG_SIZE; t++) { + ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); + ASSERT(dd->dd_tempreserved[t] == 0); + ASSERT(dd->dd_space_towrite[t] == 0); + } + + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + + spa_close(dd->dd_pool->dp_spa, dd); + + /* + * The props callback list should be empty since they hold the + * dir open. + */ + list_destroy(&dd->dd_prop_cbs); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); +} + +int +dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **ddp) +{ + dmu_buf_t *dbuf; + dsl_dir_t *dd; + int err; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + + err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); + if (err) + return (err); + dd = dmu_buf_get_user(dbuf); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbuf, &doi); + ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); + } +#endif + if (dd == NULL) { + dsl_dir_t *winner; + int err; + + dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); + dd->dd_object = ddobj; + dd->dd_dbuf = dbuf; + dd->dd_pool = dp; + dd->dd_phys = dbuf->db_data; + mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_node)); + + if (dd->dd_phys->dd_parent_obj) { + err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, + NULL, dd, &dd->dd_parent); + if (err) + goto errout; + if (tail) { +#ifdef ZFS_DEBUG + uint64_t foundobj; + + err = zap_lookup(dp->dp_meta_objset, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, + tail, sizeof (foundobj), 1, &foundobj); + ASSERT(err || foundobj == ddobj); +#endif + (void) strcpy(dd->dd_myname, tail); + } else { + err = zap_value_search(dp->dp_meta_objset, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, + ddobj, 0, dd->dd_myname); + } + if (err) + goto errout; + } else { + (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); + } + + winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, + dsl_dir_evict); + if (winner) { + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dd = winner; + } else { + spa_open_ref(dp->dp_spa, dd); + } + } + + /* + * The dsl_dir_t has both open-to-close and instantiate-to-evict + * holds on the spa. We need the open-to-close holds because + * otherwise the spa_refcnt wouldn't change when we open a + * dir which the spa also has open, so we could incorrectly + * think it was OK to unload/export/destroy the pool. We need + * the instantiate-to-evict hold because the dsl_dir_t has a + * pointer to the dd_pool, which has a pointer to the spa_t. + */ + spa_open_ref(dp->dp_spa, tag); + ASSERT3P(dd->dd_pool, ==, dp); + ASSERT3U(dd->dd_object, ==, ddobj); + ASSERT3P(dd->dd_dbuf, ==, dbuf); + *ddp = dd; + return (0); + +errout: + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + +} + +void +dsl_dir_close(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ +void +dsl_dir_name(dsl_dir_t *dd, char *buf) +{ + if (dd->dd_parent) { + dsl_dir_name(dd->dd_parent, buf); + (void) strcat(buf, "/"); + } else { + buf[0] = '\0'; + } + if (!MUTEX_HELD(&dd->dd_lock)) { + /* + * recursive mutex so that we can use + * dprintf_dd() with dd_lock held + */ + mutex_enter(&dd->dd_lock); + (void) strcat(buf, dd->dd_myname); + mutex_exit(&dd->dd_lock); + } else { + (void) strcat(buf, dd->dd_myname); + } +} + +/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */ +int +dsl_dir_namelen(dsl_dir_t *dd) +{ + int result = 0; + + if (dd->dd_parent) { + /* parent's name + 1 for the "/" */ + result = dsl_dir_namelen(dd->dd_parent) + 1; + } + + if (!MUTEX_HELD(&dd->dd_lock)) { + /* see dsl_dir_name */ + mutex_enter(&dd->dd_lock); + result += strlen(dd->dd_myname); + mutex_exit(&dd->dd_lock); + } else { + result += strlen(dd->dd_myname); + } + + return (result); +} + +int +dsl_dir_is_private(dsl_dir_t *dd) +{ + int rv = FALSE; + + if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) + rv = TRUE; + if (dataset_name_hidden(dd->dd_myname)) + rv = TRUE; + return (rv); +} + + +static int +getcomponent(const char *path, char *component, const char **nextp) +{ + char *p; + if (path == NULL) + return (ENOENT); + /* This would be a good place to reserve some namespace... */ + p = strpbrk(path, "/@"); + if (p && (p[1] == '/' || p[1] == '@')) { + /* two separators in a row */ + return (EINVAL); + } + if (p == NULL || p == path) { + /* + * if the first thing is an @ or /, it had better be an + * @ and it had better not have any more ats or slashes, + * and it had better have something after the @. + */ + if (p != NULL && + (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) + return (EINVAL); + if (strlen(path) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(component, path); + p = NULL; + } else if (p[0] == '/') { + if (p-path >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(component, path, p - path); + component[p-path] = '\0'; + p++; + } else if (p[0] == '@') { + /* + * if the next separator is an @, there better not be + * any more slashes. + */ + if (strchr(path, '/')) + return (EINVAL); + if (p-path >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(component, path, p - path); + component[p-path] = '\0'; + } else { + ASSERT(!"invalid p"); + } + *nextp = p; + return (0); +} + +/* + * same as dsl_open_dir, ignore the first component of name and use the + * spa instead + */ +int +dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, + dsl_dir_t **ddp, const char **tailp) +{ + char buf[MAXNAMELEN]; + const char *next, *nextnext = NULL; + int err; + dsl_dir_t *dd; + dsl_pool_t *dp; + uint64_t ddobj; + int openedspa = FALSE; + + dprintf("%s\n", name); + + err = getcomponent(name, buf, &next); + if (err) + return (err); + if (spa == NULL) { + err = spa_open(buf, &spa, FTAG); + if (err) { + dprintf("spa_open(%s) failed\n", buf); + return (err); + } + openedspa = TRUE; + + /* XXX this assertion belongs in spa_open */ + ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); + } + + dp = spa_get_dsl(spa); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err) { + rw_exit(&dp->dp_config_rwlock); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + + while (next != NULL) { + dsl_dir_t *child_ds; + err = getcomponent(next, buf, &nextnext); + if (err) + break; + ASSERT(next[0] != '\0'); + if (next[0] == '@') + break; + dprintf("looking up %s in obj%lld\n", + buf, dd->dd_phys->dd_child_dir_zapobj); + + err = zap_lookup(dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, + buf, sizeof (ddobj), 1, &ddobj); + if (err) { + if (err == ENOENT) + err = 0; + break; + } + + err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); + if (err) + break; + dsl_dir_close(dd, tag); + dd = child_ds; + next = nextnext; + } + rw_exit(&dp->dp_config_rwlock); + + if (err) { + dsl_dir_close(dd, tag); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + + /* + * It's an error if there's more than one component left, or + * tailp==NULL and there's any component left. + */ + if (next != NULL && + (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { + /* bad path name */ + dsl_dir_close(dd, tag); + dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); + err = ENOENT; + } + if (tailp) + *tailp = next; + if (openedspa) + spa_close(spa, FTAG); + *ddp = dd; + return (err); +} + +/* + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. Return NULL if the path is bogus, or if + * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' + * means that the last component is a snapshot. + */ +int +dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) +{ + return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); +} + +uint64_t +dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, + dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t ddobj; + dsl_dir_phys_t *dsphys; + dmu_buf_t *dbuf; + + ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, + DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); + if (pds) { + VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + } else { + /* it's the root dir */ + VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); + } + VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + + dsphys->dd_creation_time = gethrestime_sec(); + if (pds) + dsphys->dd_parent_obj = pds->dd_object; + dsphys->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + dsphys->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + dmu_buf_rele(dbuf, FTAG); + + return (ddobj); +} + +/* ARGSUSED */ +int +dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err; + uint64_t count; + + /* + * There should be exactly two holds, both from + * dsl_dataset_destroy: one on the dd directory, and one on its + * head ds. Otherwise, someone is trying to lookup something + * inside this dir while we want to destroy it. The + * config_rwlock ensures that nobody else opens it after we + * check. + */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + return (EBUSY); + + err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); + if (err) + return (err); + if (count != 0) + return (EEXIST); + + return (0); +} + +void +dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t val, obj; + dd_used_t t; + + ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); + + /* Remove our reservation. */ + val = 0; + dsl_dir_set_reservation_sync(dd, &val, cr, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); + ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); + + VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); + VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); + VERIFY(0 == zap_remove(mos, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); + + obj = dd->dd_object; + dsl_dir_close(dd, tag); + VERIFY(0 == dmu_object_free(mos, obj, tx)); +} + +boolean_t +dsl_dir_is_clone(dsl_dir_t *dd) +{ + return (dd->dd_phys->dd_origin_obj && + (dd->dd_pool->dp_origin_snap == NULL || + dd->dd_phys->dd_origin_obj != + dd->dd_pool->dp_origin_snap->ds_object)); +} + +void +dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) +{ + mutex_enter(&dd->dd_lock); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dd->dd_phys->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, + dd->dd_phys->dd_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, + dd->dd_phys->dd_compressed_bytes == 0 ? 100 : + (dd->dd_phys->dd_uncompressed_bytes * 100 / + dd->dd_phys->dd_compressed_bytes)); + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + } + mutex_exit(&dd->dd_lock); + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(dd)) { + dsl_dataset_t *ds; + char buf[MAXNAMELEN]; + + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &ds)); + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); + dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); + } + rw_exit(&dd->dd_pool->dp_config_rwlock); +} + +void +dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + + ASSERT(dd->dd_phys); + + if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(dd->dd_dbuf, dd); + } +} + +static int64_t +parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) +{ + uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); + uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); + return (new_accounted - old_accounted); +} + +void +dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); + dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; + mutex_exit(&dd->dd_lock); + + /* release the hold from dsl_dir_dirty */ + dmu_buf_rele(dd->dd_dbuf, dd); +} + +static uint64_t +dsl_dir_space_towrite(dsl_dir_t *dd) +{ + uint64_t space = 0; + int i; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + for (i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i&TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); + } + return (space); +} + +/* + * How much space would dd have available if ancestor had delta applied + * to it? If ondiskonly is set, we're only interested in what's + * on-disk, not estimated pending changes. + */ +uint64_t +dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly) +{ + uint64_t parentspace, myspace, quota, used; + + /* + * If there are no restrictions otherwise, assume we have + * unlimited space available. + */ + quota = UINT64_MAX; + parentspace = UINT64_MAX; + + if (dd->dd_parent != NULL) { + parentspace = dsl_dir_space_available(dd->dd_parent, + ancestor, delta, ondiskonly); + } + + mutex_enter(&dd->dd_lock); + if (dd->dd_phys->dd_quota != 0) + quota = dd->dd_phys->dd_quota; + used = dd->dd_phys->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); + + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); + quota = MIN(quota, poolsize); + } + + if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { + /* + * We have some space reserved, in addition to what our + * parent gave us. + */ + parentspace += dd->dd_phys->dd_reserved - used; + } + + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + + if (used > quota) { + /* over quota */ + myspace = 0; + + /* + * While it's OK to be a little over quota, if + * we think we are using more space than there + * is in the pool (which is already 1.6% more than + * dsl_pool_adjustedsize()), something is very + * wrong. + */ + ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa)); + } else { + /* + * the lesser of the space provided by our parent and + * the space left in our quota + */ + myspace = MIN(parentspace, quota - used); + } + + mutex_exit(&dd->dd_lock); + + return (myspace); +} + +struct tempreserve { + list_node_t tr_node; + dsl_pool_t *tr_dp; + dsl_dir_t *tr_ds; + uint64_t tr_size; +}; + +static int +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + dmu_tx_t *tx, boolean_t first) +{ + uint64_t txg = tx->tx_txg; + uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + struct tempreserve *tr; + int enospc = EDQUOT; + int txgidx = txg & TXG_MASK; + int i; + uint64_t ref_rsrv = 0; + + ASSERT3U(txg, !=, 0); + ASSERT3S(asize, >, 0); + + mutex_enter(&dd->dd_lock); + + /* + * Check against the dsl_dir's quota. We don't add in the delta + * when checking for over-quota because they get one free hit. + */ + est_inflight = dsl_dir_space_towrite(dd); + for (i = 0; i < TXG_SIZE; i++) + est_inflight += dd->dd_tempreserved[i]; + used_on_disk = dd->dd_phys->dd_used_bytes; + + /* + * On the first iteration, fetch the dataset's used-on-disk and + * refreservation values. Also, if checkrefquota is set, test if + * allocating this space would exceed the dataset's refquota. + */ + if (first && tx->tx_objset) { + int error; + dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + + error = dsl_dataset_check_quota(ds, checkrefquota, + asize, est_inflight, &used_on_disk, &ref_rsrv); + if (error) { + mutex_exit(&dd->dd_lock); + return (error); + } + } + + /* + * If this transaction will result in a net free of space, + * we want to let it through. + */ + if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) + quota = UINT64_MAX; + else + quota = dd->dd_phys->dd_quota; + + /* + * Adjust the quota against the actual pool size at the root. + * To ensure that it's possible to remove files from a full + * pool without inducing transient overcommits, we throttle + * netfree transactions against a quota that is slightly larger, + * but still within the pool's allocation slop. In cases where + * we're very close to full, this will allow a steady trickle of + * removes to get through. + */ + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); + if (poolsize < quota) { + quota = poolsize; + enospc = ENOSPC; + } + } + + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (used_on_disk + est_inflight > quota) { + if (est_inflight > 0 || used_on_disk < quota) + enospc = ERESTART; + dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " + "quota=%lluK tr=%lluK err=%d\n", + used_on_disk>>10, est_inflight>>10, + quota>>10, asize>>10, enospc); + mutex_exit(&dd->dd_lock); + return (enospc); + } + + /* We need to up our estimated delta before dropping dd_lock */ + dd->dd_tempreserved[txgidx] += asize; + + parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + asize - ref_rsrv); + mutex_exit(&dd->dd_lock); + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = dd; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + /* see if it's OK with our parent */ + if (dd->dd_parent && parent_rsrv) { + boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); + + return (dsl_dir_tempreserve_impl(dd->dd_parent, + parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); + } else { + return (0); + } +} + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and dsl_dir_willuse_space() + * has been called), the reservation should be canceled, using + * dsl_dir_tempreserve_clear(). + */ +int +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, + uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) +{ + int err; + list_t *tr_list; + + if (asize == 0) { + *tr_cookiep = NULL; + return (0); + } + + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(tr_list, sizeof (struct tempreserve), + offsetof(struct tempreserve, tr_node)); + ASSERT3S(asize, >, 0); + ASSERT3S(fsize, >=, 0); + + err = arc_tempreserve_space(lsize, tx->tx_txg); + if (err == 0) { + struct tempreserve *tr; + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + + err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); + } else { + if (err == EAGAIN) { + txg_delay(dd->dd_pool, tx->tx_txg, 1); + err = ERESTART; + } + dsl_pool_memory_pressure(dd->dd_pool); + } + + if (err == 0) { + struct tempreserve *tr; + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_dp = dd->dd_pool; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, + FALSE, asize > usize, tr_list, tx, TRUE); + } + + if (err) + dsl_dir_tempreserve_clear(tr_list, tx); + else + *tr_cookiep = tr_list; + + return (err); +} + +/* + * Clear a temporary reservation that we previously made with + * dsl_dir_tempreserve_space(). + */ +void +dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) +{ + int txgidx = tx->tx_txg & TXG_MASK; + list_t *tr_list = tr_cookie; + struct tempreserve *tr; + + ASSERT3U(tx->tx_txg, !=, 0); + + if (tr_cookie == NULL) + return; + + while (tr = list_head(tr_list)) { + if (tr->tr_dp) { + dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); + } else if (tr->tr_ds) { + mutex_enter(&tr->tr_ds->dd_lock); + ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, + tr->tr_size); + tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; + mutex_exit(&tr->tr_ds->dd_lock); + } else { + arc_tempreserve_clear(tr->tr_size); + } + list_remove(tr_list, tr); + kmem_free(tr, sizeof (struct tempreserve)); + } + + kmem_free(tr_list, sizeof (list_t)); +} + +static void +dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + int64_t parent_space; + uint64_t est_used; + + mutex_enter(&dd->dd_lock); + if (space > 0) + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; + + est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; + parent_space = parent_delta(dd, est_used, space); + mutex_exit(&dd->dd_lock); + + /* Make sure that we clean up dd_space_to* */ + dsl_dir_dirty(dd, tx); + + /* XXX this is potentially expensive and unnecessary... */ + if (parent_space && dd->dd_parent) + dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); +} + +/* + * Call in open context when we think we're going to write/free space, + * eg. when dirtying data. Be conservative (ie. OK to write less than + * this or free more than this, but don't write more or free less). + */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + dsl_pool_willuse_space(dd->dd_pool, space, tx); + dsl_dir_willuse_space_impl(dd, space, tx); +} + +/* call from syncing context when we actually write/free space for this dd */ +void +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +{ + int64_t accounted_delta; + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); + + dsl_dir_dirty(dd, tx); + + if (needlock) + mutex_enter(&dd->dd_lock); + accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); + ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || + dd->dd_phys->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); + dd->dd_phys->dd_used_bytes += used; + dd->dd_phys->dd_uncompressed_bytes += uncompressed; + dd->dd_phys->dd_compressed_bytes += compressed; + + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used > 0 || + dd->dd_phys->dd_used_breakdown[type] >= -used); + dd->dd_phys->dd_used_breakdown[type] += used; +#ifdef DEBUG + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += dd->dd_phys->dd_used_breakdown[t]; + ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); +#endif + } + if (needlock) + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + accounted_delta, compressed, uncompressed, tx); + dsl_dir_transfer_space(dd->dd_parent, + used - accounted_delta, + DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + } +} + +void +dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) + return; + + dsl_dir_dirty(dd, tx); + if (needlock) + mutex_enter(&dd->dd_lock); + ASSERT(delta > 0 ? + dd->dd_phys->dd_used_breakdown[oldtype] >= delta : + dd->dd_phys->dd_used_breakdown[newtype] >= -delta); + ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); + dd->dd_phys->dd_used_breakdown[oldtype] -= delta; + dd->dd_phys->dd_used_breakdown[newtype] += delta; + if (needlock) + mutex_exit(&dd->dd_lock); +} + +static int +dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + int err = 0; + uint64_t towrite; + + if (new_quota == 0) + return (0); + + mutex_enter(&dd->dd_lock); + /* + * If we are doing the preliminary check in open context, and + * there are pending changes, then don't fail it, since the + * pending changes could under-estimate the amount of space to be + * freed up. + */ + towrite = dsl_dir_space_towrite(dd); + if ((dmu_tx_is_syncing(tx) || towrite == 0) && + (new_quota < dd->dd_phys->dd_reserved || + new_quota < dd->dd_phys->dd_used_bytes + towrite)) { + err = ENOSPC; + } + mutex_exit(&dd->dd_lock); + return (err); +} + +/* ARGSUSED */ +static void +dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + dd->dd_phys->dd_quota = new_quota; + mutex_exit(&dd->dd_lock); + + spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu ", + (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj); +} + +int +dsl_dir_set_quota(const char *ddname, uint64_t quota) +{ + dsl_dir_t *dd; + int err; + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); + + if (quota != dd->dd_phys->dd_quota) { + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. + */ + txg_wait_open(dd->dd_pool, 0); + + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, dd, "a, 0); + } + dsl_dir_close(dd, FTAG); + return (err); +} + +int +dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t used, avail; + int64_t delta; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&dd->dd_lock); + used = dd->dd_phys->dd_used_bytes; + delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent) { + avail = dsl_dir_space_available(dd->dd_parent, + NULL, 0, FALSE); + } else { + avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; + } + + if (delta > 0 && delta > avail) + return (ENOSPC); + if (delta > 0 && dd->dd_phys->dd_quota > 0 && + new_reservation > dd->dd_phys->dd_quota) + return (ENOSPC); + return (0); +} + +/* ARGSUSED */ +static void +dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t used; + int64_t delta; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + used = dd->dd_phys->dd_used_bytes; + delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + dd->dd_phys->dd_reserved = new_reservation; + + if (dd->dd_parent != NULL) { + /* Roll up this additional usage into our ancestors */ + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + delta, 0, 0, tx); + } + mutex_exit(&dd->dd_lock); + + spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu", + (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj); +} + +int +dsl_dir_set_reservation(const char *ddname, uint64_t reservation) +{ + dsl_dir_t *dd; + int err; + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, dd, &reservation, 0); + dsl_dir_close(dd, FTAG); + return (err); +} + +static dsl_dir_t * +closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) +{ + for (; ds1; ds1 = ds1->dd_parent) { + dsl_dir_t *dd; + for (dd = ds2; dd; dd = dd->dd_parent) { + if (ds1 == dd) + return (dd); + } + } + return (NULL); +} + +/* + * If delta is applied to dd, how much of that delta would be applied to + * ancestor? Syncing context only. + */ +static int64_t +would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) +{ + if (dd == ancestor) + return (delta); + + mutex_enter(&dd->dd_lock); + delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); + mutex_exit(&dd->dd_lock); + return (would_change(dd->dd_parent, delta, ancestor)); +} + +struct renamearg { + dsl_dir_t *newparent; + const char *mynewname; +}; + +/*ARGSUSED*/ +static int +dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct renamearg *ra = arg2; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err; + uint64_t val; + + /* There should be 2 references: the open and the dirty */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + return (EBUSY); + + /* check for existing name */ + err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, + ra->mynewname, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + if (ra->newparent != dd->dd_parent) { + /* is there enough space? */ + uint64_t myspace = + MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); + + /* no rename into our descendant */ + if (closest_common_ancestor(dd, ra->newparent) == dd) + return (EINVAL); + + if (err = dsl_dir_transfer_possible(dd->dd_parent, + ra->newparent, myspace)) + return (err); + } + + return (0); +} + +static void +dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct renamearg *ra = arg2; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err; + + ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); + + if (ra->newparent != dd->dd_parent) { + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + -dd->dd_phys->dd_used_bytes, + -dd->dd_phys->dd_compressed_bytes, + -dd->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, + dd->dd_phys->dd_used_bytes, + dd->dd_phys->dd_compressed_bytes, + dd->dd_phys->dd_uncompressed_bytes, tx); + + if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { + uint64_t unused_rsrv = dd->dd_phys->dd_reserved - + dd->dd_phys->dd_used_bytes; + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + -unused_rsrv, 0, 0, tx); + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + unused_rsrv, 0, 0, tx); + } + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + /* remove from old parent zapobj */ + err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, tx); + ASSERT3U(err, ==, 0); + + (void) strcpy(dd->dd_myname, ra->mynewname); + dsl_dir_close(dd->dd_parent, dd); + dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, + ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); + + /* add to new parent zapobj */ + err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx); + ASSERT3U(err, ==, 0); + + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); +} + +int +dsl_dir_rename(dsl_dir_t *dd, const char *newname) +{ + struct renamearg ra; + int err; + + /* new parent should exist */ + err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); + if (err) + return (err); + + /* can't rename to different pool */ + if (dd->dd_pool != ra.newparent->dd_pool) { + err = ENXIO; + goto out; + } + + /* new name should not already exist */ + if (ra.mynewname == NULL) { + err = EEXIST; + goto out; + } + + err = dsl_sync_task_do(dd->dd_pool, + dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); + +out: + dsl_dir_close(ra.newparent, FTAG); + return (err); +} + +int +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) +{ + dsl_dir_t *ancestor; + int64_t adelta; + uint64_t avail; + + ancestor = closest_common_ancestor(sdd, tdd); + adelta = would_change(sdd, -space, ancestor); + avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); + if (avail < space) + return (ENOSPC); + + return (0); +} diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c new file mode 100644 index 000000000..dacc57c81 --- /dev/null +++ b/module/zfs/dsl_pool.c @@ -0,0 +1,613 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> + +int zfs_no_write_throttle = 0; +int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ +int zfs_txg_synctime = 5; /* target secs to sync a txg */ + +uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ +uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ +uint64_t zfs_write_limit_inflated = 0; +uint64_t zfs_write_limit_override = 0; + +kmutex_t zfs_write_limit_lock; + +static pgcnt_t old_physmem = 0; + +static int +dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) +{ + uint64_t obj; + int err; + + err = zap_lookup(dp->dp_meta_objset, + dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, + name, sizeof (obj), 1, &obj); + if (err) + return (err); + + return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); +} + +static dsl_pool_t * +dsl_pool_open_impl(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp; + blkptr_t *bp = spa_get_rootblkptr(spa); + + dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); + dp->dp_spa = spa; + dp->dp_meta_rootbp = *bp; + rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); + dp->dp_write_limit = zfs_write_limit_min; + txg_init(dp, txg); + + txg_list_create(&dp->dp_dirty_datasets, + offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_dirs, + offsetof(dsl_dir_t, dd_dirty_link)); + txg_list_create(&dp->dp_sync_tasks, + offsetof(dsl_sync_task_group_t, dstg_node)); + list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); + + mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + + return (dp); +} + +int +dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dsl_dir_t *dd; + dsl_dataset_t *ds; + objset_impl_t *osi; + + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + if (err) + goto out; + dp->dp_meta_objset = &osi->os; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, + &dp->dp_root_dir_obj); + if (err) + goto out; + + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir); + if (err) + goto out; + + err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); + if (err) + goto out; + + if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, + FTAG, &ds); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + dp, &dp->dp_origin_snap); + if (err) + goto out; + dsl_dataset_rele(ds, FTAG); + dsl_dir_close(dd, dp); + } + + /* get scrub status */ + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, + &dp->dp_scrub_func); + if (err == 0) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, + &dp->dp_scrub_queue_obj); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_min_txg); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_max_txg); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &spa->spa_scrub_errors); + if (err) + goto out; + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + /* + * A new-type scrub was in progress on an old + * pool. Restart from the beginning, since the + * old software may have changed the pool in the + * meantime. + */ + dsl_pool_scrub_restart(dp); + } + } else { + /* + * It's OK if there is no scrub in progress (and if + * there was an I/O error, ignore it). + */ + err = 0; + } + +out: + rw_exit(&dp->dp_config_rwlock); + if (err) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +void +dsl_pool_close(dsl_pool_t *dp) +{ + /* drop our references from dsl_pool_open() */ + + /* + * Since we held the origin_snap from "syncing" context (which + * includes pool-opening context), it actually only got a "ref" + * and not a hold, so just drop that here. + */ + if (dp->dp_origin_snap) + dsl_dataset_drop_ref(dp->dp_origin_snap, dp); + if (dp->dp_mos_dir) + dsl_dir_close(dp->dp_mos_dir, dp); + if (dp->dp_root_dir) + dsl_dir_close(dp->dp_root_dir, dp); + + /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ + if (dp->dp_meta_objset) + dmu_objset_evict(NULL, dp->dp_meta_objset->os); + + txg_list_destroy(&dp->dp_dirty_datasets); + txg_list_destroy(&dp->dp_dirty_dirs); + list_destroy(&dp->dp_synced_datasets); + + arc_flush(dp->dp_spa); + txg_fini(dp); + rw_destroy(&dp->dp_config_rwlock); + mutex_destroy(&dp->dp_lock); + mutex_destroy(&dp->dp_scrub_cancel_lock); + if (dp->dp_blkstats) + kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + kmem_free(dp, sizeof (dsl_pool_t)); +} + +dsl_pool_t * +dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + objset_impl_t *osip; + dsl_dataset_t *ds; + uint64_t dsobj; + + /* create and open the MOS (meta-objset) */ + dp->dp_meta_objset = &dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; + + /* create the pool directory */ + err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); + ASSERT3U(err, ==, 0); + + /* create and open the root dir */ + dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); + VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir)); + + /* create and open the meta-objset dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + MOS_DIR_NAME, &dp->dp_mos_dir)); + + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) + dsl_pool_create_origin(dp, tx); + + /* create the root dataset */ + dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + + /* create the root objset */ + VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + osip = dmu_objset_create_impl(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); +#ifdef _KERNEL + zfs_create_fs(&osip->os, kcred, zplprops, tx); +#endif + dsl_dataset_rele(ds, FTAG); + + dmu_tx_commit(tx); + + return (dp); +} + +void +dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) +{ + zio_t *zio; + dmu_tx_t *tx; + dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_sync_task_group_t *dstg; + objset_impl_t *mosi = dp->dp_meta_objset->os; + hrtime_t start, write_time; + uint64_t data_written; + int err; + + tx = dmu_tx_create_assigned(dp, txg); + + dp->dp_read_overhead = 0; + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + if (!list_link_active(&ds->ds_synced_link)) + list_insert_tail(&dp->dp_synced_datasets, ds); + else + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + DTRACE_PROBE(pool_sync__1setup); + + start = gethrtime(); + err = zio_wait(zio); + write_time = gethrtime() - start; + ASSERT(err == 0); + DTRACE_PROBE(pool_sync__2rootzio); + + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + dsl_sync_task_group_sync(dstg, tx); + DTRACE_PROBE(pool_sync__3task); + + start = gethrtime(); + while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) + dsl_dir_sync(dd, tx); + write_time += gethrtime() - start; + + if (spa_sync_pass(dp->dp_spa) == 1) + dsl_pool_scrub_sync(dp, tx); + + start = gethrtime(); + if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || + list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dmu_objset_sync(mosi, zio, tx); + err = zio_wait(zio); + ASSERT(err == 0); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + } + write_time += gethrtime() - start; + DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, + hrtime_t, dp->dp_read_overhead); + write_time -= dp->dp_read_overhead; + + dmu_tx_commit(tx); + + data_written = dp->dp_space_towrite[txg & TXG_MASK]; + dp->dp_space_towrite[txg & TXG_MASK] = 0; + ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); + + /* + * If the write limit max has not been explicitly set, set it + * to a fraction of available physical memory (default 1/8th). + * Note that we must inflate the limit because the spa + * inflates write sizes to account for data replication. + * Check this each sync phase to catch changing memory size. + */ + if (physmem != old_physmem && zfs_write_limit_shift) { + mutex_enter(&zfs_write_limit_lock); + old_physmem = physmem; + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + zfs_write_limit_inflated = MAX(zfs_write_limit_min, + spa_get_asize(dp->dp_spa, zfs_write_limit_max)); + mutex_exit(&zfs_write_limit_lock); + } + + /* + * Attempt to keep the sync time consistent by adjusting the + * amount of write traffic allowed into each transaction group. + * Weight the throughput calculation towards the current value: + * thru = 3/4 old_thru + 1/4 new_thru + */ + ASSERT(zfs_write_limit_min > 0); + if (data_written > zfs_write_limit_min / 8 && write_time > 0) { + uint64_t throughput = (data_written * NANOSEC) / write_time; + if (dp->dp_throughput) + dp->dp_throughput = throughput / 4 + + 3 * dp->dp_throughput / 4; + else + dp->dp_throughput = throughput; + dp->dp_write_limit = MIN(zfs_write_limit_inflated, + MAX(zfs_write_limit_min, + dp->dp_throughput * zfs_txg_synctime)); + } +} + +void +dsl_pool_zil_clean(dsl_pool_t *dp) +{ + dsl_dataset_t *ds; + + while (ds = list_head(&dp->dp_synced_datasets)) { + list_remove(&dp->dp_synced_datasets, ds); + ASSERT(ds->ds_user_ptr != NULL); + zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); + dmu_buf_rele(ds->ds_dbuf, ds); + } +} + +/* + * TRUE if the current thread is the tx_sync_thread or if we + * are being called from SPA context during pool initialization. + */ +int +dsl_pool_sync_context(dsl_pool_t *dp) +{ + return (curthread == dp->dp_tx.tx_sync_thread || + spa_get_dsl(dp->dp_spa) == NULL); +} + +uint64_t +dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) +{ + uint64_t space, resv; + + /* + * Reserve about 1.6% (1/64), or at least 32MB, for allocation + * efficiency. + * XXX The intent log is not accounted for, so it must fit + * within this slop. + * + * If we're trying to assess whether it's OK to do a free, + * cut the reservation in half to allow forward progress + * (e.g. make it possible to rm(1) files from a full pool). + */ + space = spa_get_dspace(dp->dp_spa); + resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); + if (netfree) + resv >>= 1; + + return (space - resv); +} + +int +dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) +{ + uint64_t reserved = 0; + uint64_t write_limit = (zfs_write_limit_override ? + zfs_write_limit_override : dp->dp_write_limit); + + if (zfs_no_write_throttle) { + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], + space); + return (0); + } + + /* + * Check to see if we have exceeded the maximum allowed IO for + * this transaction group. We can do this without locks since + * a little slop here is ok. Note that we do the reserved check + * with only half the requested reserve: this is because the + * reserve requests are worst-case, and we really don't want to + * throttle based off of worst-case estimates. + */ + if (write_limit > 0) { + reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] + + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; + + if (reserved && reserved > write_limit) + return (ERESTART); + } + + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); + + /* + * If this transaction group is over 7/8ths capacity, delay + * the caller 1 clock tick. This will slow down the "fill" + * rate until the sync process can catch up with us. + */ + if (reserved && reserved > (write_limit - (write_limit >> 3))) + txg_delay(dp, tx->tx_txg, 1); + + return (0); +} + +void +dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); +} + +void +dsl_pool_memory_pressure(dsl_pool_t *dp) +{ + uint64_t space_inuse = 0; + int i; + + if (dp->dp_write_limit == zfs_write_limit_min) + return; + + for (i = 0; i < TXG_SIZE; i++) { + space_inuse += dp->dp_space_towrite[i]; + space_inuse += dp->dp_tempreserved[i]; + } + dp->dp_write_limit = MAX(zfs_write_limit_min, + MIN(dp->dp_write_limit, space_inuse / 4)); +} + +void +dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + if (space > 0) { + mutex_enter(&dp->dp_lock); + dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; + mutex_exit(&dp->dp_lock); + } +} + +/* ARGSUSED */ +static int +upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds, *prev = NULL; + int err; + dsl_pool_t *dp = spa_get_dsl(spa); + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) + break; + dsl_dataset_rele(ds, FTAG); + ds = prev; + prev = NULL; + } + + if (prev == NULL) { + prev = dp->dp_origin_snap; + + /* + * The $ORIGIN can't have any data, or the accounting + * will be wrong. + */ + ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); + + /* The origin doesn't get attached to itself */ + if (ds->ds_object == prev->ds_object) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_prev_snap_obj = prev->ds_object; + ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; + + dmu_buf_will_dirty(prev->ds_dbuf, tx); + prev->ds_phys->ds_num_children++; + + if (ds->ds_phys->ds_next_snap_obj == 0) { + ASSERT(ds->ds_prev == NULL); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + } + } + + ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); + ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); + + if (prev->ds_phys->ds_next_clones_obj == 0) { + prev->ds_phys->ds_next_clones_obj = + zap_create(dp->dp_meta_objset, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); + + dsl_dataset_rele(ds, FTAG); + if (prev != dp->dp_origin_snap) + dsl_dataset_rele(prev, FTAG); + return (0); +} + +void +dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap != NULL); + + (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + tx, DS_FIND_CHILDREN); +} + +void +dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t dsobj; + dsl_dataset_t *ds; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap == NULL); + + /* create the origin dir, ds, & snap-ds */ + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, + NULL, 0, kcred, tx); + VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); + VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + dp, &dp->dp_origin_snap)); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); +} diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c new file mode 100644 index 000000000..212acbbc5 --- /dev/null +++ b/module/zfs/dsl_prop.c @@ -0,0 +1,602 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/spa.h> +#include <sys/zio_checksum.h> /* for the default checksum value */ +#include <sys/zap.h> +#include <sys/fs/zfs.h> + +#include "zfs_prop.h" + +static int +dodefault(const char *propname, int intsz, int numint, void *buf) +{ + zfs_prop_t prop; + + /* + * The setonce properties are read-only, BUT they still + * have a default value that can be used as the initial + * value. + */ + if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL || + (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) + return (ENOENT); + + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + if (intsz != 1) + return (EOVERFLOW); + (void) strncpy(buf, zfs_prop_default_string(prop), + numint); + } else { + if (intsz != 8 || numint < 1) + return (EOVERFLOW); + + *(uint64_t *)buf = zfs_prop_default_numeric(prop); + } + + return (0); +} + +int +dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, + int intsz, int numint, void *buf, char *setpoint) +{ + int err = ENOENT; + objset_t *mos = dd->dd_pool->dp_meta_objset; + zfs_prop_t prop; + + ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + + if (setpoint) + setpoint[0] = '\0'; + + prop = zfs_name_to_prop(propname); + + /* + * Note: dd may be NULL, therefore we shouldn't dereference it + * ouside this loop. + */ + for (; dd != NULL; dd = dd->dd_parent) { + ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + propname, intsz, numint, buf); + if (err != ENOENT) { + if (setpoint) + dsl_dir_name(dd, setpoint); + break; + } + + /* + * Break out of this loop for non-inheritable properties. + */ + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + break; + } + if (err == ENOENT) + err = dodefault(propname, intsz, numint, buf); + + return (err); +} + +int +dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, + int intsz, int numint, void *buf, char *setpoint) +{ + ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + + if (ds->ds_phys->ds_props_obj) { + int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_props_obj, propname, intsz, numint, buf); + if (err != ENOENT) { + if (setpoint) + dsl_dataset_name(ds, setpoint); + return (err); + } + } + + return (dsl_prop_get_dd(ds->ds_dir, propname, + intsz, numint, buf, setpoint)); +} + +/* + * Register interest in the named property. We'll call the callback + * once to notify it of the current property value, and again each time + * the property changes, until this callback is unregistered. + * + * Return 0 on success, errno if the prop is not an integer value. + */ +int +dsl_prop_register(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; + uint64_t value; + dsl_prop_cb_record_t *cbr; + int err; + int need_rwlock; + + need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); + if (need_rwlock) + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); + if (err != 0) { + if (need_rwlock) + rw_exit(&dp->dp_config_rwlock); + return (err); + } + + cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); + cbr->cbr_ds = ds; + cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP); + (void) strcpy((char *)cbr->cbr_propname, propname); + cbr->cbr_func = callback; + cbr->cbr_arg = cbarg; + mutex_enter(&dd->dd_lock); + list_insert_head(&dd->dd_prop_cbs, cbr); + mutex_exit(&dd->dd_lock); + + cbr->cbr_func(cbr->cbr_arg, value); + + VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, + NULL, cbr, &dd)); + if (need_rwlock) + rw_exit(&dp->dp_config_rwlock); + /* Leave dir open until this callback is unregistered */ + return (0); +} + +int +dsl_prop_get(const char *dsname, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + + dsl_dataset_rele(ds, FTAG); + return (err); +} + +/* + * Get the current property value. It may have changed by the time this + * function returns, so it is NOT safe to follow up with + * dsl_prop_register() and assume that the value has not changed in + * between. + * + * Return 0 on success, ENOENT if ddname is invalid. + */ +int +dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint) +{ + return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); +} + +/* + * Unregister this callback. Return 0 on success, ENOENT if ddname is + * invalid, ENOMSG if no matching callback registered. + */ +int +dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); + cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds && + cbr->cbr_func == callback && + cbr->cbr_arg == cbarg && + strcmp(cbr->cbr_propname, propname) == 0) + break; + } + + if (cbr == NULL) { + mutex_exit(&dd->dd_lock); + return (ENOMSG); + } + + list_remove(&dd->dd_prop_cbs, cbr); + mutex_exit(&dd->dd_lock); + kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); + kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); + + /* Clean up from dsl_prop_register */ + dsl_dir_close(dd, cbr); + return (0); +} + +/* + * Return the number of callbacks that are registered for this dataset. + */ +int +dsl_prop_numcb(dsl_dataset_t *ds) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + int num = 0; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); + cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds) + num++; + } + mutex_exit(&dd->dd_lock); + + return (num); +} + +static void +dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, + const char *propname, uint64_t value, int first) +{ + dsl_dir_t *dd; + dsl_prop_cb_record_t *cbr; + objset_t *mos = dp->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t *za; + int err; + uint64_t dummyval; + + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + return; + + if (!first) { + /* + * If the prop is set here, then this change is not + * being inherited here or below; stop the recursion. + */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, + 8, 1, &dummyval); + if (err == 0) { + dsl_dir_close(dd, FTAG); + return; + } + ASSERT3U(err, ==, ENOENT); + } + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj; + + if (strcmp(cbr->cbr_propname, propname) != 0) + continue; + + /* + * If the property is set on this ds, then it is not + * inherited here; don't call the callback. + */ + if (propobj && 0 == zap_lookup(mos, propobj, propname, + 8, 1, &dummyval)) + continue; + + cbr->cbr_func(cbr->cbr_arg, value); + } + mutex_exit(&dd->dd_lock); + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, mos, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + dsl_prop_changed_notify(dp, za->za_first_integer, + propname, value, FALSE); + } + kmem_free(za, sizeof (zap_attribute_t)); + zap_cursor_fini(&zc); + dsl_dir_close(dd, FTAG); +} + +struct prop_set_arg { + const char *name; + int intsz; + int numints; + const void *buf; +}; + + +static void +dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct prop_set_arg *psa = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj, intval; + int isint; + char valbuf[32]; + char *valstr; + + isint = (dodefault(psa->name, 8, 1, &intval) == 0); + + if (dsl_dataset_is_snapshot(ds)) { + ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_SNAP_PROPS); + if (ds->ds_phys->ds_props_obj == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_props_obj = + zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + } + zapobj = ds->ds_phys->ds_props_obj; + } else { + zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; + } + + if (psa->numints == 0) { + int err = zap_remove(mos, zapobj, psa->name, tx); + ASSERT(err == 0 || err == ENOENT); + if (isint) { + VERIFY(0 == dsl_prop_get_ds(ds, + psa->name, 8, 1, &intval, NULL)); + } + } else { + VERIFY(0 == zap_update(mos, zapobj, psa->name, + psa->intsz, psa->numints, psa->buf, tx)); + if (isint) + intval = *(uint64_t *)psa->buf; + } + + if (isint) { + if (dsl_dataset_is_snapshot(ds)) { + dsl_prop_cb_record_t *cbr; + /* + * It's a snapshot; nothing can inherit this + * property, so just look for callbacks on this + * ds here. + */ + mutex_enter(&ds->ds_dir->dd_lock); + for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; + cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds && + strcmp(cbr->cbr_propname, psa->name) == 0) + cbr->cbr_func(cbr->cbr_arg, intval); + } + mutex_exit(&ds->ds_dir->dd_lock); + } else { + dsl_prop_changed_notify(ds->ds_dir->dd_pool, + ds->ds_dir->dd_object, psa->name, intval, TRUE); + } + } + if (isint) { + (void) snprintf(valbuf, sizeof (valbuf), + "%lld", (longlong_t)intval); + valstr = valbuf; + } else { + valstr = (char *)psa->buf; + } + spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT : + LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, + "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); +} + +void +dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t zapobj = dd->dd_phys->dd_props_zapobj; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); + + dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); + + spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + "%s=%llu dataset = %llu", name, (u_longlong_t)val, + dd->dd_phys->dd_head_dataset_obj); +} + +int +dsl_prop_set(const char *dsname, const char *propname, + int intsz, int numints, const void *buf) +{ + dsl_dataset_t *ds; + int err; + struct prop_set_arg psa; + + /* + * We must do these checks before we get to the syncfunc, since + * it can't fail. + */ + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + if (intsz * numints >= ZAP_MAXVALUELEN) + return (E2BIG); + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + if (dsl_dataset_is_snapshot(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + + psa.name = propname; + psa.intsz = intsz; + psa.numints = numints; + psa.buf = buf; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_prop_set_sync, ds, &psa, 2); + + dsl_dataset_rele(ds, FTAG); + return (err); +} + +/* + * Iterate over all properties for this dataset and return them in an nvlist. + */ +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dir_t *dd = ds->ds_dir; + boolean_t snapshot = dsl_dataset_is_snapshot(ds); + int err = 0; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t propobj = ds->ds_phys->ds_props_obj; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (local && snapshot && !propobj) + return (0); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + while (dd != NULL) { + char setpoint[MAXNAMELEN]; + zap_cursor_t zc; + zap_attribute_t za; + dsl_dir_t *dd_next; + + if (propobj) { + dsl_dataset_name(ds, setpoint); + dd_next = dd; + } else { + dsl_dir_name(dd, setpoint); + propobj = dd->dd_phys->dd_props_zapobj; + dd_next = dd->dd_parent; + } + + for (zap_cursor_init(&zc, mos, propobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop = zfs_name_to_prop(za.za_name); + + /* Skip non-inheritable properties. */ + if (prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop) && + (dd != ds->ds_dir || (snapshot && dd != dd_next))) + continue; + + /* Skip properties not valid for this type. */ + if (snapshot && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined */ + if (nvlist_lookup_nvlist(*nvp, za.za_name, + &propval) == 0) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, + setpoint) == 0); + VERIFY(nvlist_add_nvlist(*nvp, za.za_name, + propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + + if (err != ENOENT) + break; + err = 0; + /* + * If we are just after the props that have been set + * locally, then we are done after the first iteration. + */ + if (local) + break; + dd = dd_next; + propobj = 0; + } + rw_exit(&dp->dp_config_rwlock); + + return (err); +} + +void +dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + nvlist_free(propval); +} + +void +dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + nvlist_free(propval); +} diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c new file mode 100644 index 000000000..950a91f78 --- /dev/null +++ b/module/zfs/dsl_scrub.c @@ -0,0 +1,1014 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dnode.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zil_impl.h> + +typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + +static scrub_cb_t dsl_pool_scrub_clean_cb; +static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; + +int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ +int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ + +extern int zfs_txg_timeout; + +static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { + NULL, + dsl_pool_scrub_clean_cb +}; + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +/* ARGSUSED */ +static void +dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = arg1; + enum scrub_func *funcp = arg2; + dmu_object_type_t ot = 0; + boolean_t complete = B_FALSE; + + dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); + + ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); + ASSERT(*funcp > SCRUB_FUNC_NONE); + ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); + + dp->dp_scrub_min_txg = 0; + dp->dp_scrub_max_txg = tx->tx_txg; + + if (*funcp == SCRUB_FUNC_CLEAN) { + vdev_t *rvd = dp->dp_spa->spa_root_vdev; + + /* rewrite all disk labels */ + vdev_config_dirty(rvd); + + if (vdev_resilver_needed(rvd, + &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_RESILVER_START); + dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, + tx->tx_txg); + } + + /* zero out the scrub stats in all vdev_stat_t's */ + vdev_scrub_stat_update(rvd, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + + dp->dp_spa->spa_scrub_started = B_TRUE; + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + dp->dp_scrub_func = *funcp; + dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + dp->dp_scrub_restart = B_FALSE; + dp->dp_spa->spa_scrub_errors = 0; + + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, + &dp->dp_scrub_func, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, + &dp->dp_scrub_queue_obj, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_min_txg, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_max_txg, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); +} + +int +dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) +{ + return (dsl_sync_task_do(dp, NULL, + dsl_pool_scrub_setup_sync, dp, &func, 0)); +} + +/* ARGSUSED */ +static void +dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = arg1; + boolean_t *completep = arg2; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + mutex_enter(&dp->dp_scrub_cancel_lock); + + if (dp->dp_scrub_restart) { + dp->dp_scrub_restart = B_FALSE; + *completep = B_FALSE; + } + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_started = B_FALSE; + dp->dp_spa->spa_scrub_active = B_FALSE; + + dp->dp_scrub_func = SCRUB_FUNC_NONE; + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, tx)); + dp->dp_scrub_queue_obj = 0; + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, tx)); + + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, + "complete=%u", *completep); + + /* below is scrub-clean specific */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, + *completep); + /* + * If the scrub/resilver completed, update all DTLs to reflect this. + * Whether it succeeded or not, vacate all temporary scrub DTLs. + */ + vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, + *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); + if (dp->dp_scrub_min_txg && *completep) + spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + spa_errlog_rotate(dp->dp_spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); + + dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +int +dsl_pool_scrub_cancel(dsl_pool_t *dp) +{ + boolean_t complete = B_FALSE; + + return (dsl_sync_task_do(dp, NULL, + dsl_pool_scrub_cancel_sync, dp, &complete, 3)); +} + +int +dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + /* + * This function will be used by bp-rewrite wad to intercept frees. + */ + return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, + done, private, arc_flags)); +} + +static boolean_t +bookmark_is_zero(const zbookmark_t *zb) +{ + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); +} + +/* dnp is the dnode for zb1->zb_object */ +static boolean_t +bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb1->zb_object != -1ULL); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == -1ULL) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == 0) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == 0) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +static boolean_t +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +{ + int elapsed_ticks; + int mintime; + + if (dp->dp_scrub_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb->zb_level != 0) + return (B_FALSE); + + mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : + zfs_scrub_min_time; + elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; + if (elapsed_ticks > hz * zfs_txg_timeout || + (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { + dprintf("pausing at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + dp->dp_scrub_pausing = B_TRUE; + dp->dp_scrub_bookmark = *zb; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_traverse_arg { + dsl_pool_t *zta_dp; + zil_header_t *zta_zh; +} zil_traverse_arg_t; + +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_traverse_arg_t *zta = arg; + dsl_pool_t *dp = zta->zta_dp; + zil_header_t *zh = zta->zta_zh; + zbookmark_t zb; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return; + + zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_traverse_arg_t *zta = arg; + dsl_pool_t *dp = zta->zta_dp; + zil_header_t *zh = zta->zta_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return; + + zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = lr->lr_foid; + zb.zb_level = BP_GET_LEVEL(bp); + zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); + } +} + +static void +traverse_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_traverse_arg_t zta = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && (spa_mode & FWRITE)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, + claim_txg); + + zil_free(zilog); +} + +static void +scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, + arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) +{ + int err; + arc_buf_t *buf = NULL; + + if (bp->blk_birth == 0) + return; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (scrub_pause(dp, zb)) + return; + + if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { + /* + * If we already visited this bp & everything below (in + * a prior txg), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) + return; + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. + */ + if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > dp->dp_scrub_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); + } + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + cbp = buf->b_data; + + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + scrub_visitbp(dp, dnp, buf, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *child_dnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + child_dnp = buf->b_data; + + for (i = 0; i < epb; i++, child_dnp++) { + for (j = 0; j < child_dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, + zb->zb_blkid * epb + i, + child_dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, child_dnp, buf, + &child_dnp->dn_blkptr[j], &czb); + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + int j; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + + osp = buf->b_data; + + traverse_zil(dp, &osp->os_zil_header); + + for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, 0, + osp->os_meta_dnode.dn_nlevels - 1, j); + scrub_visitbp(dp, &osp->os_meta_dnode, buf, + &osp->os_meta_dnode.dn_blkptr[j], &czb); + } + } + + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0); + scrub_visitbp(dp, NULL, NULL, bp, &zb); +} + +void +dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) != 0) { + return; + } + + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); +} + +void +dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + dp->dp_scrub_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_prev_snap_obj, tx) == 0); + } +} + +void +dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { + dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; + } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { + dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; + } + + if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds1->ds_object, tx) == 0) { + int err = zap_add_int(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, ds2->ds_object, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, ds1->ds_object, tx)); + } + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds2->ds_object, tx) == 0) { + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, ds1->ds_object, tx)); + } +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + dp = ds->ds_dir->dd_pool; + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + uint64_t min_txg_save; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + min_txg_save = dp->dp_scrub_min_txg; + dp->dp_scrub_min_txg = + MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); + scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); + dp->dp_scrub_min_txg = min_txg_save; + + if (dp->dp_scrub_pausing) + goto out; + + /* + * Add descendent datasets to work queue. + */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } else { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + + dp = ds->ds_dir->dd_pool; + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + boolean_t complete = B_TRUE; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + /* If the spa is not fully loaded, don't bother. */ + if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + return; + + if (dp->dp_scrub_restart) { + enum scrub_func func = dp->dp_scrub_func; + dp->dp_scrub_restart = B_FALSE; + dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); + } + + if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + /* + * We must have resumed after rebooting; reset the vdev + * stats to know that we're doing a scrub (although it + * will think we're just starting now). + */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + } + + dp->dp_scrub_pausing = B_FALSE; + dp->dp_scrub_start_time = lbolt64; + dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); + dp->dp_spa->spa_scrub_active = B_TRUE; + + if (dp->dp_scrub_bookmark.zb_objset == 0) { + /* First do the MOS & ORIGIN */ + scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); + if (dp->dp_scrub_pausing) + goto out; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!dp->dp_scrub_pausing); + } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset will + * be -1, so we will skip this and find a new objset + * below. + */ + scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); + if (dp->dp_scrub_pausing) + goto out; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. + */ + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + VERIFY(0 == zap_remove(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, za.za_name, tx)); + scrub_visitds(dp, za.za_first_integer, tx); + if (dp->dp_scrub_pausing) + break; + zap_cursor_fini(&zc); + } + zap_cursor_fini(&zc); + if (dp->dp_scrub_pausing) + goto out; + + /* done. */ + + dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); + return; +out: + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); +} + +void +dsl_pool_scrub_restart(dsl_pool_t *dp) +{ + mutex_enter(&dp->dp_scrub_cancel_lock); + dp->dp_scrub_restart = B_TRUE; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +/* + * scrub consumers + */ + +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } +} + +static void +dsl_pool_scrub_clean_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) + spa->spa_scrub_errors++; + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_pool_scrub_clean_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + size_t size = BP_GET_LSIZE(bp); + int d; + spa_t *spa = dp->dp_spa; + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_priority; + + count_block(dp->dp_blkstats, bp); + + if (dp->dp_scrub_isresilver == 0) { + /* It's a scrub */ + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else { + /* It's a resilver */ + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += + DVA_GET_ASIZE(&bp->blk_dva[d]); + mutex_exit(&vd->vdev_stat_lock); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best we can do is look at the + * pool-wide DTL. + * XXX -- it would be better to change our + * allocation policy to ensure that this can't + * happen. + */ + vd = spa->spa_root_vdev; + } + needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, + bp->blk_birth, 1); + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_pool_scrub_clean_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_pool_scrub_clean(dsl_pool_t *dp) +{ + /* + * Purge all vdev caches. We do this here rather than in sync + * context because this requires a writer lock on the spa_config + * lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); + dp->dp_spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(dp->dp_spa->spa_root_vdev); + dp->dp_spa->spa_scrub_reopen = B_FALSE; + spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + + return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); +} diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c new file mode 100644 index 000000000..21100225a --- /dev/null +++ b/module/zfs/dsl_synctask.c @@ -0,0 +1,225 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/cred.h> + +#define DST_AVG_BLKSHIFT 14 + +/* ARGSUSED */ +static int +dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) +{ + return (0); +} + +dsl_sync_task_group_t * +dsl_sync_task_group_create(dsl_pool_t *dp) +{ + dsl_sync_task_group_t *dstg; + + dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); + list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), + offsetof(dsl_sync_task_t, dst_node)); + dstg->dstg_pool = dp; + dstg->dstg_cr = CRED(); + + return (dstg); +} + +void +dsl_sync_task_create(dsl_sync_task_group_t *dstg, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified) +{ + dsl_sync_task_t *dst; + + if (checkfunc == NULL) + checkfunc = dsl_null_checkfunc; + dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); + dst->dst_checkfunc = checkfunc; + dst->dst_syncfunc = syncfunc; + dst->dst_arg1 = arg1; + dst->dst_arg2 = arg2; + list_insert_tail(&dstg->dstg_tasks, dst); + + dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; +} + +int +dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) +{ + dmu_tx_t *tx; + uint64_t txg; + dsl_sync_task_t *dst; + +top: + tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + + /* Do a preliminary error check. */ + dstg->dstg_err = 0; + rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { +#ifdef ZFS_DEBUG + /* + * Only check half the time, otherwise, the sync-context + * check will almost never fail. + */ + if (spa_get_random(2) == 0) + continue; +#endif + dst->dst_err = + dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); + if (dst->dst_err) + dstg->dstg_err = dst->dst_err; + } + rw_exit(&dstg->dstg_pool->dp_config_rwlock); + + if (dstg->dstg_err) { + dmu_tx_commit(tx); + return (dstg->dstg_err); + } + + VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); + + dmu_tx_commit(tx); + + txg_wait_synced(dstg->dstg_pool, txg); + + if (dstg->dstg_err == EAGAIN) + goto top; + + return (dstg->dstg_err); +} + +void +dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +{ + uint64_t txg; + + dstg->dstg_nowaiter = B_TRUE; + txg = dmu_tx_get_txg(tx); + VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); +} + +void +dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) +{ + dsl_sync_task_t *dst; + + while (dst = list_head(&dstg->dstg_tasks)) { + list_remove(&dstg->dstg_tasks, dst); + kmem_free(dst, sizeof (dsl_sync_task_t)); + } + kmem_free(dstg, sizeof (dsl_sync_task_group_t)); +} + +void +dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +{ + dsl_sync_task_t *dst; + void *tr_cookie; + + ASSERT3U(dstg->dstg_err, ==, 0); + + /* + * Check for sufficient space. + */ + dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, + dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); + /* don't bother trying again */ + if (dstg->dstg_err == ERESTART) + dstg->dstg_err = EAGAIN; + if (dstg->dstg_err) + return; + + /* + * Check for errors by calling checkfuncs. + */ + rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER); + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { + dst->dst_err = + dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); + if (dst->dst_err) + dstg->dstg_err = dst->dst_err; + } + + if (dstg->dstg_err == 0) { + /* + * Execute sync tasks. + */ + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, + dstg->dstg_cr, tx); + } + } + rw_exit(&dstg->dstg_pool->dp_config_rwlock); + + dsl_dir_tempreserve_clear(tr_cookie, tx); + + if (dstg->dstg_nowaiter) + dsl_sync_task_group_destroy(dstg); +} + +int +dsl_sync_task_do(dsl_pool_t *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified) +{ + dsl_sync_task_group_t *dstg; + int err; + + dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_create(dstg, checkfunc, syncfunc, + arg1, arg2, blocks_modified); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + return (err); +} + +void +dsl_sync_task_do_nowait(dsl_pool_t *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) +{ + dsl_sync_task_group_t *dstg; + + dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_create(dstg, checkfunc, syncfunc, + arg1, arg2, blocks_modified); + dsl_sync_task_group_nowait(dstg, tx); +} diff --git a/module/zfs/fletcher.c b/module/zfs/fletcher.c new file mode 100644 index 000000000..edda3c9a9 --- /dev/null +++ b/module/zfs/fletcher.c @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/byteorder.h> +#include <sys/spa.h> + +void +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_native(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c new file mode 100644 index 000000000..b257d4af7 --- /dev/null +++ b/module/zfs/gzip.c @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/zmod.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <strings.h> +#endif + +size_t +gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + size_t dstlen = d_len; + + ASSERT(d_len <= s_len); + + if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { + if (d_len != s_len) + return (s_len); + + bcopy(s_start, d_start, s_len); + return (s_len); + } + + return (dstlen); +} + +/*ARGSUSED*/ +int +gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + size_t dstlen = d_len; + + ASSERT(d_len >= s_len); + + if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) + return (-1); + + return (0); +} diff --git a/module/zfs/include/sys/arc.h b/module/zfs/include/sys/arc.h new file mode 100644 index 000000000..749bf53e5 --- /dev/null +++ b/module/zfs/include/sys/arc.h @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ARC_H +#define _SYS_ARC_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/spa.h> + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct arc_buf arc_buf_t; +typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); +typedef int arc_evict_func_t(void *private); + +/* generic arc_done_func_t's which you can use */ +arc_done_func_t arc_bcopy_func; +arc_done_func_t arc_getbuf_func; + +struct arc_buf { + arc_buf_hdr_t *b_hdr; + arc_buf_t *b_next; + krwlock_t b_lock; + void *b_data; + arc_evict_func_t *b_efunc; + void *b_private; +}; + +typedef enum arc_buf_contents { + ARC_BUFC_DATA, /* buffer contains data */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES +} arc_buf_contents_t; +/* + * These are the flags we pass into calls to the arc + */ +#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ +#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ +#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ +#define ARC_CACHED (1 << 4) /* I/O was already in cache */ +#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ + +void arc_space_consume(uint64_t space); +void arc_space_return(uint64_t space); +void *arc_data_buf_alloc(uint64_t space); +void arc_data_buf_free(void *buf, uint64_t space); +arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, + arc_buf_contents_t type); +void arc_buf_add_ref(arc_buf_t *buf, void *tag); +int arc_buf_remove_ref(arc_buf_t *buf, void *tag); +int arc_buf_size(arc_buf_t *buf); +void arc_release(arc_buf_t *buf, void *tag); +int arc_released(arc_buf_t *buf); +int arc_has_callback(arc_buf_t *buf); +void arc_buf_freeze(arc_buf_t *buf); +void arc_buf_thaw(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif + +typedef struct writeprops { + dmu_object_type_t wp_type; + uint8_t wp_level; + uint8_t wp_copies; + uint8_t wp_dncompress, wp_oscompress; + uint8_t wp_dnchecksum, wp_oschecksum; +} writeprops_t; + +void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp); +int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t *arc_flags, const zbookmark_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int zio_flags, const zbookmark_t *zb); +int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); + +void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); +int arc_buf_evict(arc_buf_t *buf); + +void arc_flush(spa_t *spa); +void arc_tempreserve_clear(uint64_t reserve); +int arc_tempreserve_space(uint64_t reserve, uint64_t txg); + +void arc_init(void); +void arc_fini(void); + +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_remove_vdev(vdev_t *vd); +boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_init(void); +void l2arc_fini(void); +void l2arc_start(void); +void l2arc_stop(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_H */ diff --git a/module/zfs/include/sys/bplist.h b/module/zfs/include/sys/bplist.h new file mode 100644 index 000000000..cdb93a6c3 --- /dev/null +++ b/module/zfs/include/sys/bplist.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_BPLIST_H +#define _SYS_BPLIST_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bplist_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpl_entries blkptr_t's, representing + * a total of bpl_bytes physical space. + */ + uint64_t bpl_entries; + uint64_t bpl_bytes; + uint64_t bpl_comp; + uint64_t bpl_uncomp; +} bplist_phys_t; + +#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) + +typedef struct bplist_q { + blkptr_t bpq_blk; + void *bpq_next; +} bplist_q_t; + +typedef struct bplist { + kmutex_t bpl_lock; + objset_t *bpl_mos; + uint64_t bpl_object; + uint8_t bpl_blockshift; + uint8_t bpl_bpshift; + uint8_t bpl_havecomp; + bplist_q_t *bpl_queue; + bplist_phys_t *bpl_phys; + dmu_buf_t *bpl_dbuf; + dmu_buf_t *bpl_cached_dbuf; +} bplist_t; + +extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); +extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); +extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); +extern void bplist_close(bplist_t *bpl); +extern boolean_t bplist_empty(bplist_t *bpl); +extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); +extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); +extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); +extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); +extern int bplist_space(bplist_t *bpl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +extern int bplist_space_birthrange(bplist_t *bpl, + uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPLIST_H */ diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h new file mode 100644 index 000000000..75ce27264 --- /dev/null +++ b/module/zfs/include/sys/dbuf.h @@ -0,0 +1,347 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DBUF_H +#define _SYS_DBUF_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define DB_BONUS_BLKID (-1ULL) +#define IN_DMU_SYNC 2 + +/* + * define flags for dbuf_read + */ + +#define DB_RF_MUST_SUCCEED (1 << 0) +#define DB_RF_CANFAIL (1 << 1) +#define DB_RF_HAVESTRUCT (1 << 2) +#define DB_RF_NOPREFETCH (1 << 3) +#define DB_RF_NEVERWAIT (1 << 4) +#define DB_RF_CACHED (1 << 5) + +/* + * The simplified state transition diagram for dbufs looks like: + * + * +----> READ ----+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * | ^ ^ + * | | | + * +----> FILL ----+ | + * | | + * | | + * +--------> NOFILL -------+ + */ +typedef enum dbuf_states { + DB_UNCACHED, + DB_FILL, + DB_NOFILL, + DB_READ, + DB_CACHED, + DB_EVICTING +} dbuf_states_t; + +struct objset_impl; +struct dnode; +struct dmu_tx; + +/* + * level = 0 means the user data + * level = 1 means the single indirect block + * etc. + */ + +#define LIST_LINK_INACTIVE(link) \ + ((link)->list_next == NULL && (link)->list_prev == NULL) + +struct dmu_buf_impl; + +typedef enum override_states { + DR_NOT_OVERRIDDEN, + DR_IN_DMU_SYNC, + DR_OVERRIDDEN +} override_states_t; + +typedef struct dbuf_dirty_record { + /* link on our parents dirty list */ + list_node_t dr_dirty_node; + + /* transaction group this data will sync in */ + uint64_t dr_txg; + + /* zio of outstanding write IO */ + zio_t *dr_zio; + + /* pointer back to our dbuf */ + struct dmu_buf_impl *dr_dbuf; + + /* pointer to next dirty record */ + struct dbuf_dirty_record *dr_next; + + /* pointer to parent dirty record */ + struct dbuf_dirty_record *dr_parent; + + union dirty_types { + struct dirty_indirect { + + /* protect access to list */ + kmutex_t dr_mtx; + + /* Our list of dirty children */ + list_t dr_children; + } di; + struct dirty_leaf { + + /* + * dr_data is set when we dirty the buffer + * so that we can retain the pointer even if it + * gets COW'd in a subsequent transaction group. + */ + arc_buf_t *dr_data; + blkptr_t dr_overridden_by; + override_states_t dr_override_state; + } dl; + } dt; +} dbuf_dirty_record_t; + +typedef struct dmu_buf_impl { + /* + * The following members are immutable, with the exception of + * db.db_data, which is protected by db_mtx. + */ + + /* the publicly visible structure */ + dmu_buf_t db; + + /* the objset we belong to */ + struct objset_impl *db_objset; + + /* + * the dnode we belong to (NULL when evicted) + */ + struct dnode *db_dnode; + + /* + * our parent buffer; if the dnode points to us directly, + * db_parent == db_dnode->dn_dbuf + * only accessed by sync thread ??? + * (NULL when evicted) + */ + struct dmu_buf_impl *db_parent; + + /* + * link for hash table of all dmu_buf_impl_t's + */ + struct dmu_buf_impl *db_hash_next; + + /* our block number */ + uint64_t db_blkid; + + /* + * Pointer to the blkptr_t which points to us. May be NULL if we + * don't have one yet. (NULL when evicted) + */ + blkptr_t *db_blkptr; + + /* + * Our indirection level. Data buffers have db_level==0. + * Indirect buffers which point to data buffers have + * db_level==1. etc. Buffers which contain dnodes have + * db_level==0, since the dnodes are stored in a file. + */ + uint8_t db_level; + + /* db_mtx protects the members below */ + kmutex_t db_mtx; + + /* + * Current state of the buffer + */ + dbuf_states_t db_state; + + /* + * Refcount accessed by dmu_buf_{hold,rele}. + * If nonzero, the buffer can't be destroyed. + * Protected by db_mtx. + */ + refcount_t db_holds; + + /* buffer holding our data */ + arc_buf_t *db_buf; + + kcondvar_t db_changed; + dbuf_dirty_record_t *db_data_pending; + + /* pointer to most recent dirty record for this buffer */ + dbuf_dirty_record_t *db_last_dirty; + + /* + * Our link on the owner dnodes's dn_dbufs list. + * Protected by its dn_dbufs_mtx. + */ + list_node_t db_link; + + /* Data which is unique to data (leaf) blocks: */ + + /* stuff we store for the user (see dmu_buf_set_user) */ + void *db_user_ptr; + void **db_user_data_ptr_ptr; + dmu_buf_evict_func_t *db_evict_func; + + uint8_t db_immediate_evict; + uint8_t db_freed_in_flight; + + uint8_t db_dirtycnt; +} dmu_buf_impl_t; + +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 256 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +typedef struct dbuf_hash_table { + uint64_t hash_table_mask; + dmu_buf_impl_t **hash_table; + kmutex_t hash_mutexes[DBUF_MUTEXES]; +} dbuf_hash_table_t; + + +uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); + +dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); +void dbuf_create_bonus(struct dnode *dn); + +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); +dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, + void *tag); +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, + void *tag, dmu_buf_impl_t **dbp); + +void dbuf_prefetch(struct dnode *dn, uint64_t blkid); + +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); +uint64_t dbuf_refcount(dmu_buf_impl_t *db); + +void dbuf_rele(dmu_buf_impl_t *db, void *tag); + +dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); + +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); + +void dbuf_clear(dmu_buf_impl_t *db); +void dbuf_evict(dmu_buf_impl_t *db); + +void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_unoverride(dbuf_dirty_record_t *dr); +void dbuf_sync_list(list_t *list, dmu_tx_t *tx); + +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, + struct dmu_tx *); + +void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); + +void dbuf_init(void); +void dbuf_fini(void); + +#define DBUF_IS_METADATA(db) \ + ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) + +#define DBUF_GET_BUFC_TYPE(db) \ + (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) + +#define DBUF_IS_CACHEABLE(db) \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + +#define DBUF_IS_L2CACHEABLE(db) \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but gcc does not + * support that preprocessor token. + */ +#define dprintf_dbuf(dbuf, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dbuf)->db.db_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj); \ + dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ + "obj=%s lvl=%u blkid=%lld " fmt, \ + __db_buf, (dbuf)->db_level, \ + (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DBUF_VERIFY(db) dbuf_verify(db) + +#else + +#define dprintf_dbuf(db, fmt, ...) +#define dprintf_dbuf_bp(db, bp, fmt, ...) +#define DBUF_VERIFY(db) + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DBUF_H */ diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h new file mode 100644 index 000000000..3b1e5c8fb --- /dev/null +++ b/module/zfs/include/sys/dmu.h @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_H +#define _SYS_DMU_H + +/* + * This file describes the interface that the DMU provides for its + * consumers. + * + * The DMU also interacts with the SPA. That interface is described in + * dmu_spa.h. + */ + +#include <sys/inttypes.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cred.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct uio; +struct page; +struct vnode; +struct spa; +struct zilog; +struct zio; +struct blkptr; +struct zap_cursor; +struct dsl_dataset; +struct dsl_pool; +struct dnode; +struct drr_begin; +struct drr_end; +struct zbookmark; +struct spa; +struct nvlist; +struct objset_impl; + +typedef struct objset objset_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dsl_dir dsl_dir_t; + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPLIST, /* UINT64 */ + DMU_OT_BPLIST_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_OLDACL, /* Old ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_NUMTYPES +} dmu_object_type_t; + +typedef enum dmu_objset_type { + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! */ + DMU_OST_NUMTYPES +} dmu_objset_type_t; + +void byteswap_uint64_array(void *buf, size_t size); +void byteswap_uint32_array(void *buf, size_t size); +void byteswap_uint16_array(void *buf, size_t size); +void byteswap_uint8_array(void *buf, size_t size); +void zap_byteswap(void *buf, size_t size); +void zfs_oldacl_byteswap(void *buf, size_t size); +void zfs_acl_byteswap(void *buf, size_t size); +void zfs_znode_byteswap(void *buf, size_t size); + +#define DS_MODE_NOHOLD 0 /* internal use only */ +#define DS_MODE_USER 1 /* simple access, no special needs */ +#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ +#define DS_MODE_TYPE_MASK 0x3 +#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) +#define DS_MODE_READONLY 0x8 +#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) +#define DS_MODE_INCONSISTENT 0x10 +#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) + +#define DS_FIND_SNAPSHOTS (1<<0) +#define DS_FIND_CHILDREN (1<<1) + +/* + * The maximum number of bytes that can be accessed as part of one + * operation, including metadata. + */ +#define DMU_MAX_ACCESS (10<<20) /* 10MB */ +#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ + +/* + * Public routines to create, destroy, open, and close objsets. + */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_evict_dbufs(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_snapshots_destroy(char *fsname, char *snapname); +int dmu_objset_rollback(objset_t *os); +int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_rename(const char *name, const char *newname, + boolean_t recursive); +int dmu_objset_find(char *name, int func(char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); + +typedef struct dmu_buf { + uint64_t db_object; /* object that this buffer is part of */ + uint64_t db_offset; /* byte offset in this object */ + uint64_t db_size; /* size of buffer in bytes */ + void *db_data; /* data in buffer */ +} dmu_buf_t; + +typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); + +/* + * The names of zap entries in the DIRECTORY_OBJECT of the MOS. + */ +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" +#define DMU_POOL_SPARES "spares" +#define DMU_POOL_DEFLATE "deflate" +#define DMU_POOL_HISTORY "history" +#define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" + +/* 4x8 zbookmark_t */ +#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ +#define DMU_POOL_SCRUB_QUEUE "scrub_queue" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" +/* 1x4 enum scrub_func */ +#define DMU_POOL_SCRUB_FUNC "scrub_func" +/* 1x8 count */ +#define DMU_POOL_SCRUB_ERRORS "scrub_errors" + +/* + * Allocate an object from this objset. The range of object numbers + * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. + * + * The transaction must be assigned to a txg. The newly allocated + * object will be "held" in the transaction (ie. you can modify the + * newly allocated object in this transaction). + * + * dmu_object_alloc() chooses an object and returns it in *objectp. + * + * dmu_object_claim() allocates a specific object number. If that + * number is already allocated, it fails and returns EEXIST. + * + * Return 0 on success, or ENOSPC or EEXIST as specified above. + */ +uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Free an object from this objset. + * + * The object's data will be freed as well (ie. you don't need to call + * dmu_free(object, 0, -1, tx)). + * + * The object need not be held in the transaction. + * + * If there are any holds on this object's buffers (via dmu_buf_hold()), + * or tx holds on the object (via dmu_tx_hold_object()), you can not + * free it; it fails and returns EBUSY. + * + * If the object is not allocated, it fails and returns ENOENT. + * + * Return 0 on success, or EBUSY or ENOENT as specified above. + */ +int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); + +/* + * Find the next allocated or free object. + * + * The objectp parameter is in-out. It will be updated to be the next + * object which is allocated. Ignore objects which have not been + * modified since txg. + * + * XXX Can only be called on a objset with no dirty data. + * + * Returns 0 on success, or ENOENT if there are no more objects. + */ +int dmu_object_next(objset_t *os, uint64_t *objectp, + boolean_t hole, uint64_t txg); + +/* + * Set the data blocksize for an object. + * + * The object cannot have any blocks allcated beyond the first. If + * the first block is allocated already, the new size must be greater + * than the current block size. If these conditions are not met, + * ENOTSUP will be returned. + * + * Returns 0 on success, or EBUSY if there are any holds on the object + * contents, or ENOTSUP as described above. + */ +int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, + int ibs, dmu_tx_t *tx); + +/* + * Set the checksum property on a dnode. The new checksum algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx); + +/* + * Set the compress property on a dnode. The new compression algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx); + +/* + * Decide how many copies of a given block we should make. Can be from + * 1 to SPA_DVAS_PER_BP. + */ +int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, + dmu_object_type_t ot); +/* + * The bonus data is accessed more or less like a regular buffer. + * You must dmu_bonus_hold() to get the buffer, which will give you a + * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus + * data. As with any normal buffer, you must call dmu_buf_read() to + * read db_data, dmu_buf_will_dirty() before modifying it, and the + * object must be held in an assigned transaction before calling + * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus + * buffer as well. You must release your hold with dmu_buf_rele(). + */ +int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); +int dmu_bonus_max(void); +int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); + +/* + * Obtain the DMU buffer from the specified object which contains the + * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so + * that it will remain in memory. You must release the hold with + * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your + * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. + * + * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill + * on the returned buffer before reading or writing the buffer's + * db_data. The comments for those routines describe what particular + * operations are valid after calling them. + * + * The object number must be a valid, allocated object number. + */ +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **); +void dmu_buf_add_ref(dmu_buf_t *db, void* tag); +void dmu_buf_rele(dmu_buf_t *db, void *tag); +uint64_t dmu_buf_refcount(dmu_buf_t *db); + +/* + * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a + * range of an object. A pointer to an array of dmu_buf_t*'s is + * returned (in *dbpp). + * + * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and + * frees the array. The hold on the array of buffers MUST be released + * with dmu_buf_rele_array. You can NOT release the hold on each buffer + * individually with dmu_buf_rele. + */ +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); + +/* + * Returns NULL on success, or the existing user ptr if it's already + * been set. + * + * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * + * user_data_ptr_ptr should be NULL, or a pointer to a pointer which + * will be set to db->db_data when you are allowed to access it. Note + * that db->db_data (the pointer) can change when you do dmu_buf_read(), + * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). + * *user_data_ptr_ptr will be set to the new value when it changes. + * + * If non-NULL, pageout func will be called when this buffer is being + * excised from the cache, so that you can clean up the data structure + * pointed to by user_ptr. + * + * dmu_evict_user() will call the pageout func for all buffers in a + * objset with a given pageout func. + */ +void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +/* + * set_user_ie is the same as set_user, but request immediate eviction + * when hold count goes to zero. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, + void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); + +/* + * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + */ +void *dmu_buf_get_user(dmu_buf_t *db); + +/* + * Indicate that you are going to modify the buffer's data (db_data). + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx + * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). + */ +void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); + +/* + * You must create a transaction, then hold the objects which you will + * (or might) modify as part of this transaction. Then you must assign + * the transaction to a transaction group. Once the transaction has + * been assigned, you can modify buffers which belong to held objects as + * part of this transaction. You can't modify buffers before the + * transaction has been assigned; you can't modify buffers which don't + * belong to objects which this transaction holds; you can't hold + * objects once the transaction has been assigned. You may hold an + * object which you are going to free (with dmu_object_free()), but you + * don't have to. + * + * You can abort the transaction before it has been assigned. + * + * Note that you may hold buffers (with dmu_buf_hold) at any time, + * regardless of transaction state. + */ + +#define DMU_NEW_OBJECT (-1ULL) +#define DMU_OBJECT_END (-1ULL) + +dmu_tx_t *dmu_tx_create(objset_t *os); +void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, + uint64_t len); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_abort(dmu_tx_t *tx); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_commit(dmu_tx_t *tx); + +/* + * Free up the data blocks for a defined range of a file. If size is + * zero, the range from offset to end-of-file is freed. + */ +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_object(objset_t *os, uint64_t object); + +/* + * Convenience functions. + * + * Canfail routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. + */ +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf); +void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct page *pp, dmu_tx_t *tx); + +extern int zfs_prefetch_disable; + +/* + * Asynchronously try to read in the data. + */ +void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, + uint64_t len); + +typedef struct dmu_object_info { + /* All sizes are in bytes. */ + uint32_t doi_data_block_size; + uint32_t doi_metadata_block_size; + uint64_t doi_bonus_size; + dmu_object_type_t doi_type; + dmu_object_type_t doi_bonus_type; + uint8_t doi_indirection; /* 2 = dnode->indirect->data */ + uint8_t doi_checksum; + uint8_t doi_compress; + uint8_t doi_pad[5]; + /* Values below are number of 512-byte blocks. */ + uint64_t doi_physical_blks; /* data + metadata */ + uint64_t doi_max_block_offset; +} dmu_object_info_t; + +typedef void arc_byteswap_func_t(void *buf, size_t size); + +typedef struct dmu_object_type_info { + arc_byteswap_func_t *ot_byteswap; + boolean_t ot_metadata; + char *ot_name; +} dmu_object_type_info_t; + +extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; + +/* + * Get information on a DMU object. + * + * Return 0 on success or ENOENT if object is not allocated. + * + * If doi is NULL, just indicates whether the object exists. + */ +int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, + u_longlong_t *nblk512); + +typedef struct dmu_objset_stats { + uint64_t dds_num_clones; /* number of clones of this */ + uint64_t dds_creation_txg; + uint64_t dds_guid; + dmu_objset_type_t dds_type; + uint8_t dds_is_snapshot; + uint8_t dds_inconsistent; + char dds_origin[MAXNAMELEN]; +} dmu_objset_stats_t; + +/* + * Get stats on a dataset. + */ +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); + +/* + * Add entries to the nvlist for all the objset's properties. See + * zfs_prop_table[] and zfs(1m) for details on the properties. + */ +void dmu_objset_stats(objset_t *os, struct nvlist *nv); + +/* + * Get the space usage statistics for statvfs(). + * + * refdbytes is the amount of space "referenced" by this objset. + * availbytes is the amount of space available to this objset, taking + * into account quotas & reservations, assuming that no other objsets + * use the space first. These values correspond to the 'referenced' and + * 'available' properties, described in the zfs(1m) manpage. + * + * usedobjs and availobjs are the number of objects currently allocated, + * and available. + */ +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); + +/* + * The fsid_guid is a 56-bit ID that can change to avoid collisions. + * (Contrast with the ds_guid which is a 64-bit ID that will never + * change, so there is a small probability that it will collide.) + */ +uint64_t dmu_objset_fsid_guid(objset_t *os); + +int dmu_objset_is_snapshot(objset_t *os); + +extern struct spa *dmu_objset_spa(objset_t *os); +extern struct zilog *dmu_objset_zil(objset_t *os); +extern struct dsl_pool *dmu_objset_pool(objset_t *os); +extern struct dsl_dataset *dmu_objset_ds(objset_t *os); +extern void dmu_objset_name(objset_t *os, char *buf); +extern dmu_objset_type_t dmu_objset_type(objset_t *os); +extern uint64_t dmu_objset_id(objset_t *os); +extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp, boolean_t *case_conflict); +extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, + int maxlen, boolean_t *conflict); +extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp); +extern void dmu_objset_set_user(objset_t *os, void *user_ptr); +extern void *dmu_objset_get_user(objset_t *os); + +/* + * Return the txg number for the given assigned transaction. + */ +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); + +/* + * Synchronous write. + * If a parent zio is provided this function initiates a write on the + * provided buffer as a child of the parent zio. + * In the absence of a parent zio, the write is completed synchronously. + * At write completion, blk is filled with the bp of the written block. + * Note that while the data covered by this function will be on stable + * storage when the write completes this new data does not become a + * permanent part of the file until the associated transaction commits. + */ +typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); +int dmu_sync(struct zio *zio, dmu_buf_t *db, + struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * Find the next hole or data block in file starting at *off + * Return found offset in *off. Return ESRCH for end of file. + */ +int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, + uint64_t *off); + +/* + * Initial setup and final teardown. + */ +extern void dmu_init(void); +extern void dmu_fini(void); + +typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, + uint64_t object, uint64_t offset, int len); +void dmu_traverse_objset(objset_t *os, uint64_t txg_start, + dmu_traverse_cb_t cb, void *arg); + +int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + struct vnode *vp, offset_t *off); + +typedef struct dmu_recv_cookie { + /* + * This structure is opaque! + * + * If logical and real are different, we are recving the stream + * into the "real" temporary clone, and then switching it with + * the "logical" target. + */ + struct dsl_dataset *drc_logical_ds; + struct dsl_dataset *drc_real_ds; + struct drr_begin *drc_drrb; + char *drc_tosnap; + boolean_t drc_newfs; + boolean_t drc_force; +} dmu_recv_cookie_t; + +int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); +int dmu_recv_end(dmu_recv_cookie_t *drc); +void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ +extern uint64_t zfs_crc64_table[256]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_H */ diff --git a/module/zfs/include/sys/dmu_impl.h b/module/zfs/include/sys/dmu_impl.h new file mode 100644 index 000000000..96ce688e1 --- /dev/null +++ b/module/zfs/include/sys/dmu_impl.h @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_IMPL_H +#define _SYS_DMU_IMPL_H + +#include <sys/txg_impl.h> +#include <sys/zio.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the locking strategy for the DMU. Numbers in parenthesis are + * cases that use that lock order, referenced below: + * + * ARC is self-contained + * bplist is self-contained + * refcount is self-contained + * txg is self-contained (hopefully!) + * zst_lock + * zf_rwlock + * + * XXX try to improve evicting path? + * + * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > + * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs + * + * dp_config_rwlock + * must be held before: everything + * protects dd namespace changes + * protects property changes globally + * held from: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: + * + * os_obj_lock + * must be held before: + * everything except dp_config_rwlock + * protects os_obj_next + * held from: + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * + * dn_struct_rwlock + * must be held before: + * everything except dp_config_rwlock and os_obj_lock + * protects structure of dnode (eg. nlevels) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) + * held from: + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dmu_tx_count_free: + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx + * dbuf_findbp: (callers, phys? - the real need) + * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) + * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx + * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() + * dnode_sync/w (increase_indirection): db_mtx (phys) + * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) + * dnode_new_blkid/w: (dn_maxblkid) + * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) + * dnode_next_offset: (phys) + * + * dn_dbufs_mtx + * must be held before: + * db_mtx, hash_mutexes + * protects: + * dn_dbufs + * dn_evicted + * held from: + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) + * + * hash_mutexes (global) + * must be held before: + * db_mtx + * protects dbuf_hash_table (global) and db_hash_next + * held from: + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx + * + * db_mtx (meta-leaf) + * must be held before: + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * protects: + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) + * db_dirtycnt + * db_d.* + * db.* + * held from: + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) + * + * dn_mtx (leaf) + * protects: + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dd_assigned_tx + * dn_notxholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) + * held from: + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none + * + * dd_lock + * must be held before: + * ds_lock + * ancestors' dd_lock + * protects: + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? + * held from: + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) + * dsl_dataset_block_freeable: none (dd_sync_*) + * + * os_lock (leaf) + * protects: + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link + * held from: + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * + * ds_lock + * protects: + * ds_user_ptr + * ds_user_evice_func + * ds_open_refcount + * ds_snapname + * ds_phys accounting + * ds_reserved + * held from: + * dsl_dataset_* + * + * dr_mtx (leaf) + * protects: + * dr_children + * held from: + * dbuf_dirty + * dbuf_undirty + * dbuf_sync_indirect + * dnode_new_blkid + */ + +struct objset; +struct dmu_pool; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_IMPL_H */ diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h new file mode 100644 index 000000000..15df29a17 --- /dev/null +++ b/module/zfs/include/sys/dmu_objset.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_OBJSET_H +#define _SYS_DMU_OBJSET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/arc.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/zio.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dmu_tx; +struct objset_impl; + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - + sizeof (uint64_t)]; +} objset_phys_t; + +struct objset { + struct objset_impl *os; + int os_mode; +}; + +typedef struct objset_impl { + /* Immutable: */ + struct dsl_dataset *os_dsl_dataset; + spa_t *os_spa; + arc_buf_t *os_phys_buf; + objset_phys_t *os_phys; + dnode_t *os_meta_dnode; + zilog_t *os_zil; + objset_t os; + uint8_t os_checksum; /* can change, under dsl_dir's locks */ + uint8_t os_compress; /* can change, under dsl_dir's locks */ + uint8_t os_copies; /* can change, under dsl_dir's locks */ + uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ + uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ + + /* no lock needed: */ + struct dmu_tx *os_synctx; /* XXX sketchy */ + blkptr_t *os_rootbp; + zil_header_t os_zil_header; + + /* Protected by os_obj_lock */ + kmutex_t os_obj_lock; + uint64_t os_obj_next; + + /* Protected by os_lock */ + kmutex_t os_lock; + list_t os_dirty_dnodes[TXG_SIZE]; + list_t os_free_dnodes[TXG_SIZE]; + list_t os_dnodes; + list_t os_downgraded_dbufs; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; +} objset_impl_t; + +#define DMU_META_DNODE_OBJECT 0 + +#define DMU_OS_IS_L2CACHEABLE(os) \ + ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ + (os)->os_secondary_cache == ZFS_CACHE_METADATA) + +/* called from zpl */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_objset_rollback(objset_t *os); +int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +void dmu_objset_stats(objset_t *os, nvlist_t *nv); +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dmu_objset_fsid_guid(objset_t *os); +int dmu_objset_find(char *name, int func(char *, void *), void *arg, + int flags); +int dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); +void dmu_objset_byteswap(void *buf, size_t size); +int dmu_objset_evict_dbufs(objset_t *os); + +/* called from dsl */ +void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); +objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_impl_t **osip); +void dmu_objset_evict(struct dsl_dataset *ds, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_OBJSET_H */ diff --git a/module/zfs/include/sys/dmu_traverse.h b/module/zfs/include/sys/dmu_traverse.h new file mode 100644 index 000000000..3e0268911 --- /dev/null +++ b/module/zfs/include/sys/dmu_traverse.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TRAVERSE_H +#define _SYS_DMU_TRAVERSE_H + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dnode_phys; +struct dsl_dataset; + +typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, + const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); + +#define TRAVERSE_PRE (1<<0) +#define TRAVERSE_POST (1<<1) +#define TRAVERSE_PREFETCH_METADATA (1<<2) +#define TRAVERSE_PREFETCH_DATA (1<<3) +#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) + +int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/module/zfs/include/sys/dmu_tx.h b/module/zfs/include/sys/dmu_tx.h new file mode 100644 index 000000000..2727daaaa --- /dev/null +++ b/module/zfs/include/sys/dmu_tx.h @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TX_H +#define _SYS_DMU_TX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/dmu.h> +#include <sys/txg.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf_impl; +struct dmu_tx_hold; +struct dnode_link; +struct dsl_pool; +struct dnode; +struct dsl_dir; + +struct dmu_tx { + /* + * No synchronization is needed because a tx can only be handled + * by one thread. + */ + list_t tx_holds; /* list of dmu_tx_hold_t */ + objset_t *tx_objset; + struct dsl_dir *tx_dir; + struct dsl_pool *tx_pool; + uint64_t tx_txg; + uint64_t tx_lastsnap_txg; + uint64_t tx_lasttried_txg; + txg_handle_t tx_txgh; + void *tx_tempreserve_cookie; + struct dmu_tx_hold *tx_needassign_txh; + uint8_t tx_anyobj; + int tx_err; +#ifdef ZFS_DEBUG + uint64_t tx_space_towrite; + uint64_t tx_space_tofree; + uint64_t tx_space_tooverwrite; + uint64_t tx_space_tounref; + refcount_t tx_space_written; + refcount_t tx_space_freed; +#endif +}; + +enum dmu_tx_hold_type { + THT_NEWOBJECT, + THT_WRITE, + THT_BONUS, + THT_FREE, + THT_ZAP, + THT_SPACE, + THT_NUMTYPES +}; + +typedef struct dmu_tx_hold { + dmu_tx_t *txh_tx; + list_node_t txh_node; + struct dnode *txh_dnode; + uint64_t txh_space_towrite; + uint64_t txh_space_tofree; + uint64_t txh_space_tooverwrite; + uint64_t txh_space_tounref; + uint64_t txh_memory_tohold; + uint64_t txh_fudge; +#ifdef ZFS_DEBUG + enum dmu_tx_hold_type txh_type; + uint64_t txh_arg1; + uint64_t txh_arg2; +#endif +} dmu_tx_hold_t; + + +/* + * These routines are defined in dmu.h, and are called by the user. + */ +dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_abort(dmu_tx_t *tx); +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); +void dmu_tx_wait(dmu_tx_t *tx); + +/* + * These routines are defined in dmu_spa.h, and are called by the SPA. + */ +extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); + +/* + * These routines are only called by the DMU. + */ +dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); +int dmu_tx_is_syncing(dmu_tx_t *tx); +int dmu_tx_private_ok(dmu_tx_t *tx); +void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); +void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); +void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); + +#ifdef ZFS_DEBUG +#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) +#else +#define DMU_TX_DIRTY_BUF(tx, db) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TX_H */ diff --git a/module/zfs/include/sys/dmu_zfetch.h b/module/zfs/include/sys/dmu_zfetch.h new file mode 100644 index 000000000..c94bced93 --- /dev/null +++ b/module/zfs/include/sys/dmu_zfetch.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DFETCH_H +#define _DFETCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint64_t zfetch_array_rd_sz; + +struct dnode; /* so we can reference dnode */ + +typedef enum zfetch_dirn { + ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ + ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ +} zfetch_dirn_t; + +typedef struct zstream { + uint64_t zst_offset; /* offset of starting block in range */ + uint64_t zst_len; /* length of range, in blocks */ + zfetch_dirn_t zst_direction; /* direction of prefetch */ + uint64_t zst_stride; /* length of stride, in blocks */ + uint64_t zst_ph_offset; /* prefetch offset, in blocks */ + uint64_t zst_cap; /* prefetch limit (cap), in blocks */ + kmutex_t zst_lock; /* protects stream */ + clock_t zst_last; /* lbolt of last prefetch */ + avl_node_t zst_node; /* embed avl node here */ +} zstream_t; + +typedef struct zfetch { + krwlock_t zf_rwlock; /* protects zfetch structure */ + list_t zf_stream; /* AVL tree of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + uint32_t zf_stream_cnt; /* # of active streams */ + uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ +} zfetch_t; + +void dmu_zfetch_init(zfetch_t *, struct dnode *); +void dmu_zfetch_rele(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); + + +#ifdef __cplusplus +} +#endif + +#endif /* _DFETCH_H */ diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h new file mode 100644 index 000000000..c79ff48a6 --- /dev/null +++ b/module/zfs/include/sys/dnode.h @@ -0,0 +1,275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DNODE_H +#define _SYS_DNODE_H + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/refcount.h> +#include <sys/dmu_zfetch.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * dnode_hold() flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 10 /* 1k */ +#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * Derived constants. + */ +#define DNODE_SIZE (1 << DNODE_SHIFT) +#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +struct dmu_buf_impl; +struct objset_impl; +struct zio; + +enum dnode_dirtycontext { + DN_UNDIRTIED, + DN_DIRTY_OPEN, + DN_DIRTY_SYNC +}; + +/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1<<0) + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_pad2[4]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + uint64_t dn_pad3[4]; + + blkptr_t dn_blkptr[1]; + uint8_t dn_bonus[DN_MAX_BONUSLEN]; +} dnode_phys_t; + +typedef struct dnode { + /* + * dn_struct_rwlock protects the structure of the dnode, + * including the number of levels of indirection (dn_nlevels), + * dn_maxblkid, and dn_next_* + */ + krwlock_t dn_struct_rwlock; + + /* + * Our link on dataset's dd_dnodes list. + * Protected by dd_accounting_mtx. + */ + list_node_t dn_link; + + /* immutable: */ + struct objset_impl *dn_objset; + uint64_t dn_object; + struct dmu_buf_impl *dn_dbuf; + dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ + + /* + * Copies of stuff in dn_phys. They're valid in the open + * context (eg. even before the dnode is first synced). + * Where necessary, these are protected by dn_struct_rwlock. + */ + dmu_object_type_t dn_type; /* object type */ + uint16_t dn_bonuslen; /* bonus length */ + uint8_t dn_bonustype; /* bonus type */ + uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_nlevels; + uint8_t dn_indblkshift; + uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint16_t dn_datablkszsec; /* in 512b sectors */ + uint32_t dn_datablksz; /* in bytes */ + uint64_t dn_maxblkid; + uint8_t dn_next_nlevels[TXG_SIZE]; + uint8_t dn_next_indblkshift[TXG_SIZE]; + uint16_t dn_next_bonuslen[TXG_SIZE]; + uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + + /* protected by os_lock: */ + list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ + + /* protected by dn_mtx: */ + kmutex_t dn_mtx; + list_t dn_dirty_records[TXG_SIZE]; + avl_tree_t dn_ranges[TXG_SIZE]; + uint64_t dn_allocated_txg; + uint64_t dn_free_txg; + uint64_t dn_assigned_txg; + kcondvar_t dn_notxholds; + enum dnode_dirtycontext dn_dirtyctx; + uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + + /* protected by own devices */ + refcount_t dn_tx_holds; + refcount_t dn_holds; + + kmutex_t dn_dbufs_mtx; + list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + + /* parent IO for current sync write */ + zio_t *dn_zio; + + /* holds prefetch structure */ + struct zfetch dn_zfetch; +} dnode_t; + +typedef struct free_range { + avl_node_t fr_node; + uint64_t fr_blkid; + uint64_t fr_nblks; +} free_range_t; + +dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, + uint64_t object); +void dnode_special_close(dnode_t *dn); + +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); +int dnode_hold(struct objset_impl *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, + void *ref, dnode_t **dnp); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); +void dnode_rele(dnode_t *dn, void *ref); +void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_sync(dnode_t *dn, dmu_tx_t *tx); +void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_free(dnode_t *dn, dmu_tx_t *tx); +void dnode_byteswap(dnode_phys_t *dnp); +void dnode_buf_byteswap(void *buf, size_t size); +void dnode_verify(dnode_t *dn); +int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); +uint64_t dnode_current_max_length(dnode_t *dn); +void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); +void dnode_clear_range(dnode_t *dn, uint64_t blkid, + uint64_t nblks, dmu_tx_t *tx); +void dnode_diduse_space(dnode_t *dn, int64_t space); +void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); +uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); +void dnode_init(void); +void dnode_fini(void); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); +void dnode_evict_dbufs(dnode_t *dn); + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but that piece of shit + * gcc doesn't support that preprocessor token. + */ +#define dprintf_dnode(dn, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dn)->dn_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj);\ + dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ + __db_buf, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DNODE_VERIFY(dn) dnode_verify(dn) +#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) + +#else + +#define dprintf_dnode(db, fmt, ...) +#define DNODE_VERIFY(dn) +#define FREE_VERIFY(db, start, end, tx) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNODE_H */ diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h new file mode 100644 index 000000000..8665aec2d --- /dev/null +++ b/module/zfs/include/sys/dsl_dataset.h @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DATASET_H +#define _SYS_DSL_DATASET_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/bplist.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; +struct dsl_pool; + +typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); + +#define DS_FLAG_INCONSISTENT (1ULL<<0) +#define DS_IS_INCONSISTENT(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) +/* + * NB: nopromote can not yet be set, but we want support for it in this + * on-disk version, so that we don't need to upgrade for it later. It + * will be needed when we implement 'zfs split' (where the split off + * clone should not be promoted). + */ +#define DS_FLAG_NOPROMOTE (1ULL<<1) + +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). + */ +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ +#define DS_FLAG_CI_DATASET (1ULL<<16) + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ + uint64_t ds_used_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relevant to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t ds_fsid_guid; + uint64_t ds_guid; + uint64_t ds_flags; /* DS_FLAG_* */ + blkptr_t ds_bp; + uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ + uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ + uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ +} dsl_dataset_phys_t; + +typedef struct dsl_dataset { + /* Immutable: */ + struct dsl_dir *ds_dir; + dsl_dataset_phys_t *ds_phys; + dmu_buf_t *ds_dbuf; + uint64_t ds_object; + uint64_t ds_fsid_guid; + + /* only used in syncing context, only valid for non-snapshots: */ + struct dsl_dataset *ds_prev; + uint64_t ds_origin_txg; + + /* has internal locking: */ + bplist_t ds_deadlist; + + /* protected by lock on pool's dp_dirty_datasets list */ + txg_node_t ds_dirty_link; + list_node_t ds_synced_link; + + /* + * ds_phys->ds_<accounting> is also protected by ds_lock. + * Protected by ds_lock: + */ + kmutex_t ds_lock; + void *ds_user_ptr; + dsl_dataset_evict_func_t *ds_user_evict_func; + + /* + * ds_owner is protected by the ds_rwlock and the ds_lock + */ + krwlock_t ds_rwlock; + kcondvar_t ds_exclusive_cv; + void *ds_owner; + + /* no locking; only for making guesses */ + uint64_t ds_trysnap_txg; + + /* for objset_open() */ + kmutex_t ds_opening_lock; + + uint64_t ds_reserved; /* cached refreservation */ + uint64_t ds_quota; /* cached refquota */ + + /* Protected by ds_lock; keep at end of struct for better locality */ + char ds_snapname[MAXNAMELEN]; +} dsl_dataset_t; + +#define dsl_dataset_is_snapshot(ds) \ + ((ds)->ds_phys->ds_num_children != 0) + +#define DS_UNIQUE_IS_ACCURATE(ds) \ + (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + +int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); +int dsl_dataset_own(const char *name, int flags, void *owner, + dsl_dataset_t **dsp); +int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, + int flags, void *owner, dsl_dataset_t **); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, + void *owner); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); +uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); +int dsl_snapshots_destroy(char *fsname, char *snapname); +dsl_checkfunc_t dsl_dataset_destroy_check; +dsl_syncfunc_t dsl_dataset_destroy_sync; +dsl_checkfunc_t dsl_dataset_snapshot_check; +dsl_syncfunc_t dsl_dataset_snapshot_sync; +int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); +int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); +int dsl_dataset_promote(const char *name); +int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force); + +void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func); +void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); + +blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); +void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); + +spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); + +boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); + +void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); + +void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx); +int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); + +void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); +void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); +void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); +void dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, + uint64_t *ref_rsrv); +int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); +void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); +int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, + dmu_tx_t *tx); + +#ifdef ZFS_DEBUG +#define dprintf_ds(ds, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ + dsl_dataset_name(ds, __ds_name); \ + dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_ds(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DATASET_H */ diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h new file mode 100644 index 000000000..a29e44e67 --- /dev/null +++ b/module/zfs/include/sys/dsl_deleg.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DELEG_H +#define _SYS_DSL_DELEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_PERM_NONE "" +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" + +/* + * Note: the names of properties that are marked delegatable are also + * valid delegated permissions + */ + +int dsl_deleg_get(const char *ddname, nvlist_t **nvp); +int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); +int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); +int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); +boolean_t dsl_delegation_on(objset_t *os); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DELEG_H */ diff --git a/module/zfs/include/sys/dsl_dir.h b/module/zfs/include/sys/dsl_dir.h new file mode 100644 index 000000000..86b9636ce --- /dev/null +++ b/module/zfs/include/sys/dsl_dir.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DIR_H +#define _SYS_DSL_DIR_H + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/refcount.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_origin_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +struct dsl_dir { + /* These are immutable; no lock needed: */ + uint64_t dd_object; + dsl_dir_phys_t *dd_phys; + dmu_buf_t *dd_dbuf; + dsl_pool_t *dd_pool; + + /* protected by lock on pool's dp_dirty_dirs list */ + txg_node_t dd_dirty_link; + + /* protected by dp_config_rwlock */ + dsl_dir_t *dd_parent; + + /* Protected by dd_lock */ + kmutex_t dd_lock; + list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + + /* gross estimate of space used by in-flight tx's */ + uint64_t dd_tempreserved[TXG_SIZE]; + /* amount of space we expect to write; == amount of dirty data */ + int64_t dd_space_towrite[TXG_SIZE]; + + /* protected by dd_lock; keep at end of struct for better locality */ + char dd_myname[MAXNAMELEN]; +}; + +void dsl_dir_close(dsl_dir_t *dd, void *tag); +int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); +int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, + const char **tailp); +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); +void dsl_dir_name(dsl_dir_t *dd, char *buf); +int dsl_dir_namelen(dsl_dir_t *dd); +int dsl_dir_is_private(dsl_dir_t *dd); +uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, + const char *name, dmu_tx_t *tx); +dsl_checkfunc_t dsl_dir_destroy_check; +dsl_syncfunc_t dsl_dir_destroy_sync; +void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); +uint64_t dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly); +void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, + uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, + dmu_tx_t *tx); +void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); +void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); +int dsl_dir_set_quota(const char *ddname, uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_rename(dsl_dir_t *dd, const char *newname); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); +boolean_t dsl_dir_is_clone(dsl_dir_t *dd); +void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, + uint64_t reservation, cred_t *cr, dmu_tx_t *tx); + +/* internal reserved dir name */ +#define MOS_DIR_NAME "$MOS" +#define ORIGIN_DIR_NAME "$ORIGIN" + +#ifdef ZFS_DEBUG +#define dprintf_dd(dd, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \ + KM_SLEEP); \ + dsl_dir_name(dd, __ds_name); \ + dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_dd(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DIR_H */ diff --git a/module/zfs/include/sys/dsl_pool.h b/module/zfs/include/sys/dsl_pool.h new file mode 100644 index 000000000..3bb4ad4ef --- /dev/null +++ b/module/zfs/include/sys/dsl_pool.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_POOL_H +#define _SYS_DSL_POOL_H + +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/txg_impl.h> +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/dnode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +enum scrub_func { + SCRUB_FUNC_NONE, + SCRUB_FUNC_CLEAN, + SCRUB_FUNC_NUMFUNCS +}; + +/* These macros are for indexing into the zfs_all_blkstats_t. */ +#define DMU_OT_DEFERRED DMU_OT_NONE +#define DMU_OT_TOTAL DMU_OT_NUMTYPES + +typedef struct zfs_blkstat { + uint64_t zb_count; + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_gangs; + uint64_t zb_ditto_2_of_2_samevdev; + uint64_t zb_ditto_2_of_3_samevdev; + uint64_t zb_ditto_3_of_3_samevdev; +} zfs_blkstat_t; + +typedef struct zfs_all_blkstats { + zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; +} zfs_all_blkstats_t; + + +typedef struct dsl_pool { + /* Immutable */ + spa_t *dp_spa; + struct objset *dp_meta_objset; + struct dsl_dir *dp_root_dir; + struct dsl_dir *dp_mos_dir; + struct dsl_dataset *dp_origin_snap; + uint64_t dp_root_dir_obj; + + /* No lock needed - sync context only */ + blkptr_t dp_meta_rootbp; + list_t dp_synced_datasets; + hrtime_t dp_read_overhead; + uint64_t dp_throughput; + uint64_t dp_write_limit; + + /* Uses dp_lock */ + kmutex_t dp_lock; + uint64_t dp_space_towrite[TXG_SIZE]; + uint64_t dp_tempreserved[TXG_SIZE]; + + enum scrub_func dp_scrub_func; + uint64_t dp_scrub_queue_obj; + uint64_t dp_scrub_min_txg; + uint64_t dp_scrub_max_txg; + zbookmark_t dp_scrub_bookmark; + boolean_t dp_scrub_pausing; + boolean_t dp_scrub_isresilver; + uint64_t dp_scrub_start_time; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ + boolean_t dp_scrub_restart; + + /* Has its own locking */ + tx_state_t dp_tx; + txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_dirs; + txg_list_t dp_sync_tasks; + + /* + * Protects administrative changes (properties, namespace) + * It is only held for write in syncing context. Therefore + * syncing context does not need to ever have it for read, since + * nobody else could possibly have it for write. + */ + krwlock_t dp_config_rwlock; + + zfs_all_blkstats_t *dp_blkstats; +} dsl_pool_t; + +int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +void dsl_pool_close(dsl_pool_t *dp); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); +void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_zil_clean(dsl_pool_t *dp); +int dsl_pool_sync_context(dsl_pool_t *dp); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); +void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +void dsl_pool_memory_pressure(dsl_pool_t *dp); +void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); +void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); + +int dsl_pool_scrub_cancel(dsl_pool_t *dp); +int dsl_pool_scrub_clean(dsl_pool_t *dp); +void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_scrub_restart(dsl_pool_t *dp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_POOL_H */ diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h new file mode 100644 index 000000000..d66caa86c --- /dev/null +++ b/module/zfs/include/sys/dsl_prop.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_PROP_H +#define _SYS_DSL_PROP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; + +/* The callback func may not call into the DMU or DSL! */ +typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); + +typedef struct dsl_prop_cb_record { + list_node_t cbr_node; /* link on dd_prop_cbs */ + struct dsl_dataset *cbr_ds; + const char *cbr_propname; + dsl_prop_changed_cb_t *cbr_func; + void *cbr_arg; +} dsl_prop_cb_record_t; + +int dsl_prop_register(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_numcb(struct dsl_dataset *ds); + +int dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint); + +int dsl_prop_set(const char *ddname, const char *propname, + int intsz, int numints, const void *buf); +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx); + +void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); +void dsl_prop_nvlist_add_string(nvlist_t *nv, + zfs_prop_t prop, const char *value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_PROP_H */ diff --git a/module/zfs/include/sys/dsl_synctask.h b/module/zfs/include/sys/dsl_synctask.h new file mode 100644 index 000000000..4995bfe5a --- /dev/null +++ b/module/zfs/include/sys/dsl_synctask.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_SYNCTASK_H +#define _SYS_DSL_SYNCTASK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; + +typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); + +typedef struct dsl_sync_task { + list_node_t dst_node; + dsl_checkfunc_t *dst_checkfunc; + dsl_syncfunc_t *dst_syncfunc; + void *dst_arg1; + void *dst_arg2; + int dst_err; +} dsl_sync_task_t; + +typedef struct dsl_sync_task_group { + txg_node_t dstg_node; + list_t dstg_tasks; + struct dsl_pool *dstg_pool; + cred_t *dstg_cr; + uint64_t dstg_txg; + int dstg_err; + int dstg_space; + boolean_t dstg_nowaiter; +} dsl_sync_task_group_t; + +dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); +void dsl_sync_task_create(dsl_sync_task_group_t *dstg, + dsl_checkfunc_t *, dsl_syncfunc_t *, + void *arg1, void *arg2, int blocks_modified); +int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); +void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); + +int dsl_sync_task_do(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified); +void dsl_sync_task_do_nowait(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SYNCTASK_H */ diff --git a/module/zfs/include/sys/metaslab.h b/module/zfs/include/sys/metaslab.h new file mode 100644 index 000000000..1c9d89e8f --- /dev/null +++ b/module/zfs/include/sys/metaslab.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_H +#define _SYS_METASLAB_H + +#include <sys/spa.h> +#include <sys/space_map.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct metaslab_class metaslab_class_t; +typedef struct metaslab_group metaslab_group_t; + +extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg); +extern void metaslab_fini(metaslab_t *msp); +extern void metaslab_sync(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); + +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 + +extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); +extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, + boolean_t now); +extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); + +extern metaslab_class_t *metaslab_class_create(void); +extern void metaslab_class_destroy(metaslab_class_t *mc); +extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); +extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); + +extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, + vdev_t *vd); +extern void metaslab_group_destroy(metaslab_group_t *mg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_H */ diff --git a/module/zfs/include/sys/metaslab_impl.h b/module/zfs/include/sys/metaslab_impl.h new file mode 100644 index 000000000..5980cbc84 --- /dev/null +++ b/module/zfs/include/sys/metaslab_impl.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_IMPL_H +#define _SYS_METASLAB_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/metaslab.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/txg.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct metaslab_class { + metaslab_group_t *mc_rotor; + uint64_t mc_allocated; +}; + +struct metaslab_group { + kmutex_t mg_lock; + avl_tree_t mg_metaslab_tree; + uint64_t mg_aliquot; + int64_t mg_bias; + metaslab_class_t *mg_class; + vdev_t *mg_vd; + metaslab_group_t *mg_prev; + metaslab_group_t *mg_next; +}; + +/* + * Each metaslab's free space is tracked in space map object in the MOS, + * which is only updated in syncing context. Each time we sync a txg, + * we append the allocs and frees from that txg to the space map object. + * When the txg is done syncing, metaslab_sync_done() updates ms_smo + * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + */ +struct metaslab { + kmutex_t ms_lock; /* metaslab lock */ + space_map_obj_t ms_smo; /* synced space map object */ + space_map_obj_t ms_smo_syncing; /* syncing space map object */ + space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ + space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_map; /* in-core free space map */ + uint64_t ms_weight; /* weight vs. others in group */ + metaslab_group_t *ms_group; /* metaslab group */ + avl_node_t ms_group_node; /* node in metaslab group tree */ + txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/module/zfs/include/sys/refcount.h b/module/zfs/include/sys/refcount.h new file mode 100644 index 000000000..d3fe7b1f8 --- /dev/null +++ b/module/zfs/include/sys/refcount.h @@ -0,0 +1,104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_REFCOUNT_H +#define _SYS_REFCOUNT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/list.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * If the reference is held only by the calling function and not any + * particular object, use FTAG (which is a string) for the holder_tag. + * Otherwise, use the object that holds the reference. + */ +#define FTAG ((char *)__func__) + +#if defined(DEBUG) || !defined(_KERNEL) +typedef struct reference { + list_node_t ref_link; + void *ref_holder; + uint64_t ref_number; + uint8_t *ref_removed; +} reference_t; + +typedef struct refcount { + kmutex_t rc_mtx; + list_t rc_list; + list_t rc_removed; + int64_t rc_count; + int64_t rc_removed_count; +} refcount_t; + +/* Note: refcount_t must be initialized with refcount_create() */ + +void refcount_create(refcount_t *rc); +void refcount_destroy(refcount_t *rc); +void refcount_destroy_many(refcount_t *rc, uint64_t number); +int refcount_is_zero(refcount_t *rc); +int64_t refcount_count(refcount_t *rc); +int64_t refcount_add(refcount_t *rc, void *holder_tag); +int64_t refcount_remove(refcount_t *rc, void *holder_tag); +int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); +int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); + +void refcount_init(void); +void refcount_fini(void); + +#else /* DEBUG */ + +typedef struct refcount { + uint64_t rc_count; +} refcount_t; + +#define refcount_create(rc) ((rc)->rc_count = 0) +#define refcount_destroy(rc) ((rc)->rc_count = 0) +#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) +#define refcount_is_zero(rc) ((rc)->rc_count == 0) +#define refcount_count(rc) ((rc)->rc_count) +#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1) +#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1) +#define refcount_add_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, number) +#define refcount_remove_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, -number) + +#define refcount_init() +#define refcount_fini() + +#endif /* DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_REFCOUNT_H */ diff --git a/module/zfs/include/sys/rrwlock.h b/module/zfs/include/sys/rrwlock.h new file mode 100644 index 000000000..19a43c97f --- /dev/null +++ b/module/zfs/include/sys/rrwlock.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/inttypes.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + refcount_t rr_anon_rcount; + refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; +} rrwlock_t; + +/* + * 'tag' is used in reference counting tracking. The + * 'tag' must be the same in a rrw_enter() as in its + * corresponding rrw_exit(). + */ +void rrw_init(rrwlock_t *rrl); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h new file mode 100644 index 000000000..24b3ca447 --- /dev/null +++ b/module/zfs/include/sys/spa.h @@ -0,0 +1,554 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#include <sys/avl.h> +#include <sys/zfs_context.h> +#include <sys/nvpair.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. + */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct zilog zilog_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; +struct dsl_pool; + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) \ + ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) +#define BF64_SET(x, low, len, val) \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) +#define BF64_SET_SB(x, low, len, shift, bias, val) \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) + +/* + * We currently support nine block sizes, from 512 bytes to 128K. + * We could go higher, but the benefits are near-zero and the cost + * of COWing a giant block to modify one byte would become excessive. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 17 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) + +/* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1 << 14) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * E endianness + * type DMU object type + * lvl level of indirection + * birth txg transaction group in which the block was born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ +typedef struct blkptr { + dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[3]; /* Extra space for the future */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) +#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_HOLE(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_PSIZE(bp) \ + BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define BP_SET_PSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_GET_ASIZE(bp) \ + (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_GET_UCSIZE(bp) \ + ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + +#define BP_GET_NDVAS(bp) \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_COUNT_GANG(bp) \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + DVA_GET_GANG(&(bp)->blk_dva[2])) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) +#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_pad[2] = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#define BLK_FILL_ALREADY_FREED (-1ULL) + +/* + * Note: the byteorder is either 0 or -1, both of which are palindromes. + * This simplifies the endianness handling a bit. + */ +#ifdef _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (-1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#define BP_SPRINTF_LEN 320 + +#include <sys/dmu.h> + +#define BP_GET_BUFC_TYPE(bp) \ + (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA); +/* + * Routines found in spa.c + */ + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_get_stats(const char *pool, nvlist_t **config, + char *altroot, size_t buflen); +extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, + const char *history_str, nvlist_t *zplprops); +extern int spa_check_rootconf(char *devpath, char *devid, + nvlist_t **bestconf, uint64_t *besttxg); +extern boolean_t spa_rootdev_validate(nvlist_t *nv); +extern int spa_import_rootpool(char *devpath, char *devid); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); +extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); + +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, + int replacing); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); + +/* spare state (which is global across all pools) */ +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); +extern void spa_spare_activate(vdev_t *vd); + +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); +extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); + +/* scrubbing */ +extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + +/* + * SPA configuration functions in spa_config.c + */ + +#define SPA_CONFIG_UPDATE_POOL 0 +#define SPA_CONFIG_UPDATE_VDEVS 1 + +extern void spa_config_sync(spa_t *, boolean_t, boolean_t); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); +extern void spa_config_update(spa_t *spa, int what); +extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name, const char *altroot); +extern void spa_remove(spa_t *spa); +extern spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + +/* Accessor functions */ +extern boolean_t spa_shutting_down(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern blkptr_t *spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern pool_state_t spa_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +extern uint64_t spa_get_alloc(spa_t *spa); +extern uint64_t spa_get_space(spa_t *spa); +extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_version(spa_t *spa); +extern int spa_max_replication(spa_t *spa); +extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); +extern boolean_t spa_suspended(spa_t *spa); + +/* Miscellaneous support routines */ +extern int spa_rename(const char *oldname, const char *newname); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); +extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); +extern boolean_t spa_has_spare(spa_t *, uint64_t guid); +extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_is_root(spa_t *spa); + +/* history logging */ +typedef enum history_log_type { + LOG_CMD_POOL_CREATE, + LOG_CMD_NORMAL, + LOG_INTERNAL +} history_log_type_t; + +typedef struct history_arg { + const char *ha_history_str; + history_log_type_t ha_log_type; + history_internal_events_t ha_event; + char ha_zone[MAXPATHLEN]; +} history_arg_t; + +extern char *spa_his_ievent_table[]; + +extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); +extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, + char *his_buf); +extern int spa_history_log(spa_t *spa, const char *his_buf, + history_log_type_t what); +void spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); + +/* error handling */ +struct zbookmark; +struct zio; +extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t stateoroffset, uint64_t length); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); + +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + +/* Initialization and termination */ +extern void spa_init(int flags); +extern void spa_fini(void); +extern void spa_boot_init(); + +/* properties */ +extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); + +#ifdef ZFS_DEBUG +#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/module/zfs/include/sys/spa_boot.h b/module/zfs/include/sys/spa_boot.h new file mode 100644 index 000000000..b56073b97 --- /dev/null +++ b/module/zfs/include/sys/spa_boot.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_BOOT_H +#define _SYS_SPA_BOOT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern char *spa_get_bootprop(char *prop); +extern void spa_free_bootprop(char *prop); +extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_BOOT_H */ diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h new file mode 100644 index 000000000..8aeb414fe --- /dev/null +++ b/module/zfs/include/sys/spa_impl.h @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_IMPL_H +#define _SYS_SPA_IMPL_H + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/metaslab.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/uberblock_impl.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/refcount.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_error_entry { + zbookmark_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + +typedef struct spa_history_phys { + uint64_t sh_pool_create_len; /* ending offset of zpool create */ + uint64_t sh_phys_max_off; /* physical EOF */ + uint64_t sh_bof; /* logical BOF */ + uint64_t sh_eof; /* logical EOF */ + uint64_t sh_records_lost; /* num of records overwritten */ +} spa_history_phys_t; + +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + int scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + +typedef struct spa_config_dirent { + list_node_t scd_link; + char *scd_path; +} spa_config_dirent_t; + +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +enum zio_taskq_type { + ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_TYPES +}; + +struct spa { + /* + * Fields protected by spa_namespace_lock. + */ + char spa_name[MAXNAMELEN]; /* pool name */ + avl_node_t spa_avl; /* node in spa_namespace_avl */ + nvlist_t *spa_config; /* last synced config */ + nvlist_t *spa_config_syncing; /* currently syncing config */ + uint64_t spa_config_txg; /* txg of last config change */ + int spa_sync_pass; /* iterate-to-convergence */ + pool_state_t spa_state; /* pool state */ + int spa_inject_ref; /* injection references */ + uint8_t spa_sync_on; /* sync threads are running */ + spa_load_state_t spa_load_state; /* current load operation */ + taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; + dsl_pool_t *spa_dsl_pool; + metaslab_class_t *spa_normal_class; /* normal data class */ + metaslab_class_t *spa_log_class; /* intent log data class */ + uint64_t spa_first_txg; /* first txg after spa_open() */ + uint64_t spa_final_txg; /* txg of export/destroy */ + uint64_t spa_freeze_txg; /* freeze pool at this txg */ + objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ + vdev_t *spa_root_vdev; /* top-level vdev container */ + uint64_t spa_load_guid; /* initial guid for spa_load */ + list_t spa_config_dirty_list; /* vdevs with dirty config */ + list_t spa_state_dirty_list; /* vdevs with dirty state */ + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_syncing_txg; /* txg currently syncing */ + uint64_t spa_sync_bplist_obj; /* object for deferred frees */ + bplist_t spa_sync_bplist; /* deferred-free bplist */ + uberblock_t spa_ubsync; /* last synced uberblock */ + uberblock_t spa_uberblock; /* current uberblock */ + kmutex_t spa_scrub_lock; /* resilver/scrub lock */ + uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ + uint64_t spa_scrub_errors; /* scrub I/O error count */ + kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ + uint8_t spa_scrub_active; /* active or suspended? */ + uint8_t spa_scrub_type; /* type of scrub we're doing */ + uint8_t spa_scrub_finished; /* indicator to rotate logs */ + uint8_t spa_scrub_started; /* started since last boot */ + uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ + kmutex_t spa_async_root_lock; /* protects async root count */ + uint64_t spa_async_root_count; /* number of async root zios */ + kcondvar_t spa_async_root_cv; /* notify when count == 0 */ + char *spa_root; /* alternate root directory */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + boolean_t spa_last_open_failed; /* true if last open faled */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + uint64_t spa_deflate; /* should we deflate? */ + uint64_t spa_history; /* history object */ + kmutex_t spa_history_lock; /* history lock */ + vdev_t *spa_pending_vdev; /* pending vdev additions */ + kmutex_t spa_props_lock; /* property lock */ + uint64_t spa_pool_props_object; /* object for properties */ + uint64_t spa_bootfs; /* default boot filesystem */ + uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_delegation; /* delegation on/off */ + list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ + kcondvar_t spa_suspend_cv; /* notification of resume */ + uint8_t spa_suspended; /* pool is suspended */ + boolean_t spa_import_faulted; /* allow faulted vdevs */ + boolean_t spa_is_root; /* pool is root */ + int spa_minref; /* num refs when first opened */ + spa_log_state_t spa_log_state; /* log state */ + /* + * spa_refcnt & spa_config_lock must be the last elements + * because refcount_t changes size based on compilation options. + * In order for the MDB module to function correctly, the other + * fields must remain in the same location. + */ + spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ + refcount_t spa_refcount; /* number of opens */ +}; + +extern const char *spa_config_path; + +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_IMPL_H */ diff --git a/module/zfs/include/sys/space_map.h b/module/zfs/include/sys/space_map.h new file mode 100644 index 000000000..db9daef1f --- /dev/null +++ b/module/zfs/include/sys/space_map.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPACE_MAP_H +#define _SYS_SPACE_MAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct space_map_ops space_map_ops_t; + +typedef struct space_map { + avl_tree_t sm_root; /* AVL tree of map segments */ + uint64_t sm_space; /* sum of all segments in the map */ + uint64_t sm_start; /* start of map */ + uint64_t sm_size; /* size of map */ + uint8_t sm_shift; /* unit shift */ + uint8_t sm_pad[3]; /* unused */ + uint8_t sm_loaded; /* map loaded? */ + uint8_t sm_loading; /* map loading? */ + kcondvar_t sm_load_cv; /* map load completion */ + space_map_ops_t *sm_ops; /* space map block picker ops vector */ + void *sm_ppd; /* picker-private data */ + kmutex_t *sm_lock; /* pointer to lock that protects map */ +} space_map_t; + +typedef struct space_seg { + avl_node_t ss_node; /* AVL node */ + uint64_t ss_start; /* starting offset of this segment */ + uint64_t ss_end; /* ending offset (non-inclusive) */ +} space_seg_t; + +typedef struct space_map_obj { + uint64_t smo_object; /* on-disk space map object */ + uint64_t smo_objsize; /* size of the object */ + uint64_t smo_alloc; /* space allocated from the map */ +} space_map_obj_t; + +struct space_map_ops { + void (*smop_load)(space_map_t *sm); + void (*smop_unload)(space_map_t *sm); + uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); + void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); + void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); +}; + +/* + * debug entry + * + * 1 3 10 50 + * ,---+--------+------------+---------------------------------. + * | 1 | action | syncpass | txg (lower bits) | + * `---+--------+------------+---------------------------------' + * 63 62 60 59 50 49 0 + * + * + * + * non-debug entry + * + * 1 47 1 15 + * ,-----------------------------------------------------------. + * | 0 | offset (sm_shift units) | type | run | + * `-----------------------------------------------------------' + * 63 62 17 16 15 0 + */ + +/* All this stuff takes and returns bytes */ +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) +#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) +#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) + +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) + +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) + +#define SM_ALLOC 0x0 +#define SM_FREE 0x1 + +/* + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer i/o operations, but they also cause the + * DMU to keep more data in-core, and also to waste more i/o bandwidth + * when only a few blocks have changed since the last transaction group. + * This could use a lot more research, but for now, set the freelist + * block size to 4k (2^12). + */ +#define SPACE_MAP_BLOCKSHIFT 12 + +typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, + uint8_t shift, kmutex_t *lp); +extern void space_map_destroy(space_map_t *sm); +extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); +extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_vacate(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_walk(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_union(space_map_t *smd, space_map_t *sms); + +extern void space_map_load_wait(space_map_t *sm); +extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, + uint8_t maptype, space_map_obj_t *smo, objset_t *os); +extern void space_map_unload(space_map_t *sm); + +extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); +extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_truncate(space_map_obj_t *smo, + objset_t *os, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_MAP_H */ diff --git a/module/zfs/include/sys/txg.h b/module/zfs/include/sys/txg.h new file mode 100644 index 000000000..23bdff211 --- /dev/null +++ b/module/zfs/include/sys/txg.h @@ -0,0 +1,130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_H +#define _SYS_TXG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ +#define TXG_SIZE 4 /* next power of 2 */ +#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ +#define TXG_INITIAL TXG_SIZE /* initial txg */ +#define TXG_IDX (txg & TXG_MASK) + +#define TXG_WAIT 1ULL +#define TXG_NOWAIT 2ULL + +typedef struct tx_cpu tx_cpu_t; + +typedef struct txg_handle { + tx_cpu_t *th_cpu; + uint64_t th_txg; +} txg_handle_t; + +typedef struct txg_node { + struct txg_node *tn_next[TXG_SIZE]; + uint8_t tn_member[TXG_SIZE]; +} txg_node_t; + +typedef struct txg_list { + kmutex_t tl_lock; + size_t tl_offset; + txg_node_t *tl_head[TXG_SIZE]; +} txg_list_t; + +struct dsl_pool; + +extern void txg_init(struct dsl_pool *dp, uint64_t txg); +extern void txg_fini(struct dsl_pool *dp); +extern void txg_sync_start(struct dsl_pool *dp); +extern void txg_sync_stop(struct dsl_pool *dp); +extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); +extern void txg_rele_to_quiesce(txg_handle_t *txghp); +extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_suspend(struct dsl_pool *dp); +extern void txg_resume(struct dsl_pool *dp); + +/* + * Delay the caller by the specified number of ticks or until + * the txg closes (whichever comes first). This is intended + * to be used to throttle writers when the system nears its + * capacity. + */ +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); + +/* + * Wait until the given transaction group has finished syncing. + * Try to make this happen as soon as possible (eg. kick off any + * necessary syncs immediately). If txg==0, wait for the currently open + * txg to finish syncing. + */ +extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait until the given transaction group, or one after it, is + * the open transaction group. Try to make this happen as soon + * as possible (eg. kick off any necessary syncs immediately). + * If txg == 0, wait for the next open txg. + */ +extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); + +/* + * Returns TRUE if we are "backed up" waiting for the syncing + * transaction to complete; otherwise returns FALSE. + */ +extern boolean_t txg_stalled(struct dsl_pool *dp); + +/* returns TRUE if someone is waiting for the next txg to sync */ +extern boolean_t txg_sync_waiting(struct dsl_pool *dp); + +/* + * Per-txg object lists. + */ + +#define TXG_CLEAN(txg) ((txg) - 1) + +extern void txg_list_create(txg_list_t *tl, size_t offset); +extern void txg_list_destroy(txg_list_t *tl); +extern int txg_list_empty(txg_list_t *tl, uint64_t txg); +extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); +extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); +extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_head(txg_list_t *tl, uint64_t txg); +extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_H */ diff --git a/module/zfs/include/sys/txg_impl.h b/module/zfs/include/sys/txg_impl.h new file mode 100644 index 000000000..7413c662b --- /dev/null +++ b/module/zfs/include/sys/txg_impl.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_IMPL_H +#define _SYS_TXG_IMPL_H + +#include <sys/spa.h> +#include <sys/txg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct tx_cpu { + kmutex_t tc_lock; + kcondvar_t tc_cv[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; + char tc_pad[16]; +}; + +typedef struct tx_state { + tx_cpu_t *tx_cpu; /* protects right to enter txg */ + kmutex_t tx_sync_lock; /* protects tx_state_t */ + krwlock_t tx_suspend; + uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ + uint64_t tx_syncing_txg; /* currently syncing txg id */ + uint64_t tx_synced_txg; /* last synced txg id */ + + uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ + uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ + + kcondvar_t tx_sync_more_cv; + kcondvar_t tx_sync_done_cv; + kcondvar_t tx_quiesce_more_cv; + kcondvar_t tx_quiesce_done_cv; + kcondvar_t tx_timeout_cv; + kcondvar_t tx_exit_cv; /* wait for all threads to exit */ + + uint8_t tx_threads; /* number of threads */ + uint8_t tx_exiting; /* set when we're exiting */ + + kthread_t *tx_sync_thread; + kthread_t *tx_quiesce_thread; +} tx_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_IMPL_H */ diff --git a/module/zfs/include/sys/uberblock.h b/module/zfs/include/sys/uberblock.h new file mode 100644 index 000000000..93d936ae4 --- /dev/null +++ b/module/zfs/include/sys/uberblock.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_H +#define _SYS_UBERBLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct uberblock uberblock_t; + +extern int uberblock_verify(uberblock_t *ub); +extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_H */ diff --git a/module/zfs/include/sys/uberblock_impl.h b/module/zfs/include/sys/uberblock_impl.h new file mode 100644 index 000000000..55a0dd5ae --- /dev/null +++ b/module/zfs/include/sys/uberblock_impl.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_IMPL_H +#define _SYS_UBERBLOCK_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/uberblock.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* SPA_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/module/zfs/include/sys/unique.h b/module/zfs/include/sys/unique.h new file mode 100644 index 000000000..2ef3093ed --- /dev/null +++ b/module/zfs/include/sys/unique.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UNIQUE_H +#define _SYS_UNIQUE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* The number of significant bits in each unique value. */ +#define UNIQUE_BITS 56 + +void unique_init(void); +void unique_fini(void); + +/* + * Return a new unique value (which will not be uniquified against until + * it is unique_insert()-ed. + */ +uint64_t unique_create(void); + +/* Return a unique value, which equals the one passed in if possible. */ +uint64_t unique_insert(uint64_t value); + +/* Indicate that this value no longer needs to be uniquified against. */ +void unique_remove(uint64_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UNIQUE_H */ diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h new file mode 100644 index 000000000..c070d6f3d --- /dev/null +++ b/module/zfs/include/sys/vdev.h @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_H +#define _SYS_VDEV_H + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/space_map.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t zfs_nocacheflush; + +extern int vdev_open(vdev_t *); +extern int vdev_validate(vdev_t *); +extern void vdev_close(vdev_t *); +extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); +extern void vdev_init(vdev_t *, uint64_t txg); +extern void vdev_reopen(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); +extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); + +extern boolean_t vdev_is_bootable(vdev_t *vd); +extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); +extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); +extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + int scrub_done); +extern boolean_t vdev_resilver_needed(vdev_t *vd, + uint64_t *minp, uint64_t *maxp); + +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern void vdev_metaslab_fini(vdev_t *vd); + +extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_clear_stats(vdev_t *vd); +extern void vdev_stat_update(zio_t *zio, uint64_t psize); +extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, + boolean_t complete); +extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_propagate_state(vdev_t *vd); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); + +extern void vdev_space_update(vdev_t *vd, int64_t space_delta, + int64_t alloc_delta, boolean_t update_root); + +extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); + +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); +extern void vdev_clear(spa_t *spa, vdev_t *vd); + +extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_readable(vdev_t *vd); +extern boolean_t vdev_writeable(vdev_t *vd); +extern boolean_t vdev_allocatable(vdev_t *vd); +extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); + +extern void vdev_cache_init(vdev_t *vd); +extern void vdev_cache_fini(vdev_t *vd); +extern int vdev_cache_read(zio_t *zio); +extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); + +extern void vdev_queue_init(vdev_t *vd); +extern void vdev_queue_fini(vdev_t *vd); +extern zio_t *vdev_queue_io(zio_t *zio); +extern void vdev_queue_io_done(zio_t *zio); + +extern void vdev_config_dirty(vdev_t *vd); +extern void vdev_config_clean(vdev_t *vd); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); + +extern void vdev_state_dirty(vdev_t *vd); +extern void vdev_state_clean(vdev_t *vd); + +extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, + boolean_t getstats, boolean_t isspare, boolean_t isl2cache); + +/* + * Label routines + */ +struct uberblock; +extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); +extern int vdev_label_number(uint64_t psise, uint64_t offset); +extern nvlist_t *vdev_label_read_config(vdev_t *vd); +extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); + +typedef enum { + VDEV_LABEL_CREATE, /* create/add a new device */ + VDEV_LABEL_REPLACE, /* replace an existing device */ + VDEV_LABEL_SPARE, /* add a new hot spare */ + VDEV_LABEL_REMOVE, /* remove an existing device */ + VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ +} vdev_labeltype_t; + +extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_H */ diff --git a/module/zfs/include/sys/vdev_file.h b/module/zfs/include/sys/vdev_file.h new file mode 100644 index 000000000..cd4967357 --- /dev/null +++ b/module/zfs/include/sys/vdev_file.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_FILE_H +#define _SYS_VDEV_FILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_file { + vnode_t *vf_vnode; +} vdev_file_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_FILE_H */ diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h new file mode 100644 index 000000000..26904d089 --- /dev/null +++ b/module/zfs/include/sys/vdev_impl.h @@ -0,0 +1,305 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_IMPL_H +#define _SYS_VDEV_IMPL_H + +#include <sys/avl.h> +#include <sys/dmu.h> +#include <sys/metaslab.h> +#include <sys/nvpair.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/dkio.h> +#include <sys/uberblock_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device descriptors. + * + * All storage pool operations go through the virtual device framework, + * which provides data replication and I/O scheduling. + */ + +/* + * Forward declarations that lots of things need. + */ +typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_cache vdev_cache_t; +typedef struct vdev_cache_entry vdev_cache_entry_t; + +/* + * Virtual device operations + */ +typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); +typedef void vdev_close_func_t(vdev_t *vd); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef int vdev_io_start_func_t(zio_t *zio); +typedef void vdev_io_done_func_t(zio_t *zio); +typedef void vdev_state_change_func_t(vdev_t *vd, int, int); + +typedef struct vdev_ops { + vdev_open_func_t *vdev_op_open; + vdev_close_func_t *vdev_op_close; + vdev_asize_func_t *vdev_op_asize; + vdev_io_start_func_t *vdev_op_io_start; + vdev_io_done_func_t *vdev_op_io_done; + vdev_state_change_func_t *vdev_op_state_change; + char vdev_op_type[16]; + boolean_t vdev_op_leaf; +} vdev_ops_t; + +/* + * Virtual device properties + */ +struct vdev_cache_entry { + char *ve_data; + uint64_t ve_offset; + uint64_t ve_lastused; + avl_node_t ve_offset_node; + avl_node_t ve_lastused_node; + uint32_t ve_hits; + uint16_t ve_missed_update; + zio_t *ve_fill_io; +}; + +struct vdev_cache { + avl_tree_t vc_offset_tree; + avl_tree_t vc_lastused_tree; + kmutex_t vc_lock; +}; + +struct vdev_queue { + avl_tree_t vq_deadline_tree; + avl_tree_t vq_read_tree; + avl_tree_t vq_write_tree; + avl_tree_t vq_pending_tree; + kmutex_t vq_lock; +}; + +/* + * Virtual device descriptor + */ +struct vdev { + /* + * Common to all vdev types. + */ + uint64_t vdev_id; /* child number in vdev parent */ + uint64_t vdev_guid; /* unique ID for this vdev */ + uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_ashift; /* block alignment shift */ + uint64_t vdev_state; /* see VDEV_STATE_* #defines */ + uint64_t vdev_prevstate; /* used when reopening a vdev */ + vdev_ops_t *vdev_ops; /* vdev operations */ + spa_t *vdev_spa; /* spa for this vdev */ + void *vdev_tsd; /* type-specific data */ + vdev_t *vdev_top; /* top-level vdev */ + vdev_t *vdev_parent; /* parent vdev */ + vdev_t **vdev_child; /* array of children */ + uint64_t vdev_children; /* number of children */ + space_map_t vdev_dtl_map; /* dirty time log in-core state */ + space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + vdev_stat_t vdev_stat; /* virtual device statistics */ + + /* + * Top-level vdev state. + */ + uint64_t vdev_ms_array; /* metaslab array object */ + uint64_t vdev_ms_shift; /* metaslab size shift */ + uint64_t vdev_ms_count; /* number of metaslabs */ + metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_t **vdev_ms; /* metaslab array */ + txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ + txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ + txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ + boolean_t vdev_remove_wanted; /* async remove wanted? */ + boolean_t vdev_probe_wanted; /* async probe wanted? */ + list_node_t vdev_config_dirty_node; /* config dirty list */ + list_node_t vdev_state_dirty_node; /* state dirty list */ + uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ + uint64_t vdev_islog; /* is an intent log device */ + + /* + * Leaf vdev state. + */ + uint64_t vdev_psize; /* physical device capacity */ + space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ + uint64_t vdev_wholedisk; /* true if this is a whole disk */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ + uint64_t vdev_nparity; /* number of parity devices for raidz */ + char *vdev_path; /* vdev path (if any) */ + char *vdev_devid; /* vdev devid (if any) */ + char *vdev_physpath; /* vdev device path (if any) */ + uint64_t vdev_not_present; /* not present during import */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + hrtime_t vdev_last_try; /* last reopen time */ + boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ + uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ + uint8_t vdev_detached; /* device detached? */ + uint8_t vdev_cant_read; /* vdev is failing all reads */ + uint8_t vdev_cant_write; /* vdev is failing all writes */ + uint64_t vdev_isspare; /* was a hot spare */ + uint64_t vdev_isl2cache; /* was a l2cache device */ + vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ + vdev_cache_t vdev_cache; /* physical block cache */ + spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ + zio_t *vdev_probe_zio; /* root of current probe */ + + /* + * For DTrace to work in userland (libzpool) context, these fields must + * remain at the end of the structure. DTrace will use the kernel's + * CTF definition for 'struct vdev', and since the size of a kmutex_t is + * larger in userland, the offsets for the rest fields would be + * incorrect. + */ + kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ + kmutex_t vdev_stat_lock; /* vdev_stat */ + kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ +}; + +#define VDEV_SKIP_SIZE (8 << 10) +#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCK_RING (128 << 10) + +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) + +/* ZFS boot block */ +#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL +#define VDEV_BOOT_VERSION 1 /* version number */ + +typedef struct vdev_boot_header { + uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ + uint64_t vb_version; /* VDEV_BOOT_VERSION */ + uint64_t vb_offset; /* start offset (bytes) */ + uint64_t vb_size; /* size (bytes) */ + char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; +} vdev_boot_header_t; + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; + zio_block_tail_t vp_zbt; +} vdev_phys_t; + +typedef struct vdev_label { + char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ + vdev_boot_header_t vl_boot_header; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 + +#define VDEV_ALLOC_LOAD 0 +#define VDEV_ALLOC_ADD 1 +#define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 + +/* + * Allocate or free a vdev + */ +extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, + vdev_t *parent, uint_t id, int alloctype); +extern void vdev_free(vdev_t *vd); + +/* + * Add or remove children and parents + */ +extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_compact_children(vdev_t *pvd); +extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); +extern void vdev_remove_parent(vdev_t *cvd); + +/* + * vdev sync load and sync + */ +extern void vdev_load(vdev_t *vd); +extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_done(vdev_t *vd, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); + +/* + * Available vdev types. + */ +extern vdev_ops_t vdev_root_ops; +extern vdev_ops_t vdev_mirror_ops; +extern vdev_ops_t vdev_replacing_ops; +extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_disk_ops; +extern vdev_ops_t vdev_file_ops; +extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_spare_ops; + +/* + * Common size functions + */ +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_rsize(vdev_t *vd); + +/* + * zdb uses this tunable, so it must be declared here to make lint happy. + */ +extern int zfs_vdev_cache_size; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_IMPL_H */ diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h new file mode 100644 index 000000000..f88cc068b --- /dev/null +++ b/module/zfs/include/sys/zap.h @@ -0,0 +1,425 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_H +#define _SYS_ZAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZAP - ZFS Attribute Processor + * + * The ZAP is a module which sits on top of the DMU (Data Management + * Unit) and implements a higher-level storage primitive using DMU + * objects. Its primary consumer is the ZPL (ZFS Posix Layer). + * + * A "zapobj" is a DMU object which the ZAP uses to stores attributes. + * Users should use only zap routines to access a zapobj - they should + * not access the DMU object directly using DMU routines. + * + * The attributes stored in a zapobj are name-value pairs. The name is + * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including + * terminating NULL). The value is an array of integers, which may be + * 1, 2, 4, or 8 bytes long. The total space used by the array (number + * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. + * Note that an 8-byte integer value can be used to store the location + * (object number) of another dmu object (which may be itself a zapobj). + * Note that you can use a zero-length attribute to store a single bit + * of information - the attribute is present or not. + * + * The ZAP routines are thread-safe. However, you must observe the + * DMU's restriction that a transaction may not be operated on + * concurrently. + * + * Any of the routines that return an int may return an I/O error (EIO + * or ECHECKSUM). + * + * + * Implementation / Performance Notes: + * + * The ZAP is intended to operate most efficiently on attributes with + * short (49 bytes or less) names and single 8-byte values, for which + * the microzap will be used. The ZAP should be efficient enough so + * that the user does not need to cache these attributes. + * + * The ZAP's locking scheme makes its routines thread-safe. Operations + * on different zapobjs will be processed concurrently. Operations on + * the same zapobj which only read data will be processed concurrently. + * Operations on the same zapobj which modify data will be processed + * concurrently when there are many attributes in the zapobj (because + * the ZAP uses per-block locking - more than 128 * (number of cpus) + * small attributes will suffice). + */ + +/* + * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C + * strings) for the names of attributes, rather than a byte string + * bounded by an explicit length. If some day we want to support names + * in character sets which have embedded zeros (eg. UTF-16, UTF-32), + * we'll have to add routines for using length-bounded strings. + */ + +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN 1024 + +/* + * The matchtype specifies which entry will be accessed. + * MT_EXACT: only find an exact match (non-normalized) + * MT_FIRST: find the "first" normalized (case and Unicode + * form) match; the designated "first" match will not change as long + * as the set of entries with this normalization doesn't change + * MT_BEST: if there is an exact match, find that, otherwise find the + * first normalized match + */ +typedef enum matchtype +{ + MT_EXACT, + MT_BEST, + MT_FIRST +} matchtype_t; + +/* + * Create a new zapobj with no attributes and return its object number. + * MT_EXACT will cause the zap object to only support MT_EXACT lookups, + * otherwise any matchtype can be used for lookups. + * + * normflags specifies what normalization will be done. values are: + * 0: no normalization (legacy on-disk format, supports MT_EXACT matching + * only) + * U8_TEXTPREP_TOLOWER: case normalization will be performed. + * MT_FIRST/MT_BEST matching will find entries that match without + * regard to case (eg. looking for "foo" can find an entry "Foo"). + * Eventually, other flags will permit unicode normalization as well. + */ +uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Create a new zapobj with no attributes from the given (unallocated) + * object number. + */ +int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * The zapobj passed in must be a valid ZAP object for all of the + * following routines. + */ + +/* + * Destroy this zapobj and all its attributes. + * + * Frees the object number using dmu_object_free. + */ +int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); + +/* + * Manipulate attributes. + * + * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. + */ + +/* + * Retrieve the contents of the attribute with the given name. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + * + * If 'integer_size' is smaller than the attribute's integer size, the + * call will fail and return EINVAL. + * + * If 'integer_size' is equal to or larger than the attribute's integer + * size, the call will succeed and return 0. * When converting to a + * larger integer size, the integers will be treated as unsigned (ie. no + * sign-extension will be performed). + * + * 'num_integers' is the length (in integers) of 'buf'. + * + * If the attribute is longer than the buffer, as many integers as will + * fit will be transferred to 'buf'. If the entire attribute was not + * transferred, the call will return EOVERFLOW. + * + * If rn_len is nonzero, realname will be set to the name of the found + * entry (which may be different from the requested name if matchtype is + * not MT_EXACT). + * + * If normalization_conflictp is not NULL, it will be set if there is + * another name with the same case/unicode normalized form. + */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *normalization_conflictp); + +/* + * Create an attribute with the given name and value. + * + * If an attribute with the given name already exists, the call will + * fail and return EEXIST. + */ +int zap_add(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); + +/* + * Set the attribute with the given name to the given value. If an + * attribute with the given name does not exist, it will be created. If + * an attribute with the given name already exists, the previous value + * will be overwritten. The integer_size may be different from the + * existing attribute's integer size, in which case the attribute's + * integer size will be updated to the new value. + */ +int zap_update(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* + * Get the length (in integers) and the integer size of the specified + * attribute. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_length(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers); + +/* + * Remove the specified attribute. + * + * If the specified attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); +int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx); + +/* + * Returns (in *count) the number of attributes in the specified zap + * object. + */ +int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); + + +/* + * Returns (in name) the name of the entry whose (value & mask) + * (za_first_integer) is value, or ENOENT if not found. The string + * pointed to by name must be at least 256 bytes long. If mask==0, the + * match must be exact (ie, same as mask=-1ULL). + */ +int zap_value_search(objset_t *os, uint64_t zapobj, + uint64_t value, uint64_t mask, char *name); + +/* + * Transfer all the entries from fromobj into intoobj. Only works on + * int_size=8 num_integers=1 values. Fails if there are any duplicated + * entries. + */ +int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); + +/* + * Manipulate entries where the name + value are the "same" (the name is + * a stringified version of the value). + */ +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); + +struct zap; +struct zap_leaf; +typedef struct zap_cursor { + /* This structure is opaque! */ + objset_t *zc_objset; + struct zap *zc_zap; + struct zap_leaf *zc_leaf; + uint64_t zc_zapobj; + uint64_t zc_hash; + uint32_t zc_cd; +} zap_cursor_t; + +typedef struct { + int za_integer_length; + /* + * za_normalization_conflict will be set if there are additional + * entries with this normalized form (eg, "foo" and "Foo"). + */ + boolean_t za_normalization_conflict; + uint64_t za_num_integers; + uint64_t za_first_integer; /* no sign extension for <8byte ints */ + char za_name[MAXNAMELEN]; +} zap_attribute_t; + +/* + * The interface for listing all the attributes of a zapobj can be + * thought of as cursor moving down a list of the attributes one by + * one. The cookie returned by the zap_cursor_serialize routine is + * persistent across system calls (and across reboot, even). + */ + +/* + * Initialize a zap cursor, pointing to the "first" attribute of the + * zapobj. You must _fini the cursor when you are done with it. + */ +void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); +void zap_cursor_fini(zap_cursor_t *zc); + +/* + * Get the attribute currently pointed to by the cursor. Returns + * ENOENT if at the end of the attributes. + */ +int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); + +/* + * Advance the cursor to the next attribute. + */ +void zap_cursor_advance(zap_cursor_t *zc); + +/* + * Get a persistent cookie pointing to the current position of the zap + * cursor. The low 4 bits in the cookie are always zero, and thus can + * be used as to differentiate a serialized cookie from a different type + * of value. The cookie will be less than 2^32 as long as there are + * fewer than 2^22 (4.2 million) entries in the zap object. + */ +uint64_t zap_cursor_serialize(zap_cursor_t *zc); + +/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the + * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to + * zap_cursor_init(...).) + */ +void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, + uint64_t zapobj, uint64_t serialized); + + +#define ZAP_HISTOGRAM_SIZE 10 + +typedef struct zap_stats { + /* + * Size of the pointer table (in number of entries). + * This is always a power of 2, or zero if it's a microzap. + * In general, it should be considerably greater than zs_num_leafs. + */ + uint64_t zs_ptrtbl_len; + + uint64_t zs_blocksize; /* size of zap blocks */ + + /* + * The number of blocks used. Note that some blocks may be + * wasted because old ptrtbl's and large name/value blocks are + * not reused. (Although their space is reclaimed, we don't + * reuse those offsets in the object.) + */ + uint64_t zs_num_blocks; + + /* + * Pointer table values from zap_ptrtbl in the zap_phys_t + */ + uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ + uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ + uint64_t zs_ptrtbl_zt_blk; /* starting block number */ + uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ + uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ + + /* + * Values of the other members of the zap_phys_t + */ + uint64_t zs_block_type; /* ZBT_HEADER */ + uint64_t zs_magic; /* ZAP_MAGIC */ + uint64_t zs_num_leafs; /* The number of leaf blocks */ + uint64_t zs_num_entries; /* The number of zap entries */ + uint64_t zs_salt; /* salt to stir into hash function */ + + /* + * Histograms. For all histograms, the last index + * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater + * than what can be represented. For example + * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number + * of leafs with more than 45 entries. + */ + + /* + * zs_leafs_with_n_pointers[n] is the number of leafs with + * 2^n pointers to it. + */ + uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_with_n_entries[n] is the number of leafs with + * [n*5, (n+1)*5) entries. In the current implementation, there + * can be at most 55 entries in any block, but there may be + * fewer if the name or value is large, or the block is not + * completely full. + */ + uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_n_tenths_full[n] is the number of leafs whose + * fullness is in the range [n/10, (n+1)/10). + */ + uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_entries_using_n_chunks[n] is the number of entries which + * consume n 24-byte chunks. (Note, large names/values only use + * one chunk, but contribute to zs_num_blocks_large.) + */ + uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_buckets_with_n_entries[n] is the number of buckets (each + * leaf has 64 buckets) with n entries. + * zs_buckets_with_n_entries[1] should be very close to + * zs_num_entries. + */ + uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; +} zap_stats_t; + +/* + * Get statistics about a ZAP object. Note: you need to be aware of the + * internal implementation of the ZAP to correctly interpret some of the + * statistics. This interface shouldn't be relied on unless you really + * know what you're doing. + */ +int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_H */ diff --git a/module/zfs/include/sys/zap_impl.h b/module/zfs/include/sys/zap_impl.h new file mode 100644 index 000000000..0dc02ab6b --- /dev/null +++ b/module/zfs/include/sys/zap_impl.h @@ -0,0 +1,218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_IMPL_H +#define _SYS_ZAP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zap.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int fzap_default_block_shift; + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) + +#define ZAP_MAXCD (uint32_t)(-1) +#define ZAP_HASHBITS 28 +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT +#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_normflags; + uint64_t mz_pad[5]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +typedef struct mzap_ent { + avl_node_t mze_node; + int mze_chunkid; + uint64_t mze_hash; + mzap_ent_phys_t mze_phys; +} mzap_ent_t; + + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +struct dmu_buf; +struct zap_leaf; + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). + */ +#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ + ((uint64_t *)(zap)->zap_f.zap_phys) \ + [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ + } zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_normflags; /* flags for u8_textprep_str() */ + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. + */ +} zap_phys_t; + +typedef struct zap_table_phys zap_table_phys_t; + +typedef struct zap { + objset_t *zap_objset; + uint64_t zap_object; + struct dmu_buf *zap_dbuf; + krwlock_t zap_rwlock; + boolean_t zap_ismicro; + int zap_normflags; + uint64_t zap_salt; + union { + struct { + zap_phys_t *zap_phys; + + /* + * zap_num_entries_mtx protects + * zap_num_entries + */ + kmutex_t zap_num_entries_mtx; + int zap_block_shift; + } zap_fat; + struct { + mzap_phys_t *zap_phys; + int16_t zap_num_entries; + int16_t zap_num_chunks; + int16_t zap_alloc_next; + avl_tree_t zap_avl; + } zap_micro; + } zap_u; +} zap_t; + +typedef struct zap_name { + zap_t *zn_zap; + const char *zn_name_orij; + uint64_t zn_hash; + matchtype_t zn_matchtype; + const char *zn_name_norm; + char zn_normbuf[ZAP_MAXNAMELEN]; +} zap_name_t; + +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + +boolean_t zap_match(zap_name_t *zn, const char *matchname); +int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); +void zap_unlockdir(zap_t *zap); +void zap_evict(dmu_buf_t *db, void *vmzap); +zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt); +void zap_name_free(zap_name_t *zn); + +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +void fzap_byteswap(void *buf, size_t size); +int fzap_count(zap_t *zap, uint64_t *count); +int fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers); +int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); +int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); +void fzap_get_stats(zap_t *zap, zap_stats_t *zs); +void zap_put_leaf(struct zap_leaf *l); + +int fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_IMPL_H */ diff --git a/module/zfs/include/sys/zap_leaf.h b/module/zfs/include/sys/zap_leaf.h new file mode 100644 index 000000000..14144e059 --- /dev/null +++ b/module/zfs/include/sys/zap_leaf.h @@ -0,0 +1,244 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_LEAF_H +#define _SYS_ZAP_LEAF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +struct zap; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<<l->l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ +#define ZAP_LEAF_NUMCHUNKS(l) \ + (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ + ZAP_LEAF_CHUNKSIZE - 2) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ + (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ +#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. + */ +#define ZAP_LEAF_CHUNK(l, idx) \ + ((zap_leaf_chunk_t *) \ + ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) + +typedef enum zap_chunk_type { + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +} zap_chunk_type_t; + +#define ZLF_ENTRIES_CDSORTED (1<<0) + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + uint64_t lh_block_type; /* ZBT_LEAF */ + uint64_t lh_pad1; + uint64_t lh_prefix; /* hash prefix of this leaf */ + uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lh_nfree; /* number free chunks */ + uint16_t lh_nentries; /* number of entries */ + uint16_t lh_prefix_len; /* num bits used to id this */ + +/* above is accessable to zap, below is zap_leaf private */ + + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_flags; /* ZLF_* flags */ + uint8_t lh_pad2[11]; + } l_hdr; /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. + */ + + uint16_t l_hash[1]; +} zap_leaf_phys_t; + +typedef union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ + uint8_t le_int_size; /* size of ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_length; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_CHUNK_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; +} zap_leaf_chunk_t; + +typedef struct zap_leaf { + krwlock_t l_rwlock; + uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ + int l_bs; /* block size shift */ + dmu_buf_t *l_dbuf; + zap_leaf_phys_t *l_phys; +} zap_leaf_t; + + +typedef struct zap_entry_handle { + /* below is set by zap_leaf.c and is public to zap.c */ + uint64_t zeh_num_integers; + uint64_t zeh_hash; + uint32_t zeh_cd; + uint8_t zeh_integer_size; + + /* below is private to zap_leaf.c */ + uint16_t zeh_fakechunk; + uint16_t *zeh_chunkp; + zap_leaf_t *zeh_leaf; +} zap_entry_handle_t; + +/* + * Return a handle to the named entry, or ENOENT if not found. The hash + * value must equal zap_hash(name). + */ +extern int zap_leaf_lookup(zap_leaf_t *l, + zap_name_t *zn, zap_entry_handle_t *zeh); + +/* + * Return a handle to the entry with this hash+cd, or the entry with the + * next closest hash+cd. + */ +extern int zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); + +/* + * Read the first num_integers in the attribute. Integer size + * conversion will be done without sign extension. Return EINVAL if + * integer_size is too small. Return EOVERFLOW if there are more than + * num_integers in the attribute. + */ +extern int zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf); + +extern int zap_entry_read_name(const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); + +/* + * Replace the value of an existing entry. + * + * zap_entry_update may fail if it runs out of space (ENOSPC). + */ +extern int zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf); + +/* + * Remove an entry. + */ +extern void zap_entry_remove(zap_entry_handle_t *zeh); + +/* + * Create an entry. An equal entry must not exist, and this entry must + * belong in this leaf (according to its hash value). Fills in the + * entry handle on success. Returns 0 on success or ENOSPC on failure. + */ +extern int zap_entry_create(zap_leaf_t *l, + const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); + +/* + * Return true if there are additional entries with the same normalized + * form. + */ +extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, + zap_name_t *zn, const char *name, zap_t *zap); + +/* + * Other stuff. + */ + +extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); +extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_LEAF_H */ diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h new file mode 100644 index 000000000..bd91b33d1 --- /dev/null +++ b/module/zfs/include/sys/zfs_acl.h @@ -0,0 +1,214 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ACL_H +#define _SYS_FS_ZFS_ACL_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#endif +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/zfs_fuid.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct znode_phys; + +#define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs are store in various forms. + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. + * + * Files with ACL version ZFS_ACL_VERSION_FUID will be created + * with various sized ACEs. The abstraction entries will utilize + * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t + * and some specialized CIFS ACEs will use zfs_object_ace_t. + */ + +/* + * All ACEs have a common hdr. For + * owner@, group@, and everyone@ this is all + * thats needed. + */ +typedef struct zfs_ace_hdr { + uint16_t z_type; + uint16_t z_flags; + uint32_t z_access_mask; +} zfs_ace_hdr_t; + +typedef zfs_ace_hdr_t zfs_ace_abstract_t; + +/* + * Standard ACE + */ +typedef struct zfs_ace { + zfs_ace_hdr_t z_hdr; + uint64_t z_fuid; +} zfs_ace_t; + +/* + * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE + * and will only be set/retrieved in a CIFS context. + */ + +typedef struct zfs_object_ace { + zfs_ace_t z_ace; + uint8_t z_object_type[16]; /* object type */ + uint8_t z_inherit_type[16]; /* inherited object type */ +} zfs_object_ace_t; + +typedef struct zfs_oldace { + uint32_t z_fuid; /* "who" */ + uint32_t z_access_mask; /* access mask */ + uint16_t z_flags; /* flags, i.e inheritance */ + uint16_t z_type; /* type of entry allow/deny */ +} zfs_oldace_t; + +typedef struct zfs_acl_phys_v0 { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_acl_phys_v0_t; + +#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) + +typedef struct zfs_acl_phys { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_size; /* Number of bytes in ACL */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_count; /* ace count */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ +} zfs_acl_phys_t; + + + +typedef struct acl_ops { + uint32_t (*ace_mask_get) (void *acep); /* get access mask */ + void (*ace_mask_set) (void *acep, + uint32_t mask); /* set access mask */ + uint16_t (*ace_flags_get) (void *acep); /* get flags */ + void (*ace_flags_set) (void *acep, + uint16_t flags); /* set flags */ + uint16_t (*ace_type_get)(void *acep); /* get type */ + void (*ace_type_set)(void *acep, + uint16_t type); /* set type */ + uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ + void (*ace_who_set)(void *acep, + uint64_t who); /* set who/fuid */ + size_t (*ace_size)(void *acep); /* how big is this ace */ + size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ + int (*ace_mask_off)(void); /* off of access mask in ace */ + int (*ace_data)(void *acep, void **datap); + /* ptr to data if any */ +} acl_ops_t; + +/* + * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. + * Each node will have one or more ACEs associated with it. You will + * only have multiple nodes during a chmod operation. Normally only + * one node is required. + */ +typedef struct zfs_acl_node { + list_node_t z_next; /* Next chunk of ACEs */ + void *z_acldata; /* pointer into actual ACE(s) */ + void *z_allocdata; /* pointer to kmem allocated memory */ + size_t z_allocsize; /* Size of blob in bytes */ + size_t z_size; /* length of ACL data */ + int z_ace_count; /* number of ACEs in this acl node */ + int z_ace_idx; /* ace iterator positioned on */ +} zfs_acl_node_t; + +typedef struct zfs_acl { + int z_acl_count; /* Number of ACEs */ + size_t z_acl_bytes; /* Number of bytes in ACL */ + uint_t z_version; /* version of ACL */ + void *z_next_ace; /* pointer to next ACE */ + int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ + list_t z_acl; /* chunks of ACE data */ + acl_ops_t z_ops; /* ACL operations */ + boolean_t z_has_fuids; /* FUIDs present in ACL? */ +} zfs_acl_t; + +#define ACL_DATA_ALLOCED 0x1 +#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) + +/* + * Property values for acl_mode and acl_inherit. + * + * acl_mode can take discard, noallow, groupmask and passthrough. + * whereas acl_inherit has secure instead of groupmask. + */ + +#define ZFS_ACL_DISCARD 0 +#define ZFS_ACL_NOALLOW 1 +#define ZFS_ACL_GROUPMASK 2 +#define ZFS_ACL_PASSTHROUGH 3 +#define ZFS_ACL_RESTRICTED 4 +#define ZFS_ACL_PASSTHROUGH_X 5 + +struct znode; +struct zfsvfs; +struct zfs_fuid_info; + +#ifdef _KERNEL +void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, + dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); +int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +void zfs_acl_rele(void *); +void zfs_oldace_byteswap(ace_t *, int); +void zfs_ace_byteswap(void *, size_t, boolean_t); +extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); +extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); +extern int zfs_acl_access(struct znode *, int, cred_t *); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); +int zfs_zaccess_rename(struct znode *, struct znode *, + struct znode *, struct znode *, cred_t *cr); +void zfs_acl_free(zfs_acl_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, + struct zfs_fuid_info **, dmu_tx_t *); + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/module/zfs/include/sys/zfs_context.h b/module/zfs/include/sys/zfs_context.h new file mode 100644 index 000000000..a5be3e130 --- /dev/null +++ b/module/zfs/include/sys/zfs_context.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/note.h> +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/buf.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> +#include <sys/kobj.h> +#include <sys/conf.h> +#include <sys/disp.h> +#include <sys/debug.h> +#include <sys/random.h> +#include <sys/byteorder.h> +#include <sys/systm.h> +#include <sys/list.h> +#include <sys/uio.h> +#include <sys/dirent.h> +#include <sys/time.h> +#include <vm/seg_kmem.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/zfs_debug.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/fm/util.h> + +#define CPU_SEQID (CPU->cpu_seqid) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/module/zfs/include/sys/zfs_ctldir.h b/module/zfs/include/sys/zfs_ctldir.h new file mode 100644 index 000000000..ce29625d1 --- /dev/null +++ b/module/zfs/include/sys/zfs_ctldir.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +vnode_t *zfsctl_root(znode_t *); +void zfsctl_init(void); +void zfsctl_fini(void); + +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp); + +int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen, + fid_t *fidp); +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/module/zfs/include/sys/zfs_debug.h b/module/zfs/include/sys/zfs_debug.h new file mode 100644 index 000000000..450ac1c81 --- /dev/null +++ b/module/zfs/include/sys/zfs_debug.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_DEBUG_H +#define _SYS_ZFS_DEBUG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +/* + * ZFS debugging + */ + +#if defined(DEBUG) || !defined(_KERNEL) +#define ZFS_DEBUG +#endif + +extern int zfs_flags; + +#define ZFS_DEBUG_DPRINTF 0x0001 +#define ZFS_DEBUG_DBUF_VERIFY 0x0002 +#define ZFS_DEBUG_DNODE_VERIFY 0x0004 +#define ZFS_DEBUG_SNAPNAMES 0x0008 +#define ZFS_DEBUG_MODIFY 0x0010 + +#ifdef ZFS_DEBUG +extern void __dprintf(const char *file, const char *func, + int line, const char *fmt, ...); +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ + __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) +#else +#define dprintf(...) ((void)0) +#endif /* ZFS_DEBUG */ + +extern void zfs_panic_recover(const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/module/zfs/include/sys/zfs_dir.h b/module/zfs/include/sys/zfs_dir.h new file mode 100644 index 000000000..ebb66e8ae --- /dev/null +++ b/module/zfs/include/sys/zfs_dir.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/pathname.h> +#include <sys/dmu.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_REPLAY 0x04 /* we are replaying intent log */ + +extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, + int, int *, pathname_t *); +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, + boolean_t *); +extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, + pathname_t *); +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); +extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/module/zfs/include/sys/zfs_fuid.h b/module/zfs/include/sys/zfs_fuid.h new file mode 100644 index 000000000..810ffc81a --- /dev/null +++ b/module/zfs/include/sys/zfs_fuid.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_FUID_H +#define _SYS_FS_ZFS_FUID_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/sid.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#endif +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_OWNER, + ZFS_GROUP, + ZFS_ACE_USER, + ZFS_ACE_GROUP +} zfs_fuid_type_t; + +/* + * Estimate space needed for one more fuid table entry. + * for now assume its current size + 1K + */ +#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) + +#define FUID_INDEX(x) (x >> 32) +#define FUID_RID(x) (x & 0xffffffff) +#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +/* + * FUIDs cause problems for the intent log + * we need to replay the creation of the FUID, + * but we can't count on the idmapper to be around + * and during replay the FUID index may be different than + * before. Also, if an ACL has 100 ACEs and 12 different + * domains we don't want to log 100 domain strings, but rather + * just the unique 12. + */ + +/* + * The FUIDs in the log will index into + * domain string table and the bottom half will be the rid. + * Used for mapping ephemeral uid/gid during ACL setting to FUIDs + */ +typedef struct zfs_fuid { + list_node_t z_next; + uint64_t z_id; /* uid/gid being converted to fuid */ + uint64_t z_domidx; /* index in AVL domain table */ + uint64_t z_logfuid; /* index for domain in log */ +} zfs_fuid_t; + +/* list of unique domains */ +typedef struct zfs_fuid_domain { + list_node_t z_next; + uint64_t z_domidx; /* AVL tree idx */ + const char *z_domain; /* domain string */ +} zfs_fuid_domain_t; + +/* + * FUID information necessary for logging create, setattr, and setacl. + */ +typedef struct zfs_fuid_info { + list_t z_fuids; + list_t z_domains; + uint64_t z_fuid_owner; + uint64_t z_fuid_group; + char **z_domain_table; /* Used during replay */ + uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ + uint32_t z_domain_cnt; /* How many domains */ + size_t z_domain_str_sz; /* len of domain strings z_domain list */ +} zfs_fuid_info_t; + +#ifdef _KERNEL +struct znode; +extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_destroy(zfsvfs_t *); +extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, + dmu_tx_t *, cred_t *, zfs_fuid_info_t **); +extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, + dmu_tx_t *, zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, + uid_t *gid); +extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); +extern void zfs_fuid_info_free(); +extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +#endif + +char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); +void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h new file mode 100644 index 000000000..1692608bb --- /dev/null +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_IOCTL_H +#define _SYS_ZFS_IOCTL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cred.h> +#include <sys/dmu.h> +#include <sys/zio.h> +#include <sys/dsl_deleg.h> + +#ifdef _KERNEL +#include <sys/nvpair.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Property values for snapdir + */ +#define ZFS_SNAPDIR_HIDDEN 0 +#define ZFS_SNAPDIR_VISIBLE 1 + +#define DMU_BACKUP_STREAM_VERSION (1ULL) +#define DMU_BACKUP_HEADER_VERSION (2ULL) +#define DMU_BACKUP_MAGIC 0x2F5bacbacULL + +#define DRR_FLAG_CLONE (1<<0) +#define DRR_FLAG_CI_DATA (1<<1) + +/* + * zfs ioctl command structure + */ +typedef struct dmu_replay_record { + enum { + DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, + DRR_WRITE, DRR_FREE, DRR_END, + } drr_type; + uint32_t drr_payloadlen; + union { + struct drr_begin { + uint64_t drr_magic; + uint64_t drr_version; + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_flags; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; + } drr_begin; + struct drr_end { + zio_cksum_t drr_checksum; + } drr_end; + struct drr_object { + uint64_t drr_object; + dmu_object_type_t drr_type; + dmu_object_type_t drr_bonustype; + uint32_t drr_blksz; + uint32_t drr_bonuslen; + uint8_t drr_checksum; + uint8_t drr_compress; + uint8_t drr_pad[6]; + /* bonus content follows */ + } drr_object; + struct drr_freeobjects { + uint64_t drr_firstobj; + uint64_t drr_numobjs; + } drr_freeobjects; + struct drr_write { + uint64_t drr_object; + dmu_object_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_offset; + uint64_t drr_length; + /* content follows */ + } drr_write; + struct drr_free { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + } drr_free; + } drr_u; +} dmu_replay_record_t; + +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_pad; /* pad out to 64 bit alignment */ +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 + +typedef struct zfs_share { + uint64_t z_exportdata; + uint64_t z_sharedata; + uint64_t z_sharetype; /* 0 = share, 1 = unshare */ + uint64_t z_sharemax; /* max length of share string */ +} zfs_share_t; + +/* + * ZFS file systems may behave the usual, POSIX-compliant way, where + * name lookups are case-sensitive. They may also be set up so that + * all the name lookups are case-insensitive, or so that only some + * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. + */ +typedef enum zfs_case { + ZFS_CASE_SENSITIVE, + ZFS_CASE_INSENSITIVE, + ZFS_CASE_MIXED +} zfs_case_t; + +typedef struct zfs_cmd { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + zfs_share_t zc_share; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; +} zfs_cmd_t; + +#define ZVOL_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) + +#ifdef _KERNEL + +typedef struct zfs_creat { + nvlist_t *zct_zplprops; + nvlist_t *zct_props; +} zfs_creat_t; + +extern dev_info_t *zfs_dip; + +extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); +extern int zfs_secpolicy_rename_perms(const char *from, + const char *to, cred_t *cr); +extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); +extern int zfs_busy(void); +extern int zfs_unmount_snap(char *, void *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/module/zfs/include/sys/zfs_rlock.h b/module/zfs/include/sys/zfs_rlock.h new file mode 100644 index 000000000..f302b663e --- /dev/null +++ b/module/zfs/include/sys/zfs_rlock.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_RLOCK_H +#define _SYS_FS_ZFS_RLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#include <sys/zfs_znode.h> + +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rl { + znode_t *r_zp; /* znode this lock applies to */ + avl_node_t r_node; /* avl node link */ + uint64_t r_off; /* file range offset */ + uint64_t r_len; /* file range length */ + uint_t r_cnt; /* range reference count in tree */ + rl_type_t r_type; /* range type */ + kcondvar_t r_wr_cv; /* cv for waiting writers */ + kcondvar_t r_rd_cv; /* cv for waiting readers */ + uint8_t r_proxy; /* acting for original range */ + uint8_t r_write_wanted; /* writer wants to lock this range */ + uint8_t r_read_wanted; /* reader wants to lock this range */ +} rl_t; + +/* + * Lock a range (offset, length) as either shared (READER) + * or exclusive (WRITER or APPEND). APPEND is a special type that + * is converted to WRITER that specified to lock from the start of the + * end of file. zfs_range_lock() returns the range lock structure. + */ +rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); + +/* + * Unlock range and destroy range lock structure. + */ +void zfs_range_unlock(rl_t *rl); + +/* + * Reduce range locked as RW_WRITER from whole file to specified range. + * Asserts the whole file was previously locked. + */ +void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); + +/* + * AVL comparison function used to compare range locks + */ +int zfs_range_compare(const void *arg1, const void *arg2); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h new file mode 100644 index 000000000..87b75e6e7 --- /dev/null +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -0,0 +1,140 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/list.h> +#include <sys/vfs.h> +#include <sys/zil.h> +#include <sys/rrwlock.h> +#include <sys/zfs_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrwlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + vnode_t *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + uint64_t z_version; /* ZPL version */ +#define ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ +}; + +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h new file mode 100644 index 000000000..a5416525c --- /dev/null +++ b/module/zfs/include/sys/zfs_znode.h @@ -0,0 +1,356 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ZNODE_H +#define _SYS_FS_ZFS_ZNODE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/attr.h> +#include <sys/list.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#include <sys/rrwlock.h> +#endif +#include <sys/zfs_acl.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Additional file level attributes, that are stored + * in the upper half of zp_flags + */ +#define ZFS_READONLY 0x0000000100000000 +#define ZFS_HIDDEN 0x0000000200000000 +#define ZFS_SYSTEM 0x0000000400000000 +#define ZFS_ARCHIVE 0x0000000800000000 +#define ZFS_IMMUTABLE 0x0000001000000000 +#define ZFS_NOUNLINK 0x0000002000000000 +#define ZFS_APPENDONLY 0x0000004000000000 +#define ZFS_NODUMP 0x0000008000000000 +#define ZFS_OPAQUE 0x0000010000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 + +#define ZFS_ATTR_SET(zp, attr, value) \ +{ \ + if (value) \ + zp->z_phys->zp_flags |= attr; \ + else \ + zp->z_phys->zp_flags &= ~attr; \ +} + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ + +/* + * Is ID ephemeral? + */ +#define IS_EPHEMERAL(x) (x > MAXUID) + +/* + * Should we use FUIDs? + */ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + +#define MASTER_NODE_OBJ 1 + +/* + * Special attributes for master node. + */ +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" + +#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) + +/* Path component length */ +/* + * The generic fs code uses MAXNAMELEN to represent + * what the largest component length is. Unfortunately, + * this length includes the terminating NULL. ZFS needs + * to tell the users via pathconf() and statvfs() what the + * true maximum length of a component is, excluding the NULL. + */ +#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) + +/* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. + */ +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#ifdef _KERNEL +typedef struct zfs_dirlock { + char *dl_name; /* directory entry being locked */ + uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint16_t dl_namesize; /* set if dl_name was allocated */ + kcondvar_t dl_cv; /* wait for entry to be unlocked */ + struct znode *dl_dzp; /* directory znode */ + struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ +} zfs_dirlock_t; + +typedef struct znode { + struct zfsvfs *z_zfsvfs; + vnode_t *z_vnode; + uint64_t z_id; /* object ID for this znode */ + kmutex_t z_lock; /* znode modification lock */ + krwlock_t z_map_lock; /* page map lock */ + krwlock_t z_parent_lock; /* parent lock for directories */ + krwlock_t z_name_lock; /* "master" lock for dirent locks */ + zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ + kmutex_t z_range_lock; /* protects changes to z_range_avl */ + avl_tree_t z_range_avl; /* avl tree of file range locks */ + uint8_t z_unlinked; /* file has been unlinked */ + uint8_t z_atime_dirty; /* atime needs to be synced */ + uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint_t z_blksz; /* block size in bytes */ + uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ + uint64_t z_last_itx; /* last ZIL itx on this znode */ + uint64_t z_gen; /* generation (same as zp_gen) */ + uint32_t z_sync_cnt; /* synchronous open count */ + kmutex_t z_acl_lock; /* acl data lock */ + list_node_t z_link_node; /* all znodes in fs link */ + /* + * These are dmu managed fields. + */ + znode_phys_t *z_phys; /* pointer to persistent znode */ + dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ +} znode_t; + + +/* + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + */ + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)(VP)->v_data) + +/* + * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. + * ZFS_EXIT() must be called before exitting the vop. + * ZFS_VERIFY_ZP() verifies the znode is valid. + */ +#define ZFS_ENTER(zfsvfs) \ + { \ + rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } + +#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_dbuf == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) + +/* + * Macros to encode/decode ZFS stored time values from/to struct timespec + */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} + +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} + +/* + * Timestamp defines + */ +#define ACCESSED (AT_ATIME) +#define STATE_CHANGED (AT_CTIME) +#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_time_stamper(zp, ACCESSED, NULL) + +extern int zfs_init_fs(zfsvfs_t *, znode_t **); +extern void zfs_set_dataprop(objset_t *); +extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, + dmu_tx_t *tx); +extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); +extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); +extern void zfs_znode_init(void); +extern void zfs_znode_fini(void); +extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); +extern void zfs_zinactive(znode_t *); +extern void zfs_znode_delete(znode_t *, dmu_tx_t *); +extern void zfs_znode_free(znode_t *); +extern void zfs_remove_op_tables(); +extern int zfs_create_op_tables(); +extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); +extern dev_t zfs_cmpldev(uint64_t); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); +extern int zfs_set_version(const char *name, uint64_t newvers); +extern int zfs_get_stats(objset_t *os, nvlist_t *nv); +extern void zfs_znode_dmu_fini(znode_t *); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, + vattr_t *vap); +extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, + vattr_t *vap); +extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name); +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name); +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); +extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag); +extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len); +extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); + +extern caddr_t zfs_map_page(page_t *, enum seg_rw); +extern void zfs_unmap_page(page_t *, caddr_t); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +#endif /* _KERNEL */ + +extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h new file mode 100644 index 000000000..4d02d14f7 --- /dev/null +++ b/module/zfs/include/sys/zil.h @@ -0,0 +1,382 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_H +#define _SYS_ZIL_H + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Intent log format: + * + * Each objset has its own intent log. The log header (zil_header_t) + * for objset N's intent log is kept in the Nth object of the SPA's + * intent_log objset. The log header points to a chain of log blocks, + * each of which contains log records (i.e., transactions) followed by + * a log block trailer (zil_trailer_t). The format of a log record + * depends on the record (or transaction) type, but all records begin + * with a common structure that defines the type, length, and txg. + */ + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_pad[5]; +} zil_header_t; + +/* + * Log block trailer - structure at the end of the header and each log block + * + * The zit_bt contains a zbt_cksum which for the intent log is + * the sequence number of this log block. A seq of 0 is invalid. + * The zbt_cksum is checked by the SPA against the sequence + * number passed in the blk_cksum field of the blkptr_t + */ +typedef struct zil_trailer { + uint64_t zit_pad; + blkptr_t zit_next_blk; /* next block in chain */ + uint64_t zit_nused; /* bytes in log block used */ + zio_block_tail_t zit_bt; /* block trailer */ +} zil_trailer_t; + +#define ZIL_MIN_BLKSZ 4096ULL +#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE +#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) + +/* + * The words of a log block checksum. + */ +#define ZIL_ZC_GUID_0 0 +#define ZIL_ZC_GUID_1 1 +#define ZIL_ZC_OBJSET 2 +#define ZIL_ZC_SEQ 3 + +typedef enum zil_create { + Z_FILE, + Z_DIR, + Z_XATTRDIR, +} zil_create_t; + +/* + * size of xvattr log section. + * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps + * for create time and a single 64 bit integer for all of the attributes, + * and 4 64 bit integers (32 bytes) for the scanstamp. + * + */ + +#define ZIL_XVAT_SIZE(mapsize) \ + sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ + (sizeof (uint64_t) * 7) + +/* + * Size of ACL in log. The ACE data is padded out to properly align + * on 8 byte boundary. + */ + +#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) + +/* + * Intent log transaction types and record structures + */ +#define TX_CREATE 1 /* Create file */ +#define TX_MKDIR 2 /* Make directory */ +#define TX_MKXATTR 3 /* Make XATTR directory */ +#define TX_SYMLINK 4 /* Create symbolic link to a file */ +#define TX_REMOVE 5 /* Remove file */ +#define TX_RMDIR 6 /* Remove directory */ +#define TX_LINK 7 /* Create hard link to a file */ +#define TX_RENAME 8 /* Rename a file */ +#define TX_WRITE 9 /* File write */ +#define TX_TRUNCATE 10 /* Truncate a file */ +#define TX_SETATTR 11 /* Set file attributes */ +#define TX_ACL_V0 12 /* Set old formatted ACL */ +#define TX_ACL 13 /* Set ACL */ +#define TX_CREATE_ACL 14 /* create with ACL */ +#define TX_CREATE_ATTR 15 /* create + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_MKDIR_ACL 17 /* mkdir with ACL */ +#define TX_MKDIR_ATTR 18 /* mkdir with attr */ +#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ +#define TX_MAX_TYPE 20 /* Max transaction type */ + +/* + * The transactions for mkdir, symlink, remove, rmdir, link, and rename + * may have the following bit set, indicating the original request + * specified case-insensitive handling of names. + */ +#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ + +/* + * Format of log records. + * The fields are carefully defined to allow them to be aligned + * and sized the same on sparc & intel architectures. + * Each log record has a common structure at the beginning. + * + * Note, lrc_seq holds two different sequence numbers. Whilst in memory + * it contains the transaction sequence number. The log record on + * disk holds the sequence number of all log records which is used to + * ensure we don't replay the same record. The two sequence numbers are + * different because the transactions can now be pushed out of order. + */ +typedef struct { /* common log record header */ + uint64_t lrc_txtype; /* intent log transaction type */ + uint64_t lrc_reclen; /* transaction record length */ + uint64_t lrc_txg; /* dmu transaction group number */ + uint64_t lrc_seq; /* see comment above */ +} lr_t; + +/* + * Handle option extended vattr attributes. + * + * Whenever new attributes are added the version number + * will need to be updated as will code in + * zfs_log.c and zfs_replay.c + */ +typedef struct { + uint32_t lr_attr_masksize; /* number of elements in array */ + uint32_t lr_attr_bitmap; /* First entry of array */ + /* remainder of array and any additional fields */ +} lr_attr_t; + +/* + * log record for creates without optional ACL. + * This log record does support optional xvattr_t attributes. + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* object id of directory */ + uint64_t lr_foid; /* object id of created file object */ + uint64_t lr_mode; /* mode of object */ + uint64_t lr_uid; /* uid of object */ + uint64_t lr_gid; /* gid of object */ + uint64_t lr_gen; /* generation (txg of creation) */ + uint64_t lr_crtime[2]; /* creation time */ + uint64_t lr_rdev; /* rdev of object to create */ + /* name of object to create follows this */ + /* for symlinks, link content follows name */ + /* for creates with xvattr data, the name follows the xvattr info */ +} lr_create_t; + +/* + * FUID ACL record will be an array of ACEs from the original ACL. + * If this array includes ephemeral IDs, the record will also include + * an array of log-specific FUIDs to replace the ephemeral IDs. + * Only one copy of each unique domain will be present, so the log-specific + * FUIDs will use an index into a compressed domain table. On replay this + * information will be used to construct real FUIDs (and bypass idmap, + * since it may not be available). + */ + +/* + * Log record for creates with optional ACL + * This log record is also used for recording any FUID + * information needed for replaying the create. If the + * file doesn't have any actual ACEs then the lr_aclcnt + * would be zero. + */ +typedef struct { + lr_create_t lr_create; /* common create portion */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ + /* if create is also setting xvattr's, then acl data follows xvattr */ + /* if ACE FUIDs are needed then they will follow the xvattr_t */ + /* Following the FUIDs will be the domain table information. */ + /* The FUIDs for the owner and group will be in the lr_create */ + /* portion of the record. */ + /* name follows ACL data */ +} lr_acl_create_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + /* name of object to remove follows this */ +} lr_remove_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + uint64_t lr_link_obj; /* obj id of link */ + /* name of object to link follows this */ +} lr_link_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_sdoid; /* obj id of source directory */ + uint64_t lr_tdoid; /* obj id of target directory */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to write */ + uint64_t lr_offset; /* offset to write to */ + uint64_t lr_length; /* user data length to write */ + uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + blkptr_t lr_blkptr; /* spa block pointer for replay */ + /* write data will follow for small writes */ +} lr_write_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id of file to truncate */ + uint64_t lr_offset; /* offset to truncate from */ + uint64_t lr_length; /* length to truncate */ +} lr_truncate_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to change attributes */ + uint64_t lr_mask; /* mask of attributes to set */ + uint64_t lr_mode; /* mode to set */ + uint64_t lr_uid; /* uid to set */ + uint64_t lr_gid; /* gid to set */ + uint64_t lr_size; /* size to set */ + uint64_t lr_atime[2]; /* access time */ + uint64_t lr_mtime[2]; /* modification time */ + /* optional attribute lr_attr_t may be here */ +} lr_setattr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of acl entries */ + /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_v0_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ +} lr_acl_t; + +/* + * ZIL structure definitions, interface function prototype and globals. + */ + +/* + * ZFS intent log transaction structure + */ +typedef enum { + WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ + /* and put blkptr in log, rather than actual data) */ + WR_COPIED, /* immediate - data is copied into lr_write_t */ + WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ +} itx_wr_state_t; + +typedef struct itx { + list_node_t itx_node; /* linkage on zl_itx_list */ + void *itx_private; /* type-specific opaque data */ + itx_wr_state_t itx_wr_state; /* write state */ + uint8_t itx_sync; /* synchronous transaction */ + uint64_t itx_sod; /* record size on disk */ + lr_t itx_lr; /* common part of log record */ + /* followed by type-specific part of lr_xx_t and its immediate data */ +} itx_t; + + +/* + * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() + * to handle the cleanup of the dmu_sync() buffer write + */ +typedef struct { + zilog_t *zgd_zilog; /* zilog */ + blkptr_t *zgd_bp; /* block pointer */ + struct rl *zgd_rl; /* range lock */ +} zgd_t; + + +typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, + uint64_t txg); +typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, + uint64_t txg); +typedef int zil_replay_func_t(); +typedef void zil_replay_cleaner_t(); +typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); + +extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); + +extern void zil_init(void); +extern void zil_fini(void); + +extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); +extern void zil_free(zilog_t *zilog); + +extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); +extern void zil_close(zilog_t *zilog); + +extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner); +extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); + +extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); +extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); + +extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); + +extern int zil_claim(char *osname, void *txarg); +extern int zil_check_log_chain(char *osname, void *txarg); +extern int zil_clear_log_chain(char *osname, void *txarg); +extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); +extern void zil_clean(zilog_t *zilog); +extern int zil_is_committed(zilog_t *zilog); + +extern int zil_suspend(zilog_t *zilog); +extern void zil_resume(zilog_t *zilog); + +extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); + +extern int zil_disable; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_H */ diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h new file mode 100644 index 000000000..0fc800b96 --- /dev/null +++ b/module/zfs/include/sys/zil_impl.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_IMPL_H +#define _SYS_ZIL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zil.h> +#include <sys/dmu_objset.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Log write buffer. + */ +typedef struct lwb { + zilog_t *lwb_zilog; /* back pointer to log struct */ + blkptr_t lwb_blk; /* on disk address of this log blk */ + int lwb_nused; /* # used bytes in buffer */ + int lwb_sz; /* size of block and buffer */ + char *lwb_buf; /* log write buffer */ + zio_t *lwb_zio; /* zio for this buffer */ + uint64_t lwb_max_txg; /* highest txg in this lwb */ + txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ + list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ +} lwb_t; + +/* + * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs + * we've touched so we know which ones need a write cache flush at the end. + */ +typedef struct zil_vdev_node { + uint64_t zv_vdev; /* vdev to be flushed */ + avl_node_t zv_node; /* AVL tree linkage */ +} zil_vdev_node_t; + +/* + * Stable storage intent log management structure. One per dataset. + */ +struct zilog { + kmutex_t zl_lock; /* protects most zilog_t fields */ + struct dsl_pool *zl_dmu_pool; /* DSL pool */ + spa_t *zl_spa; /* handle for read/write log */ + const zil_header_t *zl_header; /* log header buffer */ + objset_t *zl_os; /* object set we're logging */ + zil_get_data_t *zl_get_data; /* callback to get object content */ + zio_t *zl_root_zio; /* log writer root zio */ + uint64_t zl_itx_seq; /* next itx sequence number */ + uint64_t zl_commit_seq; /* committed upto this number */ + uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ + uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint32_t zl_suspend; /* log suspend count */ + kcondvar_t zl_cv_writer; /* log writer thread completion */ + kcondvar_t zl_cv_suspend; /* log suspend completion */ + uint8_t zl_suspending; /* log is currently suspending */ + uint8_t zl_keep_first; /* keep first log block in destroy */ + uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_stop_sync; /* for debugging */ + uint8_t zl_writer; /* boolean: write setup in progress */ + uint8_t zl_log_error; /* boolean: log write error */ + list_t zl_itx_list; /* in-memory itx list */ + uint64_t zl_itx_list_sz; /* total size of records on list */ + uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_prev_used; /* previous commit log size used */ + list_t zl_lwb_list; /* in-flight log write list */ + kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ + avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ + taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ + avl_tree_t zl_dva_tree; /* track DVAs during log parse */ + clock_t zl_replay_time; /* lbolt of when replay started */ + uint64_t zl_replay_blks; /* number of log blocks replayed */ +}; + +typedef struct zil_dva_node { + dva_t zn_dva; + avl_node_t zn_node; +} zil_dva_node_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_IMPL_H */ diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h new file mode 100644 index 000000000..4de78dfee --- /dev/null +++ b/module/zfs/include/sys/zio.h @@ -0,0 +1,424 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_H +#define _ZIO_H + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/fs/zfs.h> +#include <sys/zio_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ + +typedef struct zio_block_tail { + uint64_t zbt_magic; /* for validation, endianness */ + zio_cksum_t zbt_cksum; /* 256-bit checksum */ +} zio_block_tail_t; + +/* + * Gang block headers are self-checksumming and contain an array + * of block pointers. + */ +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t) - \ + (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ + sizeof (uint64_t)) + +typedef struct zio_gbh { + blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; + uint64_t zg_filler[SPA_GBH_FILLER]; + zio_block_tail_t zg_tail; +} zio_gbh_phys_t; + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_FUNCTIONS +}; + +#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + +#define ZIO_PRIORITY_NOW (zio_priority_table[0]) +#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) +#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) +#define ZIO_PRIORITY_FREE (zio_priority_table[5]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) +#define ZIO_PRIORITY_TABLE_SIZE 10 + +#define ZIO_FLAG_MUSTSUCCEED 0x00000 +#define ZIO_FLAG_CANFAIL 0x00001 +#define ZIO_FLAG_SPECULATIVE 0x00002 +#define ZIO_FLAG_CONFIG_WRITER 0x00004 +#define ZIO_FLAG_DONT_RETRY 0x00008 + +#define ZIO_FLAG_DONT_CACHE 0x00010 +#define ZIO_FLAG_DONT_QUEUE 0x00020 +#define ZIO_FLAG_DONT_AGGREGATE 0x00040 +#define ZIO_FLAG_DONT_PROPAGATE 0x00080 + +#define ZIO_FLAG_IO_BYPASS 0x00100 +#define ZIO_FLAG_IO_REPAIR 0x00200 +#define ZIO_FLAG_IO_RETRY 0x00400 +#define ZIO_FLAG_IO_REWRITE 0x00800 + +#define ZIO_FLAG_PROBE 0x01000 +#define ZIO_FLAG_RESILVER 0x02000 +#define ZIO_FLAG_SCRUB 0x04000 +#define ZIO_FLAG_SCRUB_THREAD 0x08000 + +#define ZIO_FLAG_GANG_CHILD 0x10000 + +#define ZIO_FLAG_GANG_INHERIT \ + (ZIO_FLAG_CANFAIL | \ + ZIO_FLAG_SPECULATIVE | \ + ZIO_FLAG_CONFIG_WRITER | \ + ZIO_FLAG_DONT_RETRY | \ + ZIO_FLAG_DONT_CACHE | \ + ZIO_FLAG_DONT_AGGREGATE | \ + ZIO_FLAG_RESILVER | \ + ZIO_FLAG_SCRUB | \ + ZIO_FLAG_SCRUB_THREAD) + +#define ZIO_FLAG_VDEV_INHERIT \ + (ZIO_FLAG_GANG_INHERIT | \ + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_IO_RETRY | \ + ZIO_FLAG_PROBE) + +#define ZIO_PIPELINE_CONTINUE 0x100 +#define ZIO_PIPELINE_STOP 0x101 + +#define ZIO_GANG_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ + ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) + +enum zio_child { + ZIO_CHILD_VDEV = 0, + ZIO_CHILD_GANG, + ZIO_CHILD_LOGICAL, + ZIO_CHILD_TYPES +}; + +enum zio_wait_type { + ZIO_WAIT_READY = 0, + ZIO_WAIT_DONE, + ZIO_WAIT_TYPES +}; + +/* + * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent + * graveyard) to indicate checksum errors and fragmentation. + */ +#define ECKSUM EBADE +#define EFRAGS EBADR + +typedef struct zio zio_t; +typedef void zio_done_func_t(zio_t *zio); + +extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; +extern char *zio_type_name[ZIO_TYPES]; + +/* + * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely + * identifies any block in the pool. By convention, the meta-objset (MOS) + * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is + * level -1 of the meta-dnode, and intent log blocks (which are chained + * off the root block) have blkid == sequence number. In summary: + * + * mos is objset 0 + * meta-dnode is object 0 + * root block is <objset, 0, -1, 0> + * intent log is <objset, 0, -1, ZIL sequence number> + * + * Note: this structure is called a bookmark because its first purpose was + * to remember where to resume a pool-wide traverse. The absolute ordering + * for block visitation during traversal is defined in compare_bookmark(). + * + * Note: this structure is passed between userland and the kernel. + * Therefore it must not change size or alignment between 32/64 bit + * compilation options. + */ +typedef struct zbookmark { + uint64_t zb_objset; + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; +} zbookmark_t; + +typedef struct zio_prop { + enum zio_checksum zp_checksum; + enum zio_compress zp_compress; + dmu_object_type_t zp_type; + uint8_t zp_level; + uint8_t zp_ndvas; +} zio_prop_t; + +typedef struct zio_gang_node { + zio_gbh_phys_t *gn_gbh; + struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; +} zio_gang_node_t; + +typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, + zio_gang_node_t *gn, void *data); + +typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); + +typedef struct zio_transform { + void *zt_orig_data; + uint64_t zt_orig_size; + uint64_t zt_bufsize; + zio_transform_func_t *zt_transform; + struct zio_transform *zt_next; +} zio_transform_t; + +typedef int zio_pipe_stage_t(zio_t *zio); + +/* + * The io_reexecute flags are distinct from io_flags because the child must + * be able to propagate them to the parent. The normal io_flags are local + * to the zio, not protected by any lock, and not modifiable by children; + * the reexecute flags are protected by io_lock, modifiable by children, + * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + */ +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 + +struct zio { + /* Core information about this I/O */ + zbookmark_t io_bookmark; + zio_prop_t io_prop; + zio_type_t io_type; + enum zio_child io_child_type; + int io_cmd; + uint8_t io_priority; + uint8_t io_reexecute; + uint8_t io_async_root; + uint64_t io_txg; + spa_t *io_spa; + blkptr_t *io_bp; + blkptr_t io_bp_copy; + zio_t *io_parent; + zio_t *io_child; + zio_t *io_sibling_prev; + zio_t *io_sibling_next; + zio_t *io_logical; + zio_transform_t *io_transform_stack; + + /* Callback info */ + zio_done_func_t *io_ready; + zio_done_func_t *io_done; + void *io_private; + blkptr_t io_bp_orig; + + /* Data represented by this I/O */ + void *io_data; + uint64_t io_size; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + zio_done_func_t *io_vsd_free; + uint64_t io_offset; + uint64_t io_deadline; + avl_node_t io_offset_node; + avl_node_t io_deadline_node; + avl_tree_t *io_vdev_tree; + zio_t *io_delegate_list; + zio_t *io_delegate_next; + + /* Internal pipeline state */ + int io_flags; + zio_stage_t io_stage; + uint32_t io_pipeline; + int io_orig_flags; + zio_stage_t io_orig_stage; + uint32_t io_orig_pipeline; + int io_error; + int io_child_error[ZIO_CHILD_TYPES]; + uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t *io_stall; + zio_gang_node_t *io_gang_tree; + void *io_executor; + void *io_waiter; + kmutex_t io_lock; + kcondvar_t io_cv; + + /* FMA state */ + uint64_t io_ena; +}; + +extern zio_t *zio_null(zio_t *pio, spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_root(spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb); + +extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb); + +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, zbookmark_t *zb); + +extern void zio_skip_write(zio_t *zio); + +extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); + +extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); + +extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t txg); +extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern void zio_flush(zio_t *zio, vdev_t *vd); + +extern int zio_wait(zio_t *zio); +extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); + +extern void *zio_buf_alloc(size_t size); +extern void zio_buf_free(void *buf, size_t size); +extern void *zio_data_buf_alloc(size_t size); +extern void zio_data_buf_free(void *buf, size_t size); + +extern void zio_resubmit_stage_async(void *); + +extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, + uint64_t offset, void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + +extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + +extern void zio_vdev_io_bypass(zio_t *zio); +extern void zio_vdev_io_reissue(zio_t *zio); +extern void zio_vdev_io_redone(zio_t *zio); + +extern void zio_checksum_verified(zio_t *zio); +extern int zio_worst_error(int e1, int e2); + +extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); +extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); + +extern void zio_suspend(spa_t *spa, zio_t *zio); +extern void zio_resume(spa_t *spa); +extern void zio_resume_wait(spa_t *spa); + +/* + * Initial setup and teardown. + */ +extern void zio_init(void); +extern void zio_fini(void); + +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_label_injection(zio_t *zio, int error); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_H */ diff --git a/module/zfs/include/sys/zio_checksum.h b/module/zfs/include/sys/zio_checksum.h new file mode 100644 index 000000000..da407399d --- /dev/null +++ b/module/zfs/include/sys/zio_checksum.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); + +/* + * Information about each checksum function. + */ +typedef struct zio_checksum_info { + zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ + int ci_correctable; /* number of correctable bits */ + int ci_zbt; /* uses zio block tail? */ + char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; + +/* + * Checksum routines. + */ +extern zio_checksum_t fletcher_2_native; +extern zio_checksum_t fletcher_4_native; +extern zio_checksum_t fletcher_4_incremental_native; + +extern zio_checksum_t fletcher_2_byteswap; +extern zio_checksum_t fletcher_4_byteswap; +extern zio_checksum_t fletcher_4_incremental_byteswap; + +extern zio_checksum_t zio_checksum_SHA256; + +extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + void *data, uint64_t size); +extern int zio_checksum_error(zio_t *zio); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/module/zfs/include/sys/zio_compress.h b/module/zfs/include/sys/zio_compress.h new file mode 100644 index 000000000..66ee8d45b --- /dev/null +++ b/module/zfs/include/sys/zio_compress.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_COMPRESS_H +#define _SYS_ZIO_COMPRESS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Common signature for all zio compress/decompress functions. + */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + zio_compress_func_t *ci_compress; /* compression function */ + zio_decompress_func_t *ci_decompress; /* decompression function */ + int ci_level; /* level parameter */ + char *ci_name; /* algorithm name */ +} zio_compress_info_t; + +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; + +/* + * Compression routines. + */ +extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); + +/* + * Compress and decompress data if necessary. + */ +extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, + void **destp, uint64_t *destsizep, uint64_t *destbufsizep); +extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/module/zfs/include/sys/zio_impl.h b/module/zfs/include/sys/zio_impl.h new file mode 100644 index 000000000..e7503b733 --- /dev/null +++ b/module/zfs/include/sys/zio_impl.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_IMPL_H +#define _ZIO_IMPL_H + +#include <sys/zfs_context.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * I/O Groups: pipeline stage definitions. + */ +typedef enum zio_stage { + ZIO_STAGE_OPEN = 0, /* RWFCI */ + + ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ + + ZIO_STAGE_READ_BP_INIT, /* R---- */ + ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + + ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + + ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE, /* RWFC- */ + + ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ + ZIO_STAGE_DVA_FREE, /* --F-- */ + ZIO_STAGE_DVA_CLAIM, /* ---C- */ + + ZIO_STAGE_READY, /* RWFCI */ + + ZIO_STAGE_VDEV_IO_START, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + + ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + + ZIO_STAGE_DONE, /* RWFCI */ + ZIO_STAGES +} zio_stage_t; + +#define ZIO_INTERLOCK_STAGES \ + ((1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_INTERLOCK_PIPELINE \ + ZIO_INTERLOCK_STAGES + +#define ZIO_VDEV_IO_STAGES \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_DONE) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) + +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY)) + +#define ZIO_READ_PHYS_PIPELINE \ + ZIO_READ_COMMON_STAGES + +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + (1U << ZIO_STAGE_READ_BP_INIT)) + +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_ISSUE_ASYNC) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE)) + +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES + +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT)) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE)) + +#define ZIO_GANG_STAGES \ + ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ + (1U << ZIO_STAGE_GANG_ISSUE)) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_FREE)) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_CLAIM)) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) + +#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE) | \ + (1U << ZIO_STAGE_DVA_CLAIM)) + +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_IMPL_H */ diff --git a/module/zfs/include/sys/zvol.h b/module/zfs/include/sys/zvol.h new file mode 100644 index 000000000..06adc667e --- /dev/null +++ b/module/zfs/include/sys/zvol.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZVOL_H +#define _SYS_ZVOL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + +#ifdef _KERNEL +extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); +extern int zvol_check_volblocksize(uint64_t volblocksize); +extern int zvol_get_stats(objset_t *os, nvlist_t *nv); +extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); +extern int zvol_create_minor(const char *, major_t); +extern int zvol_remove_minor(const char *); +extern int zvol_set_volsize(const char *, major_t, uint64_t); +extern int zvol_set_volblocksize(const char *, uint64_t); + +extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); +extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); +extern int zvol_strategy(buf_t *bp); +extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp); +extern int zvol_busy(void); +extern void zvol_init(void); +extern void zvol_fini(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZVOL_H */ diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c new file mode 100644 index 000000000..7fcde8475 --- /dev/null +++ b/module/zfs/lzjb.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * We keep our own copy of this algorithm for 2 main reasons: + * 1. If we didn't, anyone modifying common/os/compress.c would + * directly break our on disk format + * 2. Our version of lzjb does not have a number of checks that the + * common/os version needs and uses + * In particular, we are adding the "feature" that compress() can + * take a destination buffer size and return -1 if the data will not + * compress to d_len or less. + */ + +#include <sys/types.h> + +#define MATCH_BITS 6 +#define MATCH_MIN 3 +#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) +#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) +#define LEMPEL_SIZE 256 + +/*ARGSUSED*/ +size_t +lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *cpy, *copymap; + int copymask = 1 << (NBBY - 1); + int mlen, offset; + uint16_t *hp; + uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */ + + while (src < (uchar_t *)s_start + s_len) { + if ((copymask <<= 1) == (1 << NBBY)) { + if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { + if (d_len != s_len) + return (s_len); + mlen = s_len; + for (src = s_start, dst = d_start; mlen; mlen--) + *dst++ = *src++; + return (s_len); + } + copymask = 1; + copymap = dst; + *dst++ = 0; + } + if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { + *dst++ = *src++; + continue; + } + hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) & + (LEMPEL_SIZE - 1)]; + offset = (intptr_t)(src - *hp) & OFFSET_MASK; + *hp = (uint16_t)(uintptr_t)src; + cpy = src - offset; + if (cpy >= (uchar_t *)s_start && cpy != src && + src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { + *copymap |= copymask; + for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) + if (src[mlen] != cpy[mlen]) + break; + *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | + (offset >> NBBY); + *dst++ = (uchar_t)offset; + src += mlen; + } else { + *dst++ = *src++; + } + } + return (dst - (uchar_t *)d_start); +} + +/*ARGSUSED*/ +int +lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *d_end = (uchar_t *)d_start + d_len; + uchar_t *cpy, copymap; + int copymask = 1 << (NBBY - 1); + + while (dst < d_end) { + if ((copymask <<= 1) == (1 << NBBY)) { + copymask = 1; + copymap = *src++; + } + if (copymap & copymask) { + int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; + int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; + src += 2; + if ((cpy = dst - offset) < (uchar_t *)d_start) + return (-1); + while (--mlen >= 0 && dst < d_end) + *dst++ = *cpy++; + } else { + *dst++ = *src++; + } + } + return (0); +} diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c new file mode 100644 index 000000000..87727fac2 --- /dev/null +++ b/module/zfs/metaslab.c @@ -0,0 +1,1049 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/space_map.h> +#include <sys/metaslab_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ + +/* + * ========================================================================== + * Metaslab classes + * ========================================================================== + */ +metaslab_class_t * +metaslab_class_create(void) +{ + metaslab_class_t *mc; + + mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + + mc->mc_rotor = NULL; + + return (mc); +} + +void +metaslab_class_destroy(metaslab_class_t *mc) +{ + metaslab_group_t *mg; + + while ((mg = mc->mc_rotor) != NULL) { + metaslab_class_remove(mc, mg); + metaslab_group_destroy(mg); + } + + kmem_free(mc, sizeof (metaslab_class_t)); +} + +void +metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +{ + metaslab_group_t *mgprev, *mgnext; + + ASSERT(mg->mg_class == NULL); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; + mg->mg_class = mc; +} + +void +metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +{ + metaslab_group_t *mgprev, *mgnext; + + ASSERT(mg->mg_class == mc); + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; + mg->mg_class = NULL; +} + +/* + * ========================================================================== + * Metaslab groups + * ========================================================================== + */ +static int +metaslab_compare(const void *x1, const void *x2) +{ + const metaslab_t *m1 = x1; + const metaslab_t *m2 = x2; + + if (m1->ms_weight < m2->ms_weight) + return (1); + if (m1->ms_weight > m2->ms_weight) + return (-1); + + /* + * If the weights are identical, use the offset to force uniqueness. + */ + if (m1->ms_map.sm_start < m2->ms_map.sm_start) + return (-1); + if (m1->ms_map.sm_start > m2->ms_map.sm_start) + return (1); + + ASSERT3P(m1, ==, m2); + + return (0); +} + +metaslab_group_t * +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) +{ + metaslab_group_t *mg; + + mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&mg->mg_metaslab_tree, metaslab_compare, + sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); + mg->mg_vd = vd; + metaslab_class_add(mc, mg); + + return (mg); +} + +void +metaslab_group_destroy(metaslab_group_t *mg) +{ + avl_destroy(&mg->mg_metaslab_tree); + mutex_destroy(&mg->mg_lock); + kmem_free(mg, sizeof (metaslab_group_t)); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == NULL); + msp->ms_group = mg; + msp->ms_weight = 0; + avl_add(&mg->mg_metaslab_tree, msp); + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_group = NULL; + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + /* + * Although in principle the weight can be any value, in + * practice we do not use values in the range [1, 510]. + */ + ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + mutex_exit(&mg->mg_lock); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static void +metaslab_ff_load(space_map_t *sm) +{ + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); +} + +static void +metaslab_ff_unload(space_map_t *sm) +{ + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; +} + +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + space_seg_t *ss, ssearch; + avl_index_t where; + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + + while (ss != NULL) { + uint64_t offset = P2ROUNDUP(ss->ss_start, align); + + if (offset + size <= ss->ss_end) { + *cursor = offset + size; + return (offset); + } + ss = AVL_NEXT(t, ss); + } + + /* + * If we know we've searched the whole map (*cursor == 0), give up. + * Otherwise, reset the cursor to the beginning and try again. + */ + if (*cursor == 0) + return (-1ULL); + + *cursor = 0; + return (metaslab_ff_alloc(sm, size)); +} + +/* ARGSUSED */ +static void +metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* ARGSUSED */ +static void +metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +static space_map_ops_t metaslab_ff_ops = { + metaslab_ff_load, + metaslab_ff_unload, + metaslab_ff_alloc, + metaslab_ff_claim, + metaslab_ff_free +}; + +/* + * ========================================================================== + * Metaslabs + * ========================================================================== + */ +metaslab_t * +metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_t *msp; + + msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); + mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); + + msp->ms_smo_syncing = *smo; + + /* + * We create the main space map here, but we don't create the + * allocmaps and freemaps until metaslab_sync_done(). This serves + * two purposes: it allows metaslab_sync_done() to detect the + * addition of new space; and for debugging, it ensures that we'd + * data fault on any attempt to use this metaslab before it's ready. + */ + space_map_create(&msp->ms_map, start, size, + vd->vdev_ashift, &msp->ms_lock); + + metaslab_group_add(mg, msp); + + /* + * If we're opening an existing pool (txg == 0) or creating + * a new one (txg == TXG_INITIAL), all space is available now. + * If we're adding space to an existing pool, the new space + * does not become available until after this txg has synced. + */ + if (txg <= TXG_INITIAL) + metaslab_sync_done(msp, 0); + + if (txg != 0) { + /* + * The vdev is dirty, but the metaslab isn't -- it just needs + * to have metaslab_sync_done() invoked from vdev_sync_done(). + * [We could just dirty the metaslab, but that would cause us + * to allocate a space map object for it, which is wasteful + * and would mess up the locality logic in metaslab_weight().] + */ + ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); + vdev_dirty(vd, 0, NULL, txg); + vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + } + + return (msp); +} + +void +metaslab_fini(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + int t; + + vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, + -msp->ms_smo.smo_alloc, B_TRUE); + + metaslab_group_remove(mg, msp); + + mutex_enter(&msp->ms_lock); + + space_map_unload(&msp->ms_map); + space_map_destroy(&msp->ms_map); + + for (t = 0; t < TXG_SIZE; t++) { + space_map_destroy(&msp->ms_allocmap[t]); + space_map_destroy(&msp->ms_freemap[t]); + } + + mutex_exit(&msp->ms_lock); + mutex_destroy(&msp->ms_lock); + + kmem_free(msp, sizeof (metaslab_t)); +} + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) +#define METASLAB_SMO_BONUS_MULTIPLIER 2 + +static uint64_t +metaslab_weight(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; + vdev_t *vd = mg->mg_vd; + uint64_t weight, space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * The baseline weight is the metaslab's free space. + */ + space = sm->sm_size - smo->smo_alloc; + weight = space; + + /* + * Modern disks have uniform bit density and constant angular velocity. + * Therefore, the outer recording zones are faster (higher bandwidth) + * than the inner zones by the ratio of outer to inner track diameter, + * which is typically around 2:1. We account for this by assigning + * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). + * In effect, this means that we'll select the metaslab with the most + * free bandwidth rather than simply the one with the most free space. + */ + weight = 2 * weight - + ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; + ASSERT(weight >= space && weight <= 2 * space); + + /* + * For locality, assign higher weight to metaslabs we've used before. + */ + if (smo->smo_object != 0) + weight *= METASLAB_SMO_BONUS_MULTIPLIER; + ASSERT(weight >= space && + weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + + /* + * If this metaslab is one we're actively using, adjust its weight to + * make it preferable to any inactive metaslab so we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + + return (weight); +} + +static int +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +{ + space_map_t *sm = &msp->ms_map; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int error = space_map_load(sm, &metaslab_ff_ops, + SM_FREE, &msp->ms_smo, + msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + metaslab_group_sort(msp->ms_group, msp, + msp->ms_weight | activation_weight); + } + ASSERT(sm->sm_loaded); + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); + + return (0); +} + +static void +metaslab_passivate(metaslab_t *msp, uint64_t size) +{ + /* + * If size < SPA_MINBLOCKSIZE, then we will not allocate from + * this metaslab again. In that case, it had better be empty, + * or we would be leaving space on the table. + */ + ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); + metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); + ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); +} + +/* + * Write a metaslab to disk in the context of the specified transaction group. + */ +void +metaslab_sync(metaslab_t *msp, uint64_t txg) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; + space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; + space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo_syncing; + dmu_buf_t *db; + dmu_tx_t *tx; + int t; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + /* + * The only state that can actually be changing concurrently with + * metaslab_sync() is the metaslab's ms_map. No other thread can + * be modifying this txg's allocmap, freemap, freed_map, or smo. + * Therefore, we only hold ms_lock to satify space_map ASSERTs. + * We drop it whenever we call into the DMU, because the DMU + * can call down to us (e.g. via zio_free()) at any time. + */ + mutex_enter(&msp->ms_lock); + + if (smo->smo_object == 0) { + ASSERT(smo->smo_objsize == 0); + ASSERT(smo->smo_alloc == 0); + mutex_exit(&msp->ms_lock); + smo->smo_object = dmu_object_alloc(mos, + DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, + DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); + ASSERT(smo->smo_object != 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + (sm->sm_start >> vd->vdev_ms_shift), + sizeof (uint64_t), &smo->smo_object, tx); + mutex_enter(&msp->ms_lock); + } + + space_map_walk(freemap, space_map_add, freed_map); + + if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= + 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { + /* + * The in-core space map representation is twice as compact + * as the on-disk one, so it's time to condense the latter + * by generating a pure allocmap from first principles. + * + * This metaslab is 100% allocated, + * minus the content of the in-core map (sm), + * minus what's been freed this txg (freed_map), + * minus allocations from txgs in the future + * (because they haven't been committed yet). + */ + space_map_vacate(allocmap, NULL, NULL); + space_map_vacate(freemap, NULL, NULL); + + space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); + + space_map_walk(sm, space_map_remove, allocmap); + space_map_walk(freed_map, space_map_remove, allocmap); + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], + space_map_remove, allocmap); + + mutex_exit(&msp->ms_lock); + space_map_truncate(smo, mos, tx); + mutex_enter(&msp->ms_lock); + } + + space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); + space_map_sync(freemap, SM_FREE, smo, mos, tx); + + mutex_exit(&msp->ms_lock); + + VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); +} + +/* + * Called after a transaction group has completely synced to mark + * all of the metaslab's free space as usable. + */ +void +metaslab_sync_done(metaslab_t *msp, uint64_t txg) +{ + space_map_obj_t *smo = &msp->ms_smo; + space_map_obj_t *smosync = &msp->ms_smo_syncing; + space_map_t *sm = &msp->ms_map; + space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + int t; + + mutex_enter(&msp->ms_lock); + + /* + * If this metaslab is just becoming available, initialize its + * allocmaps and freemaps and add its capacity to the vdev. + */ + if (freed_map->sm_size == 0) { + for (t = 0; t < TXG_SIZE; t++) { + space_map_create(&msp->ms_allocmap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + space_map_create(&msp->ms_freemap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + } + vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + } + + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + + ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); + ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + + /* + * If there's a space_map_load() in progress, wait for it to complete + * so that we have a consistent view of the in-core space map. + * Then, add everything we freed in this txg to the map. + */ + space_map_load_wait(sm); + space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + + *smo = *smosync; + + /* + * If the map is loaded but no longer active, evict it as soon as all + * future allocations have synced. (If we unloaded it now and then + * loaded a moment later, the map wouldn't reflect those allocations.) + */ + if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int evictable = 1; + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) + evictable = 0; + + if (evictable) + space_map_unload(sm); + } + + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + + mutex_exit(&msp->ms_lock); +} + +static uint64_t +metaslab_distance(metaslab_t *msp, dva_t *dva) +{ + uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; + uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; + uint64_t start = msp->ms_map.sm_start >> ms_shift; + + if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) + return (1ULL << 63); + + if (offset < start) + return ((start - offset) << ms_shift); + if (offset > start) + return ((offset - start) << ms_shift); + return (0); +} + +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, + uint64_t min_distance, dva_t *dva, int d) +{ + metaslab_t *msp = NULL; + uint64_t offset = -1ULL; + avl_tree_t *t = &mg->mg_metaslab_tree; + uint64_t activation_weight; + uint64_t target_distance; + int i; + + activation_weight = METASLAB_WEIGHT_PRIMARY; + for (i = 0; i < d; i++) + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + activation_weight = METASLAB_WEIGHT_SECONDARY; + + for (;;) { + mutex_enter(&mg->mg_lock); + for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { + if (msp->ms_weight < size) { + mutex_exit(&mg->mg_lock); + return (-1ULL); + } + + if (activation_weight == METASLAB_WEIGHT_PRIMARY) + break; + + target_distance = min_distance + + (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); + + for (i = 0; i < d; i++) + if (metaslab_distance(msp, &dva[i]) < + target_distance) + break; + if (i == d) + break; + } + mutex_exit(&mg->mg_lock); + if (msp == NULL) + return (-1ULL); + + mutex_enter(&msp->ms_lock); + + /* + * Ensure that the metaslab we have selected is still + * capable of handling our request. It's possible that + * another thread may have changed the weight while we + * were blocked on the metaslab lock. + */ + if (msp->ms_weight < size) { + mutex_exit(&msp->ms_lock); + continue; + } + + if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + metaslab_passivate(msp, + msp->ms_weight & ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; + } + + if (metaslab_activate(msp, activation_weight) != 0) { + mutex_exit(&msp->ms_lock); + continue; + } + + if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + break; + + metaslab_passivate(msp, size - 1); + + mutex_exit(&msp->ms_lock); + } + + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + mutex_exit(&msp->ms_lock); + + return (offset); +} + +/* + * Allocate a block for the specified i/o. + */ +static int +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) +{ + metaslab_group_t *mg, *rotor; + vdev_t *vd; + int dshift = 3; + int all_zero; + uint64_t offset = -1ULL; + uint64_t asize; + uint64_t distance; + + ASSERT(!DVA_IS_VALID(&dva[d])); + + /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + return (ENOSPC); + + /* + * Start at the rotor and loop through all mgs until we find something. + * Note that there's no locking on mc_rotor or mc_allocated because + * nothing actually breaks if we miss a few updates -- we just won't + * allocate quite as evenly. It all balances out over time. + * + * If we are doing ditto or log blocks, try to spread them across + * consecutive vdevs. If we're forced to reuse a vdev before we've + * allocated all of our ditto blocks, then try and spread them out on + * that vdev as much as possible. If it turns out to not be possible, + * gradually lower our standards until anything becomes acceptable. + * Also, allocating on consecutive vdevs (as opposed to random vdevs) + * gives us hope of containing our fault domains to something we're + * able to reason about. Otherwise, any two top-level vdev failures + * will guarantee the loss of data. With consecutive allocation, + * only two adjacent top-level vdev failures will result in data loss. + * + * If we are doing gang blocks (hintdva is non-NULL), try to keep + * ourselves on the same vdev as our gang block header. That + * way, we can hope for locality in vdev_cache, plus it makes our + * fault domains something tractable. + */ + if (hintdva) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); + if (flags & METASLAB_HINTBP_AVOID) + mg = vd->vdev_mg->mg_next; + else + mg = vd->vdev_mg; + } else if (d != 0) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); + mg = vd->vdev_mg->mg_next; + } else { + mg = mc->mc_rotor; + } + + /* + * If the hint put us into the wrong class, just follow the rotor. + */ + if (mg->mg_class != mc) + mg = mc->mc_rotor; + + rotor = mg; +top: + all_zero = B_TRUE; + do { + vd = mg->mg_vd; + /* + * Don't allocate from faulted devices. + */ + if (!vdev_allocatable(vd)) + goto next; + /* + * Avoid writing single-copy data to a failing vdev + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && dshift == 3) { + all_zero = B_FALSE; + goto next; + } + + ASSERT(mg->mg_class == mc); + + distance = vd->vdev_asize >> dshift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + else + all_zero = B_FALSE; + + asize = vdev_psize_to_asize(vd, psize); + ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + + offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + if (offset != -1ULL) { + /* + * If we've just selected this metaslab group, + * figure out whether the corresponding vdev is + * over- or under-used relative to the pool, + * and set an allocation bias to even it out. + */ + if (mc->mc_allocated == 0) { + vdev_stat_t *vs = &vd->vdev_stat; + uint64_t alloc, space; + int64_t vu, su; + + alloc = spa_get_alloc(spa); + space = spa_get_space(spa); + + /* + * Determine percent used in units of 0..1024. + * (This is just to avoid floating point.) + */ + vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); + su = (alloc << 10) / (space + 1); + + /* + * Bias by at most +/- 25% of the aliquot. + */ + mg->mg_bias = ((su - vu) * + (int64_t)mg->mg_aliquot) / (1024 * 4); + } + + if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + mg->mg_aliquot + mg->mg_bias) { + mc->mc_rotor = mg->mg_next; + mc->mc_allocated = 0; + } + + DVA_SET_VDEV(&dva[d], vd->vdev_id); + DVA_SET_OFFSET(&dva[d], offset); + DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); + DVA_SET_ASIZE(&dva[d], asize); + + return (0); + } +next: + mc->mc_rotor = mg->mg_next; + mc->mc_allocated = 0; + } while ((mg = mg->mg_next) != rotor); + + if (!all_zero) { + dshift++; + ASSERT(dshift < 64); + goto top; + } + + bzero(&dva[d], sizeof (dva_t)); + + return (ENOSPC); +} + +/* + * Free the block represented by DVA in the context of the specified + * transaction group. + */ +static void +metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + metaslab_t *msp; + + ASSERT(DVA_IS_VALID(dva)); + + if (txg > spa_freeze_txg(spa)) + return; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { + cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)offset); + ASSERT(0); + return; + } + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + mutex_enter(&msp->ms_lock); + + if (now) { + space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + offset, size); + space_map_free(&msp->ms_map, offset, size); + } else { + if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); + } + + mutex_exit(&msp->ms_lock); +} + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + metaslab_t *msp; + int error; + + ASSERT(DVA_IS_VALID(dva)); + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) + return (ENXIO); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + mutex_enter(&msp->ms_lock); + + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + if (error || txg == 0) { /* txg == 0 indicates dry run */ + mutex_exit(&msp->ms_lock); + return (error); + } + + space_map_claim(&msp->ms_map, offset, size); + + if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */ + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + } + + mutex_exit(&msp->ms_lock); + + return (0); +} + +int +metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) +{ + dva_t *dva = bp->blk_dva; + dva_t *hintdva = hintbp->blk_dva; + int error = 0; + + ASSERT(bp->blk_birth == 0); + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (ENOSPC); + } + + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); + ASSERT(BP_GET_NDVAS(bp) == 0); + ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + + for (int d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + txg, flags); + if (error) { + for (d--; d >= 0; d--) { + metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + bzero(&dva[d], sizeof (dva_t)); + } + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (error); + } + } + ASSERT(error == 0); + ASSERT(BP_GET_NDVAS(bp) == ndvas); + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + bp->blk_birth = txg; + + return (0); +} + +void +metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) + metaslab_free_dva(spa, &dva[d], txg, now); + + spa_config_exit(spa, SCL_FREE, FTAG); +} + +int +metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int error = 0; + + ASSERT(!BP_IS_HOLE(bp)); + + if (txg != 0) { + /* + * First do a dry run to make sure all DVAs are claimable, + * so we don't have to unwind from partial failures below. + */ + if ((error = metaslab_claim(spa, bp, 0)) != 0) + return (error); + } + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) + if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) + break; + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + ASSERT(error == 0 || txg == 0); + + return (error); +} diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c new file mode 100644 index 000000000..f1b3b23fe --- /dev/null +++ b/module/zfs/refcount.c @@ -0,0 +1,195 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#if defined(DEBUG) || !defined(_KERNEL) + +#ifdef _KERNEL +int reference_tracking_enable = FALSE; /* runs out of memory too easily */ +#else +int reference_tracking_enable = TRUE; +#endif +int reference_history = 4; /* tunable */ + +static kmem_cache_t *reference_cache; +static kmem_cache_t *reference_history_cache; + +void +refcount_init(void) +{ + reference_cache = kmem_cache_create("reference_cache", + sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + reference_history_cache = kmem_cache_create("reference_history_cache", + sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +refcount_fini(void) +{ + kmem_cache_destroy(reference_cache); + kmem_cache_destroy(reference_history_cache); +} + +void +refcount_create(refcount_t *rc) +{ + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&rc->rc_list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&rc->rc_removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + rc->rc_count = 0; + rc->rc_removed_count = 0; +} + +void +refcount_destroy_many(refcount_t *rc, uint64_t number) +{ + reference_t *ref; + + ASSERT(rc->rc_count == number); + while (ref = list_head(&rc->rc_list)) { + list_remove(&rc->rc_list, ref); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_list); + + while (ref = list_head(&rc->rc_removed)) { + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, ref->ref_removed); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_removed); + mutex_destroy(&rc->rc_mtx); +} + +void +refcount_destroy(refcount_t *rc) +{ + refcount_destroy_many(rc, 0); +} + +int +refcount_is_zero(refcount_t *rc) +{ + ASSERT(rc->rc_count >= 0); + return (rc->rc_count == 0); +} + +int64_t +refcount_count(refcount_t *rc) +{ + ASSERT(rc->rc_count >= 0); + return (rc->rc_count); +} + +int64_t +refcount_add_many(refcount_t *rc, uint64_t number, void *holder) +{ + reference_t *ref; + int64_t count; + + if (reference_tracking_enable) { + ref = kmem_cache_alloc(reference_cache, KM_SLEEP); + ref->ref_holder = holder; + ref->ref_number = number; + } + mutex_enter(&rc->rc_mtx); + ASSERT(rc->rc_count >= 0); + if (reference_tracking_enable) + list_insert_head(&rc->rc_list, ref); + rc->rc_count += number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + + return (count); +} + +int64_t +refcount_add(refcount_t *rc, void *holder) +{ + return (refcount_add_many(rc, 1, holder)); +} + +int64_t +refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) +{ + reference_t *ref; + int64_t count; + + mutex_enter(&rc->rc_mtx); + ASSERT(rc->rc_count >= number); + + if (!reference_tracking_enable) { + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder && ref->ref_number == number) { + list_remove(&rc->rc_list, ref); + if (reference_history > 0) { + ref->ref_removed = + kmem_cache_alloc(reference_history_cache, + KM_SLEEP); + list_insert_head(&rc->rc_removed, ref); + rc->rc_removed_count++; + if (rc->rc_removed_count >= reference_history) { + ref = list_tail(&rc->rc_removed); + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, + ref->ref_removed); + kmem_cache_free(reference_cache, ref); + rc->rc_removed_count--; + } + } else { + kmem_cache_free(reference_cache, ref); + } + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + } + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); +} + +int64_t +refcount_remove(refcount_t *rc, void *holder) +{ + return (refcount_remove_many(rc, 1, holder)); +} + +#endif diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c new file mode 100644 index 000000000..710685dbc --- /dev/null +++ b/module/zfs/rrwlock.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/refcount.h> +#include <sys/rrwlock.h> + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads who have already obtained a read lock to + * re-enter another read lock (re-entrant read) - even if there are + * waiting writers. + * + * Callers who have not obtained a read lock give waiting writers priority. + * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller who has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list can represent a different rrwlock_t. This allows a thread + * to enter multiple and unique rrwlock_ts for read locks at the same time. + * + * Since using tsd exposes some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader doesn't know if it is a re-entrant lock. But since it may be one, + * we allow the read to proceed (otherwise it could deadlock). Since once + * waiting writers are active, readers no longer bump the anonymous count, + * the anonymous readers will eventually flush themselves out. At this point, + * readers will be able to tell if they are a re-entrant lock (have a + * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then + * we must let the proceed. If they are not, then the reader blocks for the + * waiting writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE. + */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + refcount_create(&rrl->rr_anon_rcount); + refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + refcount_destroy(&rrl->rr_anon_rcount); + refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl); + (void) refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +static void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (refcount_count(&rrl->rr_anon_rcount) > 0 || + refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + if (rrn_find_and_remove(rrl)) { + if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + + } else { + if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + } + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && + refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount)); + } + mutex_exit(&rrl->rr_lock); + + return (held); +} diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c new file mode 100644 index 000000000..ca7076cb6 --- /dev/null +++ b/module/zfs/sha256.c @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +/* + * SHA-256 checksum, as specified in FIPS 180-3, available at: + * http://csrc.nist.gov/publications/PubsFIPS.html + * + * This is a very compact implementation of SHA-256. + * It is designed to be simple and portable, not to be fast. + */ + +/* + * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: + * + * Ch(x, y, z) (x & y) ^ (~x & z) + * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) + * + * We use equivalent logical reductions here that require one less op. + */ +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) +#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) +#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) +#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) +#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) +#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) + +static const uint32_t SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +static void +SHA256Transform(uint32_t *H, const uint8_t *cp) +{ + uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; + + for (t = 0; t < 16; t++, cp += 4) + W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; + + for (t = 16; t < 64; t++) + W[t] = sigma1(W[t - 2]) + W[t - 7] + + sigma0(W[t - 15]) + W[t - 16]; + + a = H[0]; b = H[1]; c = H[2]; d = H[3]; + e = H[4]; f = H[5]; g = H[6]; h = H[7]; + + for (t = 0; t < 64; t++) { + T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; + T2 = SIGMA0(a) + Maj(a, b, c); + h = g; g = f; f = e; e = d + T1; + d = c; c = b; b = a; a = T1 + T2; + } + + H[0] += a; H[1] += b; H[2] += c; H[3] += d; + H[4] += e; H[5] += f; H[6] += g; H[7] += h; +} + +void +zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + uint8_t pad[128]; + int i, padsize; + + for (i = 0; i < (size & ~63ULL); i += 64) + SHA256Transform(H, (uint8_t *)buf + i); + + for (padsize = 0; i < size; i++) + pad[padsize++] = *((uint8_t *)buf + i); + + for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) + pad[padsize] = 0; + + for (i = 56; i >= 0; i -= 8) + pad[padsize++] = (size << 3) >> i; + + for (i = 0; i < padsize; i += 64) + SHA256Transform(H, pad + i); + + ZIO_SET_CHECKSUM(zcp, + (uint64_t)H[0] << 32 | H[1], + (uint64_t)H[2] << 32 | H[3], + (uint64_t)H[4] << 32 | H[5], + (uint64_t)H[6] << 32 | H[7]); +} diff --git a/module/zfs/spa.c b/module/zfs/spa.c new file mode 100644 index 000000000..fb1b96f8b --- /dev/null +++ b/module/zfs/spa.c @@ -0,0 +1,4301 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This file contains all the routines used when modifying on-disk SPA state. + * This includes opening, importing, destroying, exporting a pool, and syncing a + * pool. + */ + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_objset.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/fs/zfs.h> +#include <sys/arc.h> +#include <sys/callb.h> +#include <sys/systeminfo.h> +#include <sys/sunddi.h> +#include <sys/spa_boot.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE INTR */ + { 1, 1 }, /* ZIO_TYPE_NULL */ + { 1, 8 }, /* ZIO_TYPE_READ */ + { 8, 1 }, /* ZIO_TYPE_WRITE */ + { 1, 1 }, /* ZIO_TYPE_FREE */ + { 1, 1 }, /* ZIO_TYPE_CLAIM */ + { 1, 1 }, /* ZIO_TYPE_IOCTL */ +}; + +static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static boolean_t spa_has_active_shared_spare(spa_t *spa); + +/* + * ========================================================================== + * SPA properties routines + * ========================================================================== + */ + +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, + uint64_t intval, zprop_source_t src) +{ + const char *propname = zpool_prop_to_name(prop); + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + + if (strval != NULL) + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + else + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* + * Get property values from the spa configuration. + */ +static void +spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +{ + uint64_t size = spa_get_space(spa); + uint64_t used = spa_get_alloc(spa); + uint64_t cap, version; + zprop_source_t src = ZPROP_SRC_NONE; + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa->spa_props_lock)); + + /* + * readonly properties + */ + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); + + cap = (size == 0) ? 0 : (used * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + /* + * settable properties that are not stored in the pool property object. + */ + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + + if (spa->spa_root != NULL) + spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + 0, ZPROP_SRC_LOCAL); + + if ((dp = list_head(&spa->spa_config_list)) != NULL) { + if (dp->scd_path == NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + "none", 0, ZPROP_SRC_LOCAL); + } else if (strcmp(dp->scd_path, spa_config_path) != 0) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + dp->scd_path, 0, ZPROP_SRC_LOCAL); + } + } +} + +/* + * Get zpool property values. + */ +int +spa_prop_get(spa_t *spa, nvlist_t **nvp) +{ + zap_cursor_t zc; + zap_attribute_t za; + objset_t *mos = spa->spa_meta_objset; + int err; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_props_lock); + + /* + * Get properties from the spa config. + */ + spa_prop_get_config(spa, nvp); + + /* If no pool property object, no more prop to get. */ + if (spa->spa_pool_props_object == 0) { + mutex_exit(&spa->spa_props_lock); + return (0); + } + + /* + * Get properties from the MOS pool property object. + */ + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t intval = 0; + char *strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + zpool_prop_t prop; + + if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) + continue; + + switch (za.za_integer_length) { + case 8: + /* integer property */ + if (za.za_first_integer != + zpool_prop_default_numeric(prop)) + src = ZPROP_SRC_LOCAL; + + if (prop == ZPOOL_PROP_BOOTFS) { + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds)) { + rw_exit(&dp->dp_config_rwlock); + break; + } + + strval = kmem_alloc( + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + KM_SLEEP); + dsl_dataset_name(ds, strval); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); + } else { + strval = NULL; + intval = za.za_first_integer; + } + + spa_prop_add_list(*nvp, prop, strval, intval, src); + + if (strval != NULL) + kmem_free(strval, + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + + break; + + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, KM_SLEEP); + err = zap_lookup(mos, spa->spa_pool_props_object, + za.za_name, 1, za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + spa_prop_add_list(*nvp, prop, strval, 0, src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + mutex_exit(&spa->spa_props_lock); +out: + if (err && err != ENOENT) { + nvlist_free(*nvp); + *nvp = NULL; + return (err); + } + + return (0); +} + +/* + * Validate the given pool properties nvlist and modify the list + * for the property values to be set. + */ +static int +spa_prop_validate(spa_t *spa, nvlist_t *props) +{ + nvpair_t *elem; + int error = 0, reset_bootfs = 0; + uint64_t objnum; + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + zpool_prop_t prop; + char *propname, *strval; + uint64_t intval; + objset_t *os; + char *slash; + + propname = nvpair_name(elem); + + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) + return (EINVAL); + + switch (prop) { + case ZPOOL_PROP_VERSION: + error = nvpair_value_uint64(elem, &intval); + if (!error && + (intval < spa_version(spa) || intval > SPA_VERSION)) + error = EINVAL; + break; + + case ZPOOL_PROP_DELEGATION: + case ZPOOL_PROP_AUTOREPLACE: + case ZPOOL_PROP_LISTSNAPS: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = EINVAL; + break; + + case ZPOOL_PROP_BOOTFS: + if (spa_version(spa) < SPA_VERSION_BOOTFS) { + error = ENOTSUP; + break; + } + + /* + * Make sure the vdev config is bootable + */ + if (!vdev_is_bootable(spa->spa_root_vdev)) { + error = ENOTSUP; + break; + } + + reset_bootfs = 1; + + error = nvpair_value_string(elem, &strval); + + if (!error) { + uint64_t compress; + + if (strval == NULL || strval[0] == '\0') { + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); + break; + } + + if (error = dmu_objset_open(strval, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os)) + break; + + /* We don't support gzip bootable datasets */ + if ((error = dsl_prop_get_integer(strval, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + &compress, NULL)) == 0 && + !BOOTFS_COMPRESS_VALID(compress)) { + error = ENOTSUP; + } else { + objnum = dmu_objset_id(os); + } + dmu_objset_close(os); + } + break; + + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && (intval < ZIO_FAILURE_MODE_WAIT || + intval > ZIO_FAILURE_MODE_PANIC)) + error = EINVAL; + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. + */ + if (!error && spa_suspended(spa)) { + spa->spa_failmode = intval; + error = EIO; + } + break; + + case ZPOOL_PROP_CACHEFILE: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + error = EINVAL; + break; + } + + slash = strrchr(strval, '/'); + ASSERT(slash != NULL); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) + error = EINVAL; + break; + } + + if (error) + break; + } + + if (!error && reset_bootfs) { + error = nvlist_remove(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); + + if (!error) { + error = nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); + } + } + + return (error); +} + +int +spa_prop_set(spa_t *spa, nvlist_t *nvp) +{ + int error; + + if ((error = spa_prop_validate(spa, nvp)) != 0) + return (error); + + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); +} + +/* + * If the bootfs property value is dsobj, clear it. + */ +void +spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +{ + if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { + VERIFY(zap_remove(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); + spa->spa_bootfs = 0; + } +} + +/* + * ========================================================================== + * SPA state manipulation (open/create/destroy/import/export) + * ========================================================================== + */ + +static int +spa_error_entry_compare(const void *a, const void *b) +{ + spa_error_entry_t *sa = (spa_error_entry_t *)a; + spa_error_entry_t *sb = (spa_error_entry_t *)b; + int ret; + + ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, + sizeof (zbookmark_t)); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +/* + * Utility function which retrieves copies of the current logs and + * re-initializes them in the process. + */ +void +spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) +{ + ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); + + bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); + bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +/* + * Activate an uninitialized pool. + */ +static void +spa_activate(spa_t *spa) +{ + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + spa->spa_state = POOL_STATE_ACTIVE; + + spa->spa_normal_class = metaslab_class_create(); + spa->spa_log_class = metaslab_class_create(); + + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", + zio_taskq_threads[t][q], maxclsyspri, 50, + INT_MAX, TASKQ_PREPOPULATE); + } + } + + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_state_dirty_node)); + + txg_list_create(&spa->spa_vdev_txg_list, + offsetof(struct vdev, vdev_txg_node)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +/* + * Opposite of spa_activate(). + */ +static void +spa_deactivate(spa_t *spa) +{ + ASSERT(spa->spa_sync_on == B_FALSE); + ASSERT(spa->spa_dsl_pool == NULL); + ASSERT(spa->spa_root_vdev == NULL); + + ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + + txg_list_destroy(&spa->spa_vdev_txg_list); + + list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_state_dirty_list); + + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + taskq_destroy(spa->spa_zio_taskq[t][q]); + spa->spa_zio_taskq[t][q] = NULL; + } + } + + metaslab_class_destroy(spa->spa_normal_class); + spa->spa_normal_class = NULL; + + metaslab_class_destroy(spa->spa_log_class); + spa->spa_log_class = NULL; + + /* + * If this was part of an import or the open otherwise failed, we may + * still have errors left in the queues. Empty them just in case. + */ + spa_errlog_drain(spa); + + avl_destroy(&spa->spa_errlist_scrub); + avl_destroy(&spa->spa_errlist_last); + + spa->spa_state = POOL_STATE_UNINITIALIZED; +} + +/* + * Verify a pool configuration, and construct the vdev tree appropriately. This + * will create all the necessary vdevs in the appropriate layout, with each vdev + * in the CLOSED state. This will prep the pool before open/creation/import. + * All vdev validation is done by the vdev_alloc() routine. + */ +static int +spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, + uint_t id, int atype) +{ + nvlist_t **child; + uint_t c, children; + int error; + + if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) + return (error); + + if ((*vdp)->vdev_ops->vdev_op_leaf) + return (0); + + error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (error == ENOENT) + return (0); + + if (error) { + vdev_free(*vdp); + *vdp = NULL; + return (EINVAL); + } + + for (c = 0; c < children; c++) { + vdev_t *vd; + if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, + atype)) != 0) { + vdev_free(*vdp); + *vdp = NULL; + return (error); + } + } + + ASSERT(*vdp != NULL); + + return (0); +} + +/* + * Opposite of spa_load(). + */ +static void +spa_unload(spa_t *spa) +{ + int i; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Stop async tasks. + */ + spa_async_suspend(spa); + + /* + * Stop syncing. + */ + if (spa->spa_sync_on) { + txg_sync_stop(spa->spa_dsl_pool); + spa->spa_sync_on = B_FALSE; + } + + /* + * Wait for any outstanding async I/O to complete. + */ + mutex_enter(&spa->spa_async_root_lock); + while (spa->spa_async_root_count != 0) + cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); + mutex_exit(&spa->spa_async_root_lock); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + + /* + * Close the dsl pool. + */ + if (spa->spa_dsl_pool) { + dsl_pool_close(spa->spa_dsl_pool); + spa->spa_dsl_pool = NULL; + } + + /* + * Close all vdevs. + */ + if (spa->spa_root_vdev) + vdev_free(spa->spa_root_vdev); + ASSERT(spa->spa_root_vdev == NULL); + + for (i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); + if (spa->spa_spares.sav_vdevs) { + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + spa->spa_spares.sav_vdevs = NULL; + } + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + } + spa->spa_spares.sav_count = 0; + + for (i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + if (spa->spa_l2cache.sav_vdevs) { + kmem_free(spa->spa_l2cache.sav_vdevs, + spa->spa_l2cache.sav_count * sizeof (void *)); + spa->spa_l2cache.sav_vdevs = NULL; + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + } + spa->spa_l2cache.sav_count = 0; + + spa->spa_async_suspended = 0; +} + +/* + * Load (or re-load) the current list of vdevs describing the active spares for + * this pool. When this is called, we have some form of basic information in + * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + */ +static void +spa_load_spares(spa_t *spa) +{ + nvlist_t **spares; + uint_t nspares; + int i; + vdev_t *vd, *tvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* + * First, close and free any existing spare vdevs. + */ + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } + + if (spa->spa_spares.sav_vdevs) + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + + if (spa->spa_spares.sav_config == NULL) + nspares = 0; + else + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + spa->spa_spares.sav_count = (int)nspares; + spa->spa_spares.sav_vdevs = NULL; + + if (nspares == 0) + return; + + /* + * Construct the array of vdevs, opening them to get status in the + * process. For each spare, there is potentially two different vdev_t + * structures associated with it: one in the list of spares (used only + * for basic validation purposes) and one in the active vdev + * configuration (if it's spared in). During this phase we open and + * validate each vdev on the spare list. If the vdev also exists in the + * active configuration, then we also mark this vdev as an active spare. + */ + spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) { + VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, + VDEV_ALLOC_SPARE) == 0); + ASSERT(vd != NULL); + + spa->spa_spares.sav_vdevs[i] = vd; + + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL) { + if (!tvd->vdev_isspare) + spa_spare_add(tvd); + + /* + * We only mark the spare active if we were successfully + * able to load the vdev. Otherwise, importing a pool + * with a bad active spare would result in strange + * behavior, because multiple pool would think the spare + * is actively in use. + * + * There is a vulnerability here to an equally bizarre + * circumstance, where a dead active spare is later + * brought back to life (onlined or otherwise). Given + * the rarity of this scenario, and the extra complexity + * it adds, we ignore the possibility. + */ + if (!vdev_is_dead(tvd)) + spa_spare_activate(tvd); + } + + vd->vdev_top = vd; + + if (vdev_open(vd) != 0) + continue; + + if (vdev_validate_aux(vd) == 0) + spa_spare_add(vd); + } + + /* + * Recompute the stashed list of spares, with status information + * this time. + */ + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, + DATA_TYPE_NVLIST_ARRAY) == 0); + + spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); + for (i = 0; i < spa->spa_spares.sav_count; i++) + nvlist_free(spares[i]); + kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); +} + +/* + * Load (or re-load) the current list of vdevs describing the active l2cache for + * this pool. When this is called, we have some form of basic information in + * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + * Devices which are already active have their details maintained, and are + * not re-opened. + */ +static void +spa_load_l2cache(spa_t *spa) +{ + nvlist_t **l2cache; + uint_t nl2cache; + int i, j, oldnvdevs; + uint64_t guid, size; + vdev_t *vd, **oldvdevs, **newvdevs; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (sav->sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + } else { + nl2cache = 0; + } + + oldvdevs = sav->sav_vdevs; + oldnvdevs = sav->sav_count; + sav->sav_vdevs = NULL; + sav->sav_count = 0; + + /* + * Process new nvlist of vdevs. + */ + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + newvdevs[i] = NULL; + for (j = 0; j < oldnvdevs; j++) { + vd = oldvdevs[j]; + if (vd != NULL && guid == vd->vdev_guid) { + /* + * Retain previous vdev for add/remove ops. + */ + newvdevs[i] = vd; + oldvdevs[j] = NULL; + break; + } + } + + if (newvdevs[i] == NULL) { + /* + * Create new vdev + */ + VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE) == 0); + ASSERT(vd != NULL); + newvdevs[i] = vd; + + /* + * Commit this vdev as an l2cache device, + * even if it fails to open. + */ + spa_l2cache_add(vd); + + vd->vdev_top = vd; + vd->vdev_aux = sav; + + spa_l2cache_activate(vd); + + if (vdev_open(vd) != 0) + continue; + + (void) vdev_validate_aux(vd); + + if (!vdev_is_dead(vd)) { + size = vdev_get_rsize(vd); + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + } + } + + /* + * Purge vdevs that were dropped + */ + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; + + vd = oldvdevs[i]; + if (vd != NULL) { + if ((spa_mode & FWRITE) && + spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && + l2arc_vdev_present(vd)) { + l2arc_remove_vdev(vd); + } + (void) vdev_close(vd); + spa_l2cache_remove(vd); + } + } + + if (oldvdevs) + kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + + if (sav->sav_config == NULL) + goto out; + + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. + */ + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + + l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + l2cache[i] = vdev_config_generate(spa, + sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); +out: + for (i = 0; i < sav->sav_count; i++) + nvlist_free(l2cache[i]); + if (sav->sav_count) + kmem_free(l2cache, sav->sav_count * sizeof (void *)); +} + +static int +load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) +{ + dmu_buf_t *db; + char *packed = NULL; + size_t nvsize = 0; + int error; + *value = NULL; + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + packed = kmem_alloc(nvsize, KM_SLEEP); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + if (error == 0) + error = nvlist_unpack(packed, nvsize, value, 0); + kmem_free(packed, nvsize); + + return (error); +} + +/* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + } +} + +/* + * Check for missing log devices + */ +int +spa_check_logs(spa_t *spa) +{ + switch (spa->spa_log_state) { + case SPA_LOG_MISSING: + /* need to recheck in case slog has been restored */ + case SPA_LOG_UNKNOWN: + if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, + DS_FIND_CHILDREN)) { + spa->spa_log_state = SPA_LOG_MISSING; + return (1); + } + break; + + case SPA_LOG_CLEAR: + (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, + DS_FIND_CHILDREN); + break; + } + spa->spa_log_state = SPA_LOG_GOOD; + return (0); +} + +/* + * Load an existing storage pool, using the pool's builtin spa_config as a + * source of configuration information. + */ +static int +spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) +{ + int error = 0; + nvlist_t *nvroot = NULL; + vdev_t *rvd; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t config_cache_txg = spa->spa_config_txg; + uint64_t pool_guid; + uint64_t version; + uint64_t autoreplace = 0; + char *ereport = FM_EREPORT_ZFS_POOL; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa->spa_load_state = state; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + error = EINVAL; + goto out; + } + + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) + version = SPA_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + goto out; + } + + spa->spa_load_guid = pool_guid; + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa->spa_ubsync.ub_version = version; + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + goto out; + + ASSERT(spa->spa_root_vdev == rvd); + ASSERT(spa_guid(spa) == pool_guid); + + /* + * Try to open all vdevs, loading each label in the process. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + goto out; + + /* + * Validate the labels for all leaf vdevs. We need to grab the config + * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + goto out; + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + error = ENXIO; + goto out; + } + + /* + * Find the best uberblock. + */ + vdev_uberblock_load(NULL, rvd, ub); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = ENXIO; + goto out; + } + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > SPA_VERSION) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_VERSION_NEWER); + error = ENOTSUP; + goto out; + } + + /* + * If the vdev guid sum doesn't match the uberblock, we have an + * incomplete configuration. + */ + if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_GUID_SUM); + error = ENXIO; + goto out; + } + + /* + * Initialize internal SPA structures. + */ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_first_txg = spa_last_synced_txg(spa) + 1; + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + goto out; + } + spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; + + if (zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + if (!mosconfig) { + nvlist_t *newconfig; + uint64_t hostid; + + if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + char *hostname; + unsigned long myhostid = 0; + + VERIFY(nvlist_lookup_string(newconfig, + ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); + + (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); + if (hostid != 0 && myhostid != 0 && + (unsigned long)hostid != myhostid) { + cmn_err(CE_WARN, "pool '%s' could not be " + "loaded as it was last accessed by " + "another system (host: %s hostid: 0x%lx). " + "See: http://www.sun.com/msg/ZFS-8000-EY", + spa_name(spa), hostname, + (unsigned long)hostid); + error = EBADF; + goto out; + } + } + + spa_config_set(spa, newconfig); + spa_unload(spa); + spa_deactivate(spa); + spa_activate(spa); + + return (spa_load(spa, newconfig, state, B_TRUE)); + } + + if (zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load the bit that tells us to use the new accounting function + * (raid-z deflation). If we have an older pool, this will not + * be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load the persistent error log. If we have an older pool, this will + * not be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load the history object. If we have an older pool, this + * will not be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, + sizeof (uint64_t), 1, &spa->spa_history); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* + * Load any hot spares for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares.sav_object, + &spa->spa_spares.sav_config) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + /* + * Load any level 2 ARC devices for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + &spa->spa_l2cache.sav_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); + if (load_nvlist(spa, spa->spa_l2cache.sav_object, + &spa->spa_l2cache.sav_config) != 0) { + vdev_set_state(rvd, B_TRUE, + VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + if (spa_check_logs(spa)) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LOG); + error = ENXIO; + ereport = FM_EREPORT_ZFS_LOG_REPLAY; + goto out; + } + + + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); + + if (error && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + if (error == 0) { + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), + sizeof (uint64_t), 1, &spa->spa_bootfs); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), + sizeof (uint64_t), 1, &autoreplace); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_DELEGATION), + sizeof (uint64_t), 1, &spa->spa_delegation); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + sizeof (uint64_t), 1, &spa->spa_failmode); + } + + /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (autoreplace && state != SPA_LOAD_TRYIMPORT) + spa_check_removed(spa->spa_root_vdev); + + /* + * Load the vdev state for all toplevel vdevs. + */ + vdev_load(rvd); + + /* + * Propagate the leaf DTLs we just loaded all the way up the tree. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Check the state of the root vdev. If it can't be opened, it + * indicates one or more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + error = ENXIO; + goto out; + } + + if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { + dmu_tx_t *tx; + int need_update = B_FALSE; + int c; + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. + */ + tx = dmu_tx_create_assigned(spa_get_dsl(spa), + spa_first_txg(spa)); + (void) dmu_objset_find(spa_name(spa), + zil_claim, tx, DS_FIND_CHILDREN); + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * Wait for all claims to sync. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * If the config cache is stale, or we have uninitialized + * metaslabs (see spa_vdev_add()), then update the config. + */ + if (config_cache_txg != spa->spa_config_txg || + state == SPA_LOAD_IMPORT) + need_update = B_TRUE; + + for (c = 0; c < rvd->vdev_children; c++) + if (rvd->vdev_child[c]->vdev_ms_array == 0) + need_update = B_TRUE; + + /* + * Update the config cache asychronously in case we're the + * root pool, in which case the config cache isn't writable yet. + */ + if (need_update) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + error = 0; +out: + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error && error != EBADF) + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + spa->spa_load_state = SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + +/* + * Pool Open/Import + * + * The import case is identical to an open except that the configuration is sent + * down from userland, instead of grabbed from the configuration cache. For the + * case of an open, the pool configuration will exist in the + * POOL_STATE_UNINITIALIZED state. + * + * The stats information (gen/count/ustats) is used to gather vdev statistics at + * the same time open the pool, without having to keep around the spa_t in some + * ambiguous state. + */ +static int +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +{ + spa_t *spa; + int error; + int locked = B_FALSE; + + *spapp = NULL; + + /* + * As disgusting as this is, we need to support recursive calls to this + * function because dsl_dir_open() is called during spa_load(), and ends + * up calling spa_open() again. The real fix is to figure out how to + * avoid dsl_dir_open() calling this in the first place. + */ + if (mutex_owner(&spa_namespace_lock) != curthread) { + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } + + if ((spa = spa_lookup(pool)) == NULL) { + if (locked) + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + + spa_activate(spa); + + error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); + + if (error == EBADF) { + /* + * If vdev_validate() returns failure (indicated by + * EBADF), it indicates that one of the vdevs indicates + * that the pool has been exported or destroyed. If + * this is the case, the config cache is out of sync and + * we should remove the pool from the namespace. + */ + spa_unload(spa); + spa_deactivate(spa); + spa_config_sync(spa, B_TRUE, B_TRUE); + spa_remove(spa); + if (locked) + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + + if (error) { + /* + * We can't open the pool, but we still have useful + * information: the state of each vdev after the + * attempted vdev_open(). Return this to the user. + */ + if (config != NULL && spa->spa_root_vdev != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, + B_TRUE); + spa_unload(spa); + spa_deactivate(spa); + spa->spa_last_open_failed = B_TRUE; + if (locked) + mutex_exit(&spa_namespace_lock); + *spapp = NULL; + return (error); + } else { + spa->spa_last_open_failed = B_FALSE; + } + } + + spa_open_ref(spa, tag); + + if (locked) + mutex_exit(&spa_namespace_lock); + + *spapp = spa; + + if (config != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + return (0); +} + +int +spa_open(const char *name, spa_t **spapp, void *tag) +{ + return (spa_open_common(name, spapp, tag, NULL)); +} + +/* + * Lookup the given spa_t, incrementing the inject count in the process, + * preventing it from being exported or destroyed. + */ +spa_t * +spa_inject_addref(char *name) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (NULL); + } + spa->spa_inject_ref++; + mutex_exit(&spa_namespace_lock); + + return (spa); +} + +void +spa_inject_delref(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + spa->spa_inject_ref--; + mutex_exit(&spa_namespace_lock); +} + +/* + * Add spares device information to the nvlist. + */ +static void +spa_add_spares(spa_t *spa, nvlist_t *config) +{ + nvlist_t **spares; + uint_t i, nspares; + nvlist_t *nvroot; + uint64_t guid; + vdev_stat_t *vs; + uint_t vsc; + uint64_t pool; + + if (spa->spa_spares.sav_count == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + if (nspares != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + /* + * Go through and find any spares which have since been + * repurposed as an active spare. If this is the case, update + * their status appropriately. + */ + for (i = 0; i < nspares; i++) { + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + if (spa_spare_exists(guid, &pool, NULL) && + pool != 0ULL) { + VERIFY(nvlist_lookup_uint64_array( + spares[i], ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &vsc) == 0); + vs->vs_state = VDEV_STATE_CANT_OPEN; + vs->vs_aux = VDEV_AUX_SPARED; + } + } + } +} + +/* + * Add l2cache device information to the nvlist, including vdev stats. + */ +static void +spa_add_l2cache(spa_t *spa, nvlist_t *config) +{ + nvlist_t **l2cache; + uint_t i, j, nl2cache; + nvlist_t *nvroot; + uint64_t guid; + vdev_t *vd; + vdev_stat_t *vs; + uint_t vsc; + + if (spa->spa_l2cache.sav_count == 0) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + if (nl2cache != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + + /* + * Update level 2 cache device stats. + */ + + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + + vd = NULL; + for (j = 0; j < spa->spa_l2cache.sav_count; j++) { + if (guid == + spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { + vd = spa->spa_l2cache.sav_vdevs[j]; + break; + } + } + ASSERT(vd != NULL); + + VERIFY(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + vdev_get_stats(vd, vs); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +int +spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +{ + int error; + spa_t *spa; + + *config = NULL; + error = spa_open_common(name, &spa, FTAG, config); + + if (spa && *config != NULL) { + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); + } + + /* + * We want to get the alternate root even for faulted pools, so we cheat + * and call spa_lookup() directly. + */ + if (altroot) { + if (spa == NULL) { + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(name); + if (spa) + spa_altroot(spa, altroot, buflen); + else + altroot[0] = '\0'; + spa = NULL; + mutex_exit(&spa_namespace_lock); + } else { + spa_altroot(spa, altroot, buflen); + } + } + + if (spa != NULL) + spa_close(spa, FTAG); + + return (error); +} + +/* + * Validate that the auxiliary device array is well formed. We must have an + * array of nvlists, each which describes a valid leaf vdev. If this is an + * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be + * specified, as long as they are well-formed. + */ +static int +spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, + spa_aux_vdev_t *sav, const char *config, uint64_t version, + vdev_labeltype_t label) +{ + nvlist_t **dev; + uint_t i, ndev; + vdev_t *vd; + int error; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* + * It's acceptable to have no devs specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) + return (0); + + if (ndev == 0) + return (EINVAL); + + /* + * Make sure the pool is formatted with a version that supports this + * device type. + */ + if (spa_version(spa) < version) + return (ENOTSUP); + + /* + * Set the pending device list so we correctly handle device in-use + * checking. + */ + sav->sav_pending = dev; + sav->sav_npending = ndev; + + for (i = 0; i < ndev; i++) { + if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, + mode)) != 0) + goto out; + + if (!vd->vdev_ops->vdev_op_leaf) { + vdev_free(vd); + error = EINVAL; + goto out; + } + + /* + * The L2ARC currently only supports disk devices in + * kernel context. For user-level testing, we allow it. + */ +#ifdef _KERNEL + if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && + strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { + error = ENOTBLK; + goto out; + } +#endif + vd->vdev_top = vd; + + if ((error = vdev_open(vd)) == 0 && + (error = vdev_label_init(vd, crtxg, label)) == 0) { + VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } + + vdev_free(vd); + + if (error && + (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) + goto out; + else + error = 0; + } + +out: + sav->sav_pending = NULL; + sav->sav_npending = 0; + return (error); +} + +static int +spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + int error; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, + VDEV_LABEL_SPARE)) != 0) { + return (error); + } + + return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, + VDEV_LABEL_L2CACHE)); +} + +static void +spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, + const char *config) +{ + int i; + + if (sav->sav_config != NULL) { + nvlist_t **olddevs; + uint_t oldndevs; + nvlist_t **newdevs; + + /* + * Generate new dev list by concatentating with the + * current dev list. + */ + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, + &olddevs, &oldndevs) == 0); + + newdevs = kmem_alloc(sizeof (void *) * + (ndevs + oldndevs), KM_SLEEP); + for (i = 0; i < oldndevs; i++) + VERIFY(nvlist_dup(olddevs[i], &newdevs[i], + KM_SLEEP) == 0); + for (i = 0; i < ndevs; i++) + VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(sav->sav_config, config, + DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + config, newdevs, ndevs + oldndevs) == 0); + for (i = 0; i < oldndevs + ndevs; i++) + nvlist_free(newdevs[i]); + kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); + } else { + /* + * Generate a new dev list. + */ + VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, + devs, ndevs) == 0); + } +} + +/* + * Stop and drop level 2 ARC devices + */ +void +spa_l2cache_drop(spa_t *spa) +{ + vdev_t *vd; + int i; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + for (i = 0; i < sav->sav_count; i++) { + uint64_t pool; + + vd = sav->sav_vdevs[i]; + ASSERT(vd != NULL); + + if ((spa_mode & FWRITE) && + spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && + l2arc_vdev_present(vd)) { + l2arc_remove_vdev(vd); + } + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + vdev_clear_stats(vd); + (void) vdev_close(vd); + } +} + +/* + * Pool Creation + */ +int +spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + const char *history_str, nvlist_t *zplprops) +{ + spa_t *spa; + char *altroot = NULL; + vdev_t *rvd; + dsl_pool_t *dp; + dmu_tx_t *tx; + int c, error = 0; + uint64_t txg = TXG_INITIAL; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + uint64_t version; + + /* + * If this pool already exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + /* + * Allocate a new spa_t structure. + */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, altroot); + spa_activate(spa); + + spa->spa_uberblock.ub_txg = txg - 1; + + if (props && (error = spa_prop_validate(spa, props))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), + &version) != 0) + version = SPA_VERSION; + ASSERT(version <= SPA_VERSION); + spa->spa_uberblock.ub_version = version; + spa->spa_ubsync = spa->spa_uberblock; + + /* + * Create the root vdev. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + ASSERT(error != 0 || rvd != NULL); + ASSERT(error != 0 || spa->spa_root_vdev == rvd); + + if (error == 0 && !zfs_allocatable_devs(nvroot)) + error = EINVAL; + + if (error == 0 && + (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, + VDEV_ALLOC_ADD)) == 0) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_init(rvd->vdev_child[c], txg); + vdev_config_dirty(rvd); + } + + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Get the list of spares, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Get the list of level 2 cache devices, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); + spa->spa_meta_objset = dp->dp_meta_objset; + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Create the pool config object. + */ + spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool config"); + } + + /* Newly created pools with the right version are always deflated. */ + if (version >= SPA_VERSION_RAIDZ_DEFLATE) { + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } + } + + /* + * Create the deferred-free bplist object. Turn off compression + * because sync-to-convergence takes longer if the blocksize + * keeps changing. + */ + spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, + 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, + ZIO_COMPRESS_OFF, tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bplist"); + } + + /* + * Create the pool's history object. + */ + if (version >= SPA_VERSION_ZPOOL_HISTORY) + spa_history_create_obj(spa, tx); + + /* + * Set pool properties. + */ + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + if (props) + spa_sync_props(spa, props, CRED(), tx); + + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * We explicitly wait for the first transaction to complete so that our + * bean counters are appropriately updated. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) + (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); + + mutex_exit(&spa_namespace_lock); + + spa->spa_minref = refcount_count(&spa->spa_refcount); + + return (0); +} + +/* + * Import the given pool into the system. We set up the necessary spa_t and + * then call spa_load() to do the dirty work. + */ +static int +spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, + boolean_t isroot, boolean_t allowfaulted) +{ + spa_t *spa; + char *altroot = NULL; + int error, loaderr; + nvlist_t *nvroot; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) != NULL) { + if (isroot) { + /* + * Remove the existing root pool from the + * namespace so that we can replace it with + * the correct config we just read in. + */ + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + spa_remove(spa); + } else { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + } + + /* + * Create and initialize the spa structure. + */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, altroot); + spa_activate(spa); + + if (allowfaulted) + spa->spa_import_faulted = B_TRUE; + spa->spa_is_root = isroot; + + /* + * Pass off the heavy lifting to spa_load(). + * Pass TRUE for mosconfig (unless this is a root pool) because + * the user-supplied config is actually the one to trust when + * doing an import. + */ + loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + /* + * Toss any existing sparelist, as it doesn't have any validity anymore, + * and conflicts with spa_has_spare(). + */ + if (!isroot && spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + spa_load_spares(spa); + } + if (!isroot && spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { + if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { + /* + * If we failed to load the pool, but 'allowfaulted' is + * set, then manually set the config as if the config + * passed in was specified in the cache file. + */ + error = 0; + spa->spa_import_faulted = B_FALSE; + if (spa->spa_config == NULL) + spa->spa_config = spa_config_generate(spa, + NULL, -1ULL, B_TRUE); + spa_unload(spa); + spa_deactivate(spa); + spa_config_sync(spa, B_FALSE, B_TRUE); + } else { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + if (spa_mode & FWRITE) { + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); + } + + spa->spa_import_faulted = B_FALSE; + mutex_exit(&spa_namespace_lock); + + return (0); +} + +#ifdef _KERNEL +/* + * Build a "root" vdev for a top level vdev read in from a rootpool + * device label. + */ +static void +spa_build_rootpool_config(nvlist_t *config) +{ + nvlist_t *nvtop, *nvroot; + uint64_t pgid; + + /* + * Add this top-level vdev to the child array. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) + == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) + == 0); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) + == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); +} + +/* + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. + */ +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +int +spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, + uint64_t *besttxg) +{ + nvlist_t *config; + uint64_t txg; + int error; + + if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) + return (error); + + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (bestconf != NULL) + *bestconf = config; + else + nvlist_free(config); + *besttxg = txg; + return (0); +} + +boolean_t +spa_rootdev_validate(nvlist_t *nv) +{ + uint64_t ival; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) + return (B_FALSE); + + return (B_TRUE); +} + + +/* + * Given the boot device's physical path or devid, check if the device + * is in a valid state. If so, return the configuration from the vdev + * label. + */ +int +spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) +{ + nvlist_t *conf = NULL; + uint64_t txg = 0; + nvlist_t *nvtop, **child; + char *type; + char *bootpath = NULL; + uint_t children, c; + char *tmp; + int error; + + if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) + *tmp = '\0'; + if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { + cmn_err(CE_NOTE, "error reading device label"); + return (error); + } + if (txg == 0) { + cmn_err(CE_NOTE, "this device is detached"); + nvlist_free(conf); + return (EINVAL); + } + + VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + if (spa_rootdev_validate(nvtop)) { + goto out; + } else { + nvlist_free(conf); + return (EINVAL); + } + } + + ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); + + VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + + /* + * Go thru vdevs in the mirror to see if the given device + * has the most recent txg. Only the device with the most + * recent txg has valid information and should be booted. + */ + for (c = 0; c < children; c++) { + char *cdevid, *cpath; + uint64_t tmptxg; + + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, + &cpath) != 0) + return (EINVAL); + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, + &cdevid) != 0) + return (EINVAL); + if ((spa_check_rootconf(cpath, cdevid, NULL, + &tmptxg) == 0) && (tmptxg > txg)) { + txg = tmptxg; + VERIFY(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &bootpath) == 0); + } + } + + /* Does the best device match the one we've booted from? */ + if (bootpath) { + cmn_err(CE_NOTE, "try booting from '%s'", bootpath); + return (EINVAL); + } +out: + *bestconf = conf; + return (0); +} + +/* + * Import a root pool. + * + * For x86. devpath_list will consist of devid and/or physpath name of + * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). + * The GRUB "findroot" command will return the vdev we should boot. + * + * For Sparc, devpath_list consists the physpath name of the booting device + * no matter the rootpool is a single device pool or a mirrored pool. + * e.g. + * "/pci@1f,0/ide@d/disk@0,0:a" + */ +int +spa_import_rootpool(char *devpath, char *devid) +{ + nvlist_t *conf = NULL; + char *pname; + int error; + + /* + * Get the vdev pathname and configuation from the most + * recently updated vdev (highest txg). + */ + if (error = spa_get_rootconf(devpath, devid, &conf)) + goto msg_out; + + /* + * Add type "root" vdev to the config. + */ + spa_build_rootpool_config(conf); + + VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); + + /* + * We specify 'allowfaulted' for this to be treated like spa_open() + * instead of spa_import(). This prevents us from marking vdevs as + * persistently unavailable, and generates FMA ereports as if it were a + * pool open, not import. + */ + error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); + ASSERT(error != EEXIST); + + nvlist_free(conf); + return (error); + +msg_out: + cmn_err(CE_NOTE, "\n" + " *************************************************** \n" + " * This device is not bootable! * \n" + " * It is either offlined or detached or faulted. * \n" + " * Please try to boot from a different device. * \n" + " *************************************************** "); + + return (error); +} +#endif + +/* + * Import a non-root pool into the system. + */ +int +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +{ + return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); +} + +int +spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) +{ + return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); +} + + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. + */ +#define TRYIMPORT_NAME "$import" + +nvlist_t * +spa_tryimport(nvlist_t *tryconfig) +{ + nvlist_t *config = NULL; + char *poolname; + spa_t *spa; + uint64_t state; + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) + return (NULL); + + if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) + return (NULL); + + /* + * Create and initialize the spa structure. + */ + mutex_enter(&spa_namespace_lock); + spa = spa_add(TRYIMPORT_NAME, NULL); + spa_activate(spa); + + /* + * Pass off the heavy lifting to spa_load(). + * Pass TRUE for mosconfig because the user-supplied config + * is actually the one to trust when doing an import. + */ + (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + + /* + * If 'tryconfig' was at least parsable, return the current config. + */ + if (spa->spa_root_vdev != NULL) { + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + poolname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + state) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + spa->spa_uberblock.ub_timestamp) == 0); + + /* + * If the bootfs property exists on this pool then we + * copy it out so that external consumers can tell which + * pools are bootable. + */ + if (spa->spa_bootfs) { + char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * We have to play games with the name since the + * pool was opened as TRYIMPORT_NAME. + */ + if (dsl_dsobj_to_dsname(spa_name(spa), + spa->spa_bootfs, tmpname) == 0) { + char *cp; + char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + cp = strchr(tmpname, '/'); + if (cp == NULL) { + (void) strlcpy(dsname, tmpname, + MAXPATHLEN); + } else { + (void) snprintf(dsname, MAXPATHLEN, + "%s/%s", poolname, ++cp); + } + VERIFY(nvlist_add_string(config, + ZPOOL_CONFIG_BOOTFS, dsname) == 0); + kmem_free(dsname, MAXPATHLEN); + } + kmem_free(tmpname, MAXPATHLEN); + } + + /* + * Add the list of hot spares and level 2 cache devices. + */ + spa_add_spares(spa, config); + spa_add_l2cache(spa, config); + } + + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + + return (config); +} + +/* + * Pool export/destroy + * + * The act of destroying or exporting a pool is very simple. We make sure there + * is no more pending I/O and any references to the pool are gone. Then, we + * update the pool state and sync all the labels to disk, removing the + * configuration from the cache afterwards. + */ +static int +spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, + boolean_t force) +{ + spa_t *spa; + + if (oldconfig) + *oldconfig = NULL; + + if (!(spa_mode & FWRITE)) + return (EROFS); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + + /* + * Put a hold on the pool, drop the namespace lock, stop async tasks, + * reacquire the namespace lock, and see if we can export. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + /* + * The pool will be in core if it's openable, + * in which case we can modify its state. + */ + if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + /* + * Objsets may be open only because they're dirty, so we + * have to force it to sync before checking spa_refcnt. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * A pool cannot be exported or destroyed if there are active + * references. If we are resetting a pool, allow references by + * fault injection handlers. + */ + if (!spa_refcount_zero(spa) || + (spa->spa_inject_ref != 0 && + new_state != POOL_STATE_UNINITIALIZED)) { + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (EBUSY); + } + + /* + * A pool cannot be exported if it has an active shared spare. + * This is to prevent other pools stealing the active spare + * from an exported pool. At user's own will, such pool can + * be forcedly exported. + */ + if (!force && new_state == POOL_STATE_EXPORTED && + spa_has_active_shared_spare(spa)) { + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (EXDEV); + } + + /* + * We want this to be reflected on every label, + * so mark them all dirty. spa_unload() will do the + * final sync that pushes these changes out. + */ + if (new_state != POOL_STATE_UNINITIALIZED) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa->spa_state = new_state; + spa->spa_final_txg = spa_last_synced_txg(spa) + 1; + vdev_config_dirty(spa->spa_root_vdev); + spa_config_exit(spa, SCL_ALL, FTAG); + } + } + + spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + + if (oldconfig && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + + if (new_state != POOL_STATE_UNINITIALIZED) { + spa_config_sync(spa, B_TRUE, B_TRUE); + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Destroy a storage pool. + */ +int +spa_destroy(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); +} + +/* + * Export a storage pool. + */ +int +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) +{ + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); +} + +/* + * Similar to spa_export(), this unloads the spa_t without actually removing it + * from the namespace in any way. + */ +int +spa_reset(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, + B_FALSE)); +} + +/* + * ========================================================================== + * Device manipulation + * ========================================================================== + */ + +/* + * Add a device to a storage pool. + */ +int +spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +{ + uint64_t txg; + int c, error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *tvd; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + txg = spa_vdev_enter(spa); + + if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, + &nspares) != 0) + nspares = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, + &nl2cache) != 0) + nl2cache = 0; + + if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + + if (vd->vdev_children != 0 && + (error = vdev_create(vd, txg, B_FALSE)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); + + /* + * We must validate the spares and l2cache devices after checking the + * children. Otherwise, vdev_inuse() will blindly overwrite the spare. + */ + if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); + + /* + * Transfer each new top-level vdev from vd to rvd. + */ + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + vdev_remove_child(vd, tvd); + tvd->vdev_id = rvd->vdev_children; + vdev_add_child(rvd, tvd); + vdev_config_dirty(tvd); + } + + if (nspares != 0) { + spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, + ZPOOL_CONFIG_SPARES); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } + + if (nl2cache != 0) { + spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, + ZPOOL_CONFIG_L2CACHE); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + /* + * We have to be careful when adding new vdevs to an existing pool. + * If other threads start allocating from these vdevs before we + * sync the config cache, and we lose power, then upon reboot we may + * fail to open the pool because there are DVAs that the config cache + * can't translate. Therefore, we first add the vdevs without + * initializing metaslabs; sync the config cache (via spa_vdev_exit()); + * and then let spa_config_update() initialize the new metaslabs. + * + * spa_load() checks for added-but-not-initialized vdevs, so that + * if we lose power at any point in this sequence, the remaining + * steps will be completed the next time we load the pool. + */ + (void) spa_vdev_exit(spa, vd, txg, 0); + + mutex_enter(&spa_namespace_lock); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Attach a device to a mirror. The arguments are the path to any device + * in the mirror, and the nvroot for the new device. If the path specifies + * a device that is not mirrored, we automatically insert the mirror vdev. + * + * If 'replacing' is specified, the new device is intended to replace the + * existing device; in this case the two devices are made into their own + * mirror using the 'replacing' vdev, which is functionally identical to + * the mirror vdev (it actually reuses all the same ops) but has a few + * extra rules: you can't attach to it after it's been created, and upon + * completion of resilvering, the first disk (the one being replaced) + * is automatically detached. + */ +int +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +{ + uint64_t txg, open_txg; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; + vdev_ops_t *pvops; + dmu_tx_t *tx; + char *oldvdpath, *newvdpath; + int newvd_isspare; + int error; + + txg = spa_vdev_enter(spa); + + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (oldvd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!oldvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + pvd = oldvd->vdev_parent; + + if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + newvd = newrootvd->vdev_child[0]; + + if (!newvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + if ((error = vdev_create(newrootvd, txg, replacing)) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, error)); + + /* + * Spares can't replace logs + */ + if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + if (!replacing) { + /* + * For attach, the only allowable parent is a mirror or the root + * vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + pvops = &vdev_mirror_ops; + } else { + /* + * Active hot spares can only be replaced by inactive hot + * spares. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + pvd->vdev_child[1] == oldvd && + !spa_has_spare(spa, newvd->vdev_guid)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * If the source is a hot spare, and the parent isn't already a + * spare, then we want to create a new hot spare. Otherwise, we + * want to create a replacing vdev. The user is not allowed to + * attach to a spared vdev child unless the 'isspare' state is + * the same (spare replaces spare, non-spare replaces + * non-spare). + */ + if (pvd->vdev_ops == &vdev_replacing_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + else if (pvd->vdev_ops != &vdev_spare_ops && + newvd->vdev_isspare) + pvops = &vdev_spare_ops; + else + pvops = &vdev_replacing_ops; + } + + /* + * Compare the new device size with the replaceable/attachable + * device size. + */ + if (newvd->vdev_psize < vdev_get_rsize(oldvd)) + return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + + /* + * The new device cannot have a higher alignment requirement + * than the top-level vdev. + */ + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) + return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + + /* + * If this is an in-place replacement, update oldvd's path and devid + * to make it distinguishable from newvd, and unopenable from now on. + */ + if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + spa_strfree(oldvd->vdev_path); + oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + KM_SLEEP); + (void) sprintf(oldvd->vdev_path, "%s/%s", + newvd->vdev_path, "old"); + if (oldvd->vdev_devid != NULL) { + spa_strfree(oldvd->vdev_devid); + oldvd->vdev_devid = NULL; + } + } + + /* + * If the parent is not a mirror, or if we're replacing, insert the new + * mirror/replacing/spare vdev above oldvd. + */ + if (pvd->vdev_ops != pvops) + pvd = vdev_add_parent(oldvd, pvops); + + ASSERT(pvd->vdev_top->vdev_parent == rvd); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + + /* + * Extract the new device from its root and add it to pvd. + */ + vdev_remove_child(newrootvd, newvd); + newvd->vdev_id = pvd->vdev_children; + vdev_add_child(pvd, newvd); + + /* + * If newvd is smaller than oldvd, but larger than its rsize, + * the addition of newvd may have decreased our parent's asize. + */ + pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); + + tvd = newvd->vdev_top; + ASSERT(pvd->vdev_top == tvd); + ASSERT(tvd->vdev_parent == rvd); + + vdev_config_dirty(tvd); + + /* + * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate + * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + */ + open_txg = txg + TXG_CONCURRENT_STATES - 1; + + mutex_enter(&newvd->vdev_dtl_lock); + space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, + open_txg - TXG_INITIAL + 1); + mutex_exit(&newvd->vdev_dtl_lock); + + if (newvd->vdev_isspare) + spa_spare_activate(newvd); + oldvdpath = spa_strdup(oldvd->vdev_path); + newvdpath = spa_strdup(newvd->vdev_path); + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(tx, TXG_WAIT) == 0) { + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + + spa_strfree(oldvdpath); + spa_strfree(newvdpath); + + /* + * Kick off a resilver to update newvd. + */ + VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); + + return (0); +} + +/* + * Detach a device from a mirror or replacing vdev. + * If 'replace_done' is specified, only detach if the parent + * is a replacing vdev. + */ +int +spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) +{ + uint64_t txg; + int c, t, error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *pvd, *cvd, *tvd; + boolean_t unspare = B_FALSE; + uint64_t unspare_guid; + size_t len; + + txg = spa_vdev_enter(spa); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + pvd = vd->vdev_parent; + + /* + * If replace_done is specified, only remove this device if it's + * the first child of a replacing vdev. For the 'spare' vdev, either + * disk can be removed. + */ + if (replace_done) { + if (pvd->vdev_ops == &vdev_replacing_ops) { + if (vd->vdev_id != 0) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } else if (pvd->vdev_ops != &vdev_spare_ops) { + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } + } + + ASSERT(pvd->vdev_ops != &vdev_spare_ops || + spa_version(spa) >= SPA_VERSION_SPARES); + + /* + * Only mirror, replacing, and spare vdevs support detach. + */ + if (pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * If there's only one replica, you can't detach it. + */ + if (pvd->vdev_children <= 1) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * If all siblings have non-empty DTLs, this device may have the only + * valid copy of the data, which means we cannot safely detach it. + * + * XXX -- as in the vdev_offline() case, we really want a more + * precise DTL check. + */ + for (c = 0; c < pvd->vdev_children; c++) { + uint64_t dirty; + + cvd = pvd->vdev_child[c]; + if (cvd == vd) + continue; + if (vdev_is_dead(cvd)) + continue; + mutex_enter(&cvd->vdev_dtl_lock); + dirty = cvd->vdev_dtl_map.sm_space | + cvd->vdev_dtl_scrub.sm_space; + mutex_exit(&cvd->vdev_dtl_lock); + if (!dirty) + break; + } + + if (c == pvd->vdev_children) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * If we are detaching the second disk from a replacing vdev, then + * check to see if we changed the original vdev's path to have "/old" + * at the end in spa_vdev_attach(). If so, undo that change now. + */ + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && + pvd->vdev_child[0]->vdev_path != NULL && + pvd->vdev_child[1]->vdev_path != NULL) { + ASSERT(pvd->vdev_child[1] == vd); + cvd = pvd->vdev_child[0]; + len = strlen(vd->vdev_path); + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + } + } + + /* + * If we are detaching the original disk from a spare, then it implies + * that the spare should become a real disk, and be removed from the + * active spare list for the pool. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_id == 0) + unspare = B_TRUE; + + /* + * Erase the disk labels so the disk can be used for other things. + * This must be done after all other error cases are handled, + * but before we disembowel vd (so we can still do I/O to it). + * But if we can't do it, don't treat the error as fatal -- + * it may be that the unwritability of the disk is the reason + * it's being detached! + */ + error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + /* + * Remove vd from its parent and compact the parent's children. + */ + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + /* + * Remember one of the remaining children so we can get tvd below. + */ + cvd = pvd->vdev_child[0]; + + /* + * If we need to remove the remaining child from the list of hot spares, + * do it now, marking the vdev as no longer a spare in the process. We + * must do this before vdev_remove_parent(), because that can change the + * GUID if it creates a new toplevel GUID. + */ + if (unspare) { + ASSERT(cvd->vdev_isspare); + spa_spare_remove(cvd); + unspare_guid = cvd->vdev_guid; + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) + vdev_remove_parent(cvd); + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + /* + * If the device we just detached was smaller than the others, it may be + * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() + * can't fail because the existing metaslabs are already in core, so + * there's nothing to read from disk. + */ + VERIFY(vdev_metaslab_init(tvd, txg) == 0); + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. + */ + for (t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + + error = spa_vdev_exit(spa, vd, txg, 0); + + /* + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. + */ + if (unspare) { + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa->spa_state != POOL_STATE_ACTIVE) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid; + + VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for (int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + +/* + * Remove a device from the pool. Currently, this supports removing only hot + * spares and level 2 ARC devices. + */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, **l2cache, *nv; + uint_t nspares, nl2cache; + uint64_t txg; + int error = 0; + + txg = spa_vdev_enter(spa); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = EBUSY; + } + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + /* + * Cache devices can always be removed. + */ + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL) { + /* + * Normal vdevs cannot be removed (yet). + */ + error = ENOTSUP; + } else { + /* + * There is no vdev of any kind with the specified guid. + */ + error = ENOENT; + } + + return (spa_vdev_exit(spa, NULL, txg, error)); +} + +/* + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * current spared, so we can detach it. + */ +static vdev_t * +spa_vdev_resilver_done_hunt(vdev_t *vd) +{ + vdev_t *newvd, *oldvd; + int c; + + for (c = 0; c < vd->vdev_children; c++) { + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); + if (oldvd != NULL) + return (oldvd); + } + + /* + * Check for a completed replacement. + */ + if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + oldvd = vd->vdev_child[0]; + newvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + + /* + * Check for a completed resilver with the 'unspare' flag set. + */ + if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { + newvd = vd->vdev_child[0]; + oldvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_unspare && + newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + newvd->vdev_unspare = 0; + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + + return (NULL); +} + +static void +spa_vdev_resilver_done(spa_t *spa) +{ + vdev_t *vd; + vdev_t *pvd; + uint64_t guid; + uint64_t pguid = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + guid = vd->vdev_guid; + /* + * If we have just finished replacing a hot spared device, then + * we need to detach the parent's first child (the original hot + * spare) as well. + */ + pvd = vd->vdev_parent; + if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && + pvd->vdev_id == 0) { + ASSERT(pvd->vdev_ops == &vdev_replacing_ops); + ASSERT(pvd->vdev_parent->vdev_children == 2); + pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + } + spa_config_exit(spa, SCL_CONFIG, FTAG); + if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + return; + if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + return; + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* + * Update the stored path for this vdev. Dirty the vdev configuration, relying + * on spa_vdev_enter/exit() to synchronize the labels and cache. + */ +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + vdev_t *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { + /* + * Determine if this is a reference to a hot spare device. If + * it is, update the path manually as there is no associated + * vdev_t that can be synced to disk. + */ + nvlist_t **spares; + uint_t i, nspares; + + if (spa->spa_spares.sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array( + spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0); + for (i = 0; i < nspares; i++) { + uint64_t theguid; + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) { + VERIFY(nvlist_add_string(spares[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, + 0)); + } + } + } + + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(newpath); + + vdev_config_dirty(vd->vdev_top); + + return (spa_vdev_exit(spa, NULL, txg, 0)); +} + +/* + * ========================================================================== + * SPA Scrubbing + * ========================================================================== + */ + +int +spa_scrub(spa_t *spa, pool_scrub_type_t type) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if ((uint_t)type >= POOL_SCRUB_TYPES) + return (ENOTSUP); + + /* + * If a resilver was requested, but there is no DTL on a + * writeable leaf device, we have nothing to do. + */ + if (type == POOL_SCRUB_RESILVER && + !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + return (0); + } + + if (type == POOL_SCRUB_EVERYTHING && + spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && + spa->spa_dsl_pool->dp_scrub_isresilver) + return (EBUSY); + + if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { + return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); + } else if (type == POOL_SCRUB_NONE) { + return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); + } else { + return (EINVAL); + } +} + +/* + * ========================================================================== + * SPA async task processing + * ========================================================================== + */ + +static void +spa_async_remove(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_remove_wanted) { + vd->vdev_remove_wanted = 0; + vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); + vdev_clear(spa, vd); + vdev_state_dirty(vd->vdev_top); + } + + for (int c = 0; c < vd->vdev_children; c++) + spa_async_remove(spa, vd->vdev_child[c]); +} + +static void +spa_async_probe(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_probe_wanted) { + vd->vdev_probe_wanted = 0; + vdev_reopen(vd); /* vdev_open() does the actual probe */ + } + + for (int c = 0; c < vd->vdev_children; c++) + spa_async_probe(spa, vd->vdev_child[c]); +} + +static void +spa_async_thread(spa_t *spa) +{ + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + spa->spa_async_tasks = 0; + mutex_exit(&spa->spa_async_lock); + + /* + * See if the config needs to be updated. + */ + if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + mutex_enter(&spa_namespace_lock); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + mutex_exit(&spa_namespace_lock); + } + + /* + * See if any devices need to be marked REMOVED. + */ + if (tasks & SPA_ASYNC_REMOVE) { + spa_vdev_state_enter(spa); + spa_async_remove(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + /* + * See if any devices need to be probed. + */ + if (tasks & SPA_ASYNC_PROBE) { + spa_vdev_state_enter(spa); + spa_async_probe(spa, spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + /* + * If any devices are done replacing, detach them. + */ + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); + + /* + * Kick off a resilver. + */ + if (tasks & SPA_ASYNC_RESILVER) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); + + /* + * Let the world know that we're done. + */ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +void +spa_async_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_suspended++; + while (spa->spa_async_thread != NULL) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + ASSERT(spa->spa_async_suspended != 0); + spa->spa_async_suspended--; + mutex_exit(&spa->spa_async_lock); +} + +static void +spa_async_dispatch(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if (spa->spa_async_tasks && !spa->spa_async_suspended && + spa->spa_async_thread == NULL && + rootdir != NULL && !vn_is_readonly(rootdir)) + spa->spa_async_thread = thread_create(NULL, 0, + spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_request(spa_t *spa, int task) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks |= task; + mutex_exit(&spa->spa_async_lock); +} + +/* + * ========================================================================== + * SPA syncing routines + * ========================================================================== + */ + +static void +spa_sync_deferred_frees(spa_t *spa, uint64_t txg) +{ + bplist_t *bpl = &spa->spa_sync_bplist; + dmu_tx_t *tx; + blkptr_t blk; + uint64_t itor = 0; + zio_t *zio; + int error; + uint8_t c = 1; + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + while (bplist_iterate(bpl, &itor, &blk) == 0) { + ASSERT(blk.blk_birth < txg); + zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); + } + + error = zio_wait(zio); + ASSERT3U(error, ==, 0); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + bplist_vacate(bpl, tx); + + /* + * Pre-dirty the first block so we sync to convergence faster. + * (Usually only the first block is needed.) + */ + dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); + dmu_tx_commit(tx); +} + +static void +spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) +{ + char *packed = NULL; + size_t bufsize; + size_t nvsize = 0; + dmu_buf_t *db; + + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); + + /* + * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration + * information. This avoids the dbuf_will_dirty() path and + * saves us a pre-read to get data we don't actually care about. + */ + bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + packed = kmem_alloc(bufsize, KM_SLEEP); + + VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); + bzero(packed + nvsize, bufsize - nvsize); + + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); + + kmem_free(packed, bufsize); + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = nvsize; + dmu_buf_rele(db, FTAG); +} + +static void +spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, + const char *config, const char *entry) +{ + nvlist_t *nvroot; + nvlist_t **list; + int i; + + if (!sav->sav_sync) + return; + + /* + * Update the MOS nvlist describing the list of available devices. + * spa_validate_aux() will have already made sure this nvlist is + * valid and the vdevs are labeled appropriately. + */ + if (sav->sav_object == 0) { + sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, + &sav->sav_object, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (sav->sav_count == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); + } else { + list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], + B_FALSE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(nvroot, config, list, + sav->sav_count) == 0); + for (i = 0; i < sav->sav_count; i++) + nvlist_free(list[i]); + kmem_free(list, sav->sav_count * sizeof (void *)); + } + + spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); + nvlist_free(nvroot); + + sav->sav_sync = B_FALSE; +} + +static void +spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *config; + + if (list_is_empty(&spa->spa_config_dirty_list)) + return; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + config = spa_config_generate(spa, spa->spa_root_vdev, + dmu_tx_get_txg(tx), B_FALSE); + + spa_config_exit(spa, SCL_STATE, FTAG); + + if (spa->spa_config_syncing) + nvlist_free(spa->spa_config_syncing); + spa->spa_config_syncing = config; + + spa_sync_nvlist(spa, spa->spa_config_object, config, tx); +} + +/* + * Set zpool properties. + */ +static void +spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + objset_t *mos = spa->spa_meta_objset; + nvlist_t *nvp = arg2; + nvpair_t *elem; + uint64_t intval; + char *strval; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + spa_config_dirent_t *dp; + + mutex_enter(&spa->spa_props_lock); + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem))) { + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPOOL_PROP_VERSION: + /* + * Only set version for non-zpool-creation cases + * (set/import). spa_create() needs special care + * for version setting. + */ + if (tx->tx_txg != TXG_INITIAL) { + VERIFY(nvpair_value_uint64(elem, + &intval) == 0); + ASSERT(intval <= SPA_VERSION); + ASSERT(intval >= spa_version(spa)); + spa->spa_uberblock.ub_version = intval; + vdev_config_dirty(spa->spa_root_vdev); + } + break; + + case ZPOOL_PROP_ALTROOT: + /* + * 'altroot' is a non-persistent property. It should + * have been set temporarily at creation or import time. + */ + ASSERT(spa->spa_root != NULL); + break; + + case ZPOOL_PROP_CACHEFILE: + /* + * 'cachefile' is a non-persistent property, but note + * an async request that the config cache needs to be + * udpated. + */ + VERIFY(nvpair_value_string(elem, &strval) == 0); + + dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); + + if (strval[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(strval, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(strval); + + list_insert_head(&spa->spa_config_list, dp); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + break; + default: + /* + * Set pool property values in the poolprops mos object. + */ + if (spa->spa_pool_props_object == 0) { + objset_t *mos = spa->spa_meta_objset; + + VERIFY((spa->spa_pool_props_object = + zap_create(mos, DMU_OT_POOL_PROPS, + DMU_OT_NONE, 0, tx)) > 0); + + VERIFY(zap_update(mos, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, + 8, 1, &spa->spa_pool_props_object, tx) + == 0); + } + + /* normalize the property name */ + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + ASSERT(proptype == PROP_TYPE_STRING); + VERIFY(nvpair_value_string(elem, &strval) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, propname, + 1, strlen(strval) + 1, strval, tx) == 0); + + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + + if (proptype == PROP_TYPE_INDEX) { + const char *unused; + VERIFY(zpool_prop_index_to_string( + prop, intval, &unused) == 0); + } + VERIFY(zap_update(mos, + spa->spa_pool_props_object, propname, + 8, 1, &intval, tx) == 0); + } else { + ASSERT(0); /* not allowed */ + } + + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + default: + break; + } + } + + /* log internal history if this is not a zpool create */ + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && + tx->tx_txg != TXG_INITIAL) { + spa_history_internal_log(LOG_POOL_PROPSET, + spa, tx, cr, "%s %lld %s", + nvpair_name(elem), intval, spa_name(spa)); + } + } + + mutex_exit(&spa->spa_props_lock); +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + objset_t *mos = spa->spa_meta_objset; + bplist_t *bpl = &spa->spa_sync_bplist; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + dmu_tx_t *tx; + int dirty_vdevs; + int error; + + /* + * Lock out configuration changes. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_STATE, FTAG); + + VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. + */ + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { + int i; + + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY(0 == zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && + spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { + dsl_pool_create_origin(dp, tx); + + /* Keeping the origin open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { + dsl_pool_upgrade_clones(dp, tx); + } + + /* + * If anything has changed in this txg, push the deferred frees + * from the previous txg. If not, leave them alone so that we + * don't generate work on an otherwise idle system. + */ + if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || + !txg_list_empty(&dp->dp_dirty_dirs, txg) || + !txg_list_empty(&dp->dp_sync_tasks, txg)) + spa_sync_deferred_frees(spa, txg); + + /* + * Iterate to convergence. + */ + do { + spa->spa_sync_pass++; + + spa_sync_config_object(spa, tx); + spa_sync_aux_dev(spa, &spa->spa_spares, tx, + ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); + spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, + ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); + spa_errlog_sync(spa, txg); + dsl_pool_sync(dp, txg); + + dirty_vdevs = 0; + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { + vdev_sync(vd, txg); + dirty_vdevs++; + } + + bplist_sync(bpl, tx); + } while (dirty_vdevs); + + bplist_close(bpl); + + dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); + + /* + * Rewrite the vdev configuration (which includes the uberblock) + * to commit the transaction group. + * + * If there are no dirty vdevs, we sync the uberblock to a few + * random top-level vdevs that are known to be visible in the + * config cache (see spa_vdev_add() for a complete description). + * If there *are* dirty vdevs, sync the uberblock to all vdevs. + */ + for (;;) { + /* + * We hold SCL_STATE to prevent vdev open/close/etc. + * while we're attempting to write the vdev labels. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (list_is_empty(&spa->spa_config_dirty_list)) { + vdev_t *svd[SPA_DVAS_PER_BP]; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + int c; + + for (c = 0; c < children; c++) { + vd = rvd->vdev_child[(c0 + c) % children]; + if (vd->vdev_ms_array == 0 || vd->vdev_islog) + continue; + svd[svdcount++] = vd; + if (svdcount == SPA_DVAS_PER_BP) + break; + } + error = vdev_config_sync(svd, svdcount, txg); + } else { + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg); + } + + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error == 0) + break; + zio_suspend(spa, NULL); + zio_resume_wait(spa); + } + dmu_tx_commit(tx); + + /* + * Clear the dirty config list. + */ + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) + vdev_config_clean(vd); + + /* + * Now that the new config has synced transactionally, + * let it become visible to the config cache. + */ + if (spa->spa_config_syncing != NULL) { + spa_config_set(spa, spa->spa_config_syncing); + spa->spa_config_txg = txg; + spa->spa_config_syncing = NULL; + } + + spa->spa_ubsync = spa->spa_uberblock; + + /* + * Clean up the ZIL records for the synced txg. + */ + dsl_pool_zil_clean(dp); + + /* + * Update usable space statistics. + */ + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + vdev_sync_done(vd, txg); + + /* + * It had better be the case that we didn't dirty anything + * since vdev_config_sync(). + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + ASSERT(bpl->bpl_queue == NULL); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * If any async tasks have been requested, kick them off. + */ + spa_async_dispatch(spa); +} + +/* + * Sync all pools. We don't want to hold the namespace lock across these + * operations, so we take a reference on the spa_t and drop the lock during the + * sync. + */ +void +spa_sync_allpools(void) +{ + spa_t *spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); +} + +/* + * ========================================================================== + * Miscellaneous routines + * ========================================================================== + */ + +/* + * Remove all pools in the system. + */ +void +spa_evict_all(void) +{ + spa_t *spa; + + /* + * Remove all cached state. All pools should be closed now, + * so every spa in the AVL tree should be unreferenced. + */ + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(NULL)) != NULL) { + /* + * Stop async tasks. The async thread may need to detach + * a device that's been replaced, which requires grabbing + * spa_namespace_lock, so we must drop it here. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); +} + +vdev_t * +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) +{ + vdev_t *vd; + int i; + + if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) + return (vd); + + if (l2cache) { + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd = spa->spa_l2cache.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + } + + return (NULL); +} + +void +spa_upgrade(spa_t *spa, uint64_t version) +{ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * This should only be called for a non-faulted pool, and since a + * future version would result in an unopenable pool, this shouldn't be + * possible. + */ + ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); + ASSERT(version >= spa->spa_uberblock.ub_version); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, SCL_ALL, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); +} + +boolean_t +spa_has_spare(spa_t *spa, uint64_t guid) +{ + int i; + uint64_t spareguid; + spa_aux_vdev_t *sav = &spa->spa_spares; + + for (i = 0; i < sav->sav_count; i++) + if (sav->sav_vdevs[i]->vdev_guid == guid) + return (B_TRUE); + + for (i = 0; i < sav->sav_npending; i++) { + if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, + &spareguid) == 0 && spareguid == guid) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a pool has an active shared spare device. + * Note: reference count of an active spare is 2, as a spare and as a replace + */ +static boolean_t +spa_has_active_shared_spare(spa_t *spa) +{ + int i, refcnt; + uint64_t pool; + spa_aux_vdev_t *sav = &spa->spa_spares; + + for (i = 0; i < sav->sav_count; i++) { + if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, + &refcnt) && pool != 0ULL && pool == spa_guid(spa) && + refcnt > 2) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. + */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +{ +#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; + sysevent_id_t eid; + + ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", + SE_SLEEP); + + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = spa_name(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) + goto done; + + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = spa_guid(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) + goto done; + + if (vd) { + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = vd->vdev_guid; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, + SE_SLEEP) != 0) + goto done; + + if (vd->vdev_path) { + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = vd->vdev_path; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, + &value, SE_SLEEP) != 0) + goto done; + } + } + + if (sysevent_attach_attributes(ev, attr) != 0) + goto done; + attr = NULL; + + (void) log_sysevent(ev, SE_SLEEP, &eid); + +done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +#endif +} diff --git a/module/zfs/spa_boot.c b/module/zfs/spa_boot.c new file mode 100644 index 000000000..49e9e5019 --- /dev/null +++ b/module/zfs/spa_boot.c @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/sunddi.h> + +char * +spa_get_bootprop(char *propname) +{ + char *value; + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS) + return (NULL); + return (value); +} + +void +spa_free_bootprop(char *value) +{ + ddi_prop_free(value); +} diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c new file mode 100644 index 000000000..ee425a916 --- /dev/null +++ b/module/zfs/spa_config.c @@ -0,0 +1,444 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/nvpair.h> +#include <sys/uio.h> +#include <sys/fs/zfs.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/utsname.h> +#include <sys/systeminfo.h> +#include <sys/sunddi.h> +#ifdef _KERNEL +#include <sys/kobj.h> +#endif + +/* + * Pool configuration repository. + * + * Pool configuration is stored as a packed nvlist on the filesystem. By + * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot + * (when the ZFS module is loaded). Pools can also have the 'cachefile' + * property set that allows them to be stored in an alternate location until + * the control of external software. + * + * For each cache file, we have a single nvlist which holds all the + * configuration information. When the module loads, we read this information + * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is + * maintained independently in spa.c. Whenever the namespace is modified, or + * the configuration of a pool is changed, we call spa_config_sync(), which + * walks through all the active pools and writes the configuration to disk. + */ + +static uint64_t spa_config_generation = 1; + +/* + * This can be overridden in userland to preserve an alternate namespace for + * userland pools when doing testing. + */ +const char *spa_config_path = ZPOOL_CACHE; + +/* + * Called when the module is first loaded, this routine loads the configuration + * file into the SPA namespace. It does not actually open or load the pools; it + * only populates the namespace. + */ +void +spa_config_load(void) +{ + void *buf = NULL; + nvlist_t *nvlist, *child; + nvpair_t *nvpair; + spa_t *spa; + char *pathname; + struct _buf *file; + uint64_t fsize; + + /* + * Open the configuration file. + */ + pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(pathname, MAXPATHLEN, "%s%s", + (rootdir != NULL) ? "./" : "", spa_config_path); + + file = kobj_open_file(pathname); + + kmem_free(pathname, MAXPATHLEN); + + if (file == (struct _buf *)-1) + return; + + if (kobj_get_filesize(file, &fsize) != 0) + goto out; + + buf = kmem_alloc(fsize, KM_SLEEP); + + /* + * Read the nvlist from the file. + */ + if (kobj_read_file(file, buf, fsize, 0) < 0) + goto out; + + /* + * Unpack the nvlist. + */ + if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) + goto out; + + /* + * Iterate over all elements in the nvlist, creating a new spa_t for + * each one with the specified configuration. + */ + mutex_enter(&spa_namespace_lock); + nvpair = NULL; + while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { + + if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) + continue; + + VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); + + if (spa_lookup(nvpair_name(nvpair)) != NULL) + continue; + spa = spa_add(nvpair_name(nvpair), NULL); + + /* + * We blindly duplicate the configuration here. If it's + * invalid, we will catch it when the pool is first opened. + */ + VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); + } + mutex_exit(&spa_namespace_lock); + + nvlist_free(nvlist); + +out: + if (buf != NULL) + kmem_free(buf, fsize); + + kobj_close_file(file); +} + +static void +spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) +{ + size_t buflen; + char *buf; + vnode_t *vp; + int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; + char *temp; + + /* + * If the nvlist is empty (NULL), then remove the old cachefile. + */ + if (nvl == NULL) { + (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); + return; + } + + /* + * Pack the configuration into a buffer. + */ + VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0); + + buf = kmem_alloc(buflen, KM_SLEEP); + temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, + KM_SLEEP) == 0); + + /* + * Write the configuration to disk. We need to do the traditional + * 'write to temporary file, sync, move over original' to make sure we + * always have a consistent view of the data. + */ + (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); + + if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) { + if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL) == 0 && + VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) { + (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE); + } + (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); + VN_RELE(vp); + } + + (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); + + kmem_free(buf, buflen); + kmem_free(temp, MAXPATHLEN); +} + +/* + * Synchronize pool configuration to disk. This must be called with the + * namespace lock held. + */ +void +spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) +{ + spa_config_dirent_t *dp, *tdp; + nvlist_t *nvl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Iterate over all cachefiles for the pool, past or present. When the + * cachefile is changed, the new one is pushed onto this list, allowing + * us to update previous cachefiles that no longer contain this pool. + */ + for (dp = list_head(&target->spa_config_list); dp != NULL; + dp = list_next(&target->spa_config_list, dp)) { + spa_t *spa = NULL; + if (dp->scd_path == NULL) + continue; + + /* + * Iterate over all pools, adding any matching pools to 'nvl'. + */ + nvl = NULL; + while ((spa = spa_next(spa)) != NULL) { + if (spa == target && removing) + continue; + + mutex_enter(&spa->spa_props_lock); + tdp = list_head(&spa->spa_config_list); + if (spa->spa_config == NULL || + tdp->scd_path == NULL || + strcmp(tdp->scd_path, dp->scd_path) != 0) { + mutex_exit(&spa->spa_props_lock); + continue; + } + + if (nvl == NULL) + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, + spa->spa_config) == 0); + mutex_exit(&spa->spa_props_lock); + } + + spa_config_write(dp, nvl); + nvlist_free(nvl); + } + + /* + * Remove any config entries older than the current one. + */ + dp = list_head(&target->spa_config_list); + while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { + list_remove(&target->spa_config_list, tdp); + if (tdp->scd_path != NULL) + spa_strfree(tdp->scd_path); + kmem_free(tdp, sizeof (spa_config_dirent_t)); + } + + spa_config_generation++; + + if (postsysevent) + spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); +} + +/* + * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, + * and we don't want to allow the local zone to see all the pools anyway. + * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration + * information for all pool visible within the zone. + */ +nvlist_t * +spa_all_configs(uint64_t *generation) +{ + nvlist_t *pools; + spa_t *spa = NULL; + + if (*generation == spa_config_generation) + return (NULL); + + VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (INGLOBALZONE(curproc) || + zone_dataset_visible(spa_name(spa), NULL)) { + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_add_nvlist(pools, spa_name(spa), + spa->spa_config) == 0); + mutex_exit(&spa->spa_props_lock); + } + } + *generation = spa_config_generation; + mutex_exit(&spa_namespace_lock); + + return (pools); +} + +void +spa_config_set(spa_t *spa, nvlist_t *config) +{ + mutex_enter(&spa->spa_props_lock); + if (spa->spa_config != NULL) + nvlist_free(spa->spa_config); + spa->spa_config = config; + mutex_exit(&spa->spa_props_lock); +} + +/* + * Generate the pool's configuration based on the current in-core state. + * We infer whether to generate a complete config or just one top-level config + * based on whether vd is the root vdev. + */ +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + nvlist_t *config, *nvroot; + vdev_t *rvd = spa->spa_root_vdev; + unsigned long hostid = 0; + boolean_t locked = B_FALSE; + + if (vd == NULL) { + vd = rvd; + locked = B_TRUE; + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE)); + + /* + * If txg is -1, report the current value of spa->spa_config_txg. + */ + if (txg == -1ULL) + txg = spa->spa_config_txg; + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + spa_name(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + spa_state(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_guid(spa)) == 0); + (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); + if (hostid != 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + } + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, + utsname.nodename) == 0); + + if (vd != rvd) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + if (vd->vdev_isspare) + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, + 1ULL) == 0); + if (vd->vdev_islog) + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, + 1ULL) == 0); + vd = vd->vdev_top; /* label contains top config */ + } + + nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + + if (locked) + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + return (config); +} + +/* + * For a pool that's not currently a booting rootpool, update all disk labels, + * generate a fresh config based on the current in-core state, and sync the + * global config cache. + */ +void +spa_config_update(spa_t *spa, int what) +{ + spa_config_update_common(spa, what, FALSE); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +void +spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int c; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + txg = spa_last_synced_txg(spa) + 1; + if (what == SPA_CONFIG_UPDATE_POOL) { + vdev_config_dirty(rvd); + } else { + /* + * If we have top-level vdevs that were added but have + * not yet been prepared for allocation, do that now. + * (It's safe now because the config cache is up to date, + * so it will be able to translate the new DVAs.) + * See comments in spa_vdev_add() for full details. + */ + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + if (tvd->vdev_ms_array == 0) { + vdev_init(tvd, txg); + vdev_config_dirty(tvd); + } + } + } + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Wait for the mosconfig to be regenerated and synced. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + /* + * Update the global config cache to reflect the new mosconfig. + */ + if (!isroot) + spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); + + if (what == SPA_CONFIG_UPDATE_POOL) + spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); +} diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c new file mode 100644 index 000000000..c642bd768 --- /dev/null +++ b/module/zfs/spa_errlog.c @@ -0,0 +1,437 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Routines to manage the on-disk persistent error log. + * + * Each pool stores a log of all logical data errors seen during normal + * operation. This is actually the union of two distinct logs: the last log, + * and the current log. All errors seen are logged to the current log. When a + * scrub completes, the current log becomes the last log, the last log is thrown + * out, and the current log is reinitialized. This way, if an error is somehow + * corrected, a new scrub will show that that it no longer exists, and will be + * deleted from the log when the scrub completes. + * + * The log is stored using a ZAP object whose key is a string form of the + * zbookmark tuple (objset, object, level, blkid), and whose contents is an + * optional 'objset:object' human-readable string describing the data. When an + * error is first logged, this string will be empty, indicating that no name is + * known. This prevents us from having to issue a potentially large amount of + * I/O to discover the object name during an error path. Instead, we do the + * calculation when the data is requested, storing the result so future queries + * will be faster. + * + * This log is then shipped into an nvlist where the key is the dataset name and + * the value is the object name. Userland is then responsible for uniquifying + * this list and displaying it to the user. + */ + +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> +#include <sys/zio.h> + +/* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexidecimal numbers that don't overflow. + */ +#ifdef _KERNEL +static uint64_t +strtonum(char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + *nptr = str; + + return (val); +} +#endif + +/* + * Convert a bookmark to a string. + */ +static void +bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); +} + +/* + * Convert a string to a bookmark + */ +#ifdef _KERNEL +static void +name_to_bookmark(char *buf, zbookmark_t *zb) +{ + zb->zb_objset = strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} +#endif + +/* + * Log an uncorrectable error to the persistent error log. We add it to the + * spa's list of pending errors. The changes are actually synced out to disk + * during spa_errlog_sync(). + */ +void +spa_log_error(spa_t *spa, zio_t *zio) +{ + zbookmark_t *zb = &zio->io_logical->io_bookmark; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_tree_t *tree; + avl_index_t where; + + /* + * If we are trying to import a pool, ignore any errors, as we won't be + * writing to the pool any time soon. + */ + if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + return; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * If we have had a request to rotate the log, log it to the next list + * instead of the current one. + */ + if (spa->spa_scrub_active || spa->spa_scrub_finished) + tree = &spa->spa_errlist_scrub; + else + tree = &spa->spa_errlist_last; + + search.se_bookmark = *zb; + if (avl_find(tree, &search, &where) != NULL) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *zb; + avl_insert(tree, new, where); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Return the number of errors currently in the error log. This is actually the + * sum of both the last log and the current log, since we don't know the union + * of these logs until we reach userland. + */ +uint64_t +spa_get_errlog_size(spa_t *spa) +{ + uint64_t total = 0, count; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + + return (total); +} + +#ifdef _KERNEL +static int +process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +{ + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_t zb; + + if (obj == 0) + return (0); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + + if (*count == 0) { + zap_cursor_fini(&zc); + return (ENOMEM); + } + + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)addr + + (*count - 1) * sizeof (zbookmark_t), + sizeof (zbookmark_t)) != 0) + return (EFAULT); + + *count -= 1; + } + + zap_cursor_fini(&zc); + + return (0); +} + +static int +process_error_list(avl_tree_t *list, void *addr, size_t *count) +{ + spa_error_entry_t *se; + + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + + if (*count == 0) + return (ENOMEM); + + if (copyout(&se->se_bookmark, (char *)addr + + (*count - 1) * sizeof (zbookmark_t), + sizeof (zbookmark_t)) != 0) + return (EFAULT); + + *count -= 1; + } + + return (0); +} +#endif + +/* + * Copy all known errors to userland as an array of bookmarks. This is + * actually a union of the on-disk last log and current log, as well as any + * pending error requests. + * + * Because the act of reading the on-disk log could cause errors to be + * generated, we have two separate locks: one for the error log and one for the + * in-core error lists. We only need the error list lock to log and error, so + * we grab the error log lock while we read the on-disk logs, and only pick up + * the error list lock when we are finished. + */ +int +spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +{ + int ret = 0; + +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + + ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); + + if (!ret && !spa->spa_scrub_finished) + ret = process_error_log(spa, spa->spa_errlog_last, uaddr, + count); + + mutex_enter(&spa->spa_errlist_lock); + if (!ret) + ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + count); + if (!ret) + ret = process_error_list(&spa->spa_errlist_last, uaddr, + count); + mutex_exit(&spa->spa_errlist_lock); + + mutex_exit(&spa->spa_errlog_lock); +#endif + + return (ret); +} + +/* + * Called when a scrub completes. This simply set a bit which tells which AVL + * tree to add new errors. spa_errlog_sync() is responsible for actually + * syncing the changes to the underlying objects. + */ +void +spa_errlog_rotate(spa_t *spa) +{ + mutex_enter(&spa->spa_errlist_lock); + spa->spa_scrub_finished = B_TRUE; + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Discard any pending errors from the spa_t. Called when unloading a faulted + * pool, as the errors encountered during the open cannot be synced to disk. + */ +void +spa_errlog_drain(spa_t *spa) +{ + spa_error_entry_t *se; + void *cookie; + + mutex_enter(&spa->spa_errlist_lock); + + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_last, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Process a list of errors into the current on-disk log. + */ +static void +sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) +{ + spa_error_entry_t *se; + char buf[64]; + void *cookie; + + if (avl_numnodes(t) != 0) { + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, + 0, tx); + + /* add errors to the current log */ + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, + *obj, buf, 1, strlen(name) + 1, name, tx); + } + + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Sync the error log out to disk. This is a little tricky because the act of + * writing the error log requires the spa_errlist_lock. So, we need to lock the + * error lists, take a copy of the lists, and then reinitialize them. Then, we + * drop the error list lock and take the error log lock, at which point we + * do the errlog processing. Then, if we encounter an I/O error during this + * process, we can successfully add the error to the list. Note that this will + * result in the perpetual recycling of errors, but it is an unlikely situation + * and not a performance critical operation. + */ +void +spa_errlog_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + avl_tree_t scrub, last; + int scrub_finished; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * Bail out early under normal circumstances. + */ + if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && + avl_numnodes(&spa->spa_errlist_last) == 0 && + !spa->spa_scrub_finished) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + spa_get_errlists(spa, &last, &scrub); + scrub_finished = spa->spa_scrub_finished; + spa->spa_scrub_finished = B_FALSE; + + mutex_exit(&spa->spa_errlist_lock); + mutex_enter(&spa->spa_errlog_lock); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + /* + * Sync out the current list of errors. + */ + sync_error_list(spa, &last, &spa->spa_errlog_last, tx); + + /* + * Rotate the log if necessary. + */ + if (scrub_finished) { + if (spa->spa_errlog_last != 0) + VERIFY(dmu_object_free(spa->spa_meta_objset, + spa->spa_errlog_last, tx) == 0); + spa->spa_errlog_last = spa->spa_errlog_scrub; + spa->spa_errlog_scrub = 0; + + sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); + } + + /* + * Sync out any pending scrub errors. + */ + sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); + + /* + * Update the MOS to reflect the new values. + */ + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, + &spa->spa_errlog_last, tx); + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, + &spa->spa_errlog_scrub, tx); + + dmu_tx_commit(tx); + + mutex_exit(&spa->spa_errlog_lock); +} diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c new file mode 100644 index 000000000..c997240c1 --- /dev/null +++ b/module/zfs/spa_history.c @@ -0,0 +1,428 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/utsname.h> +#include <sys/cmn_err.h> +#include <sys/sunddi.h> +#ifdef _KERNEL +#include <sys/zone.h> +#endif + +/* + * Routines to manage the on-disk history log. + * + * The history log is stored as a dmu object containing + * <packed record length, record nvlist> tuples. + * + * Where "record nvlist" is a nvlist containing uint64_ts and strings, and + * "packed record length" is the packed length of the "record nvlist" stored + * as a little endian uint64_t. + * + * The log is implemented as a ring buffer, though the original creation + * of the pool ('zpool create') is never overwritten. + * + * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer + * of 'spa_history' stores the offsets for logging/retrieving history as + * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of + * where the 'zpool create' record is stored. This allows us to never + * overwrite the original creation of the pool. 'sh_phys_max_off' is the + * physical ending offset in bytes of the log. This tells you the length of + * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record + * is added, 'sh_eof' is incremented by the the size of the record. + * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). + * This is where the consumer should start reading from after reading in + * the 'zpool create' portion of the log. + * + * 'sh_records_lost' keeps track of how many records have been overwritten + * and permanently lost. + */ + +/* convert a logical offset to physical */ +static uint64_t +spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) +{ + uint64_t phys_len; + + phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; + return ((log_off - shpp->sh_pool_create_len) % phys_len + + shpp->sh_pool_create_len); +} + +void +spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) +{ + dmu_buf_t *dbp; + spa_history_phys_t *shpp; + objset_t *mos = spa->spa_meta_objset; + + ASSERT(spa->spa_history == 0); + spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, + SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, + sizeof (spa_history_phys_t), tx); + + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_HISTORY, sizeof (uint64_t), 1, + &spa->spa_history, tx) == 0); + + VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); + + shpp = dbp->db_data; + dmu_buf_will_dirty(dbp, tx); + + /* + * Figure out maximum size of history log. We set it at + * 1% of pool size, with a max of 32MB and min of 128KB. + */ + shpp->sh_phys_max_off = spa_get_dspace(spa) / 100; + shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); + shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); + + dmu_buf_rele(dbp, FTAG); +} + +/* + * Change 'sh_bof' to the beginning of the next record. + */ +static int +spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t firstread, reclen, phys_bof; + char buf[sizeof (reclen)]; + int err; + + phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); + firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); + + if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, + buf)) != 0) + return (err); + if (firstread != sizeof (reclen)) { + if ((err = dmu_read(mos, spa->spa_history, + shpp->sh_pool_create_len, sizeof (reclen) - firstread, + buf + firstread)) != 0) + return (err); + } + + reclen = LE_64(*((uint64_t *)buf)); + shpp->sh_bof += reclen + sizeof (reclen); + shpp->sh_records_lost++; + return (0); +} + +static int +spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, + dmu_tx_t *tx) +{ + uint64_t firstwrite, phys_eof; + objset_t *mos = spa->spa_meta_objset; + int err; + + ASSERT(MUTEX_HELD(&spa->spa_history_lock)); + + /* see if we need to reset logical BOF */ + while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - + (shpp->sh_eof - shpp->sh_bof) <= len) { + if ((err = spa_history_advance_bof(spa, shpp)) != 0) { + return (err); + } + } + + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); + shpp->sh_eof += len; + dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); + + len -= firstwrite; + if (len > 0) { + /* write out the rest at the beginning of physical file */ + dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, + len, (char *)buf + firstwrite, tx); + } + + return (0); +} + +static char * +spa_history_zone() +{ +#ifdef _KERNEL + return (curproc->p_zone->zone_name); +#else + return ("global"); +#endif +} + +/* + * Write out a history event. + */ +static void +spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + history_arg_t *hap = arg2; + const char *history_str = hap->ha_history_str; + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + spa_history_phys_t *shpp; + size_t reclen; + uint64_t le_len; + nvlist_t *nvrecord; + char *record_packed = NULL; + int ret; + + /* + * If we have an older pool that doesn't have a command + * history object, create it now. + */ + mutex_enter(&spa->spa_history_lock); + if (!spa->spa_history) + spa_history_create_obj(spa, tx); + mutex_exit(&spa->spa_history_lock); + + /* + * Get the offset of where we need to write via the bonus buffer. + * Update the offset when the write completes. + */ + VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + shpp = dbp->db_data; + + dmu_buf_will_dirty(dbp, tx); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, + gethrestime_sec()) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, + (uint64_t)crgetuid(cr)) == 0); + if (hap->ha_zone[0] != '\0') + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, + hap->ha_zone) == 0); +#ifdef _KERNEL + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, + utsname.nodename) == 0); +#endif + if (hap->ha_log_type == LOG_CMD_POOL_CREATE || + hap->ha_log_type == LOG_CMD_NORMAL) { + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, + history_str) == 0); + } else { + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, + hap->ha_event) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, + tx->tx_txg) == 0); + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, + history_str) == 0); + } + + VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); + record_packed = kmem_alloc(reclen, KM_SLEEP); + + VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, + NV_ENCODE_XDR, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_history_lock); + if (hap->ha_log_type == LOG_CMD_POOL_CREATE) + VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); + + /* write out the packed length as little endian */ + le_len = LE_64((uint64_t)reclen); + ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); + if (!ret) + ret = spa_history_write(spa, record_packed, reclen, shpp, tx); + + if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { + shpp->sh_pool_create_len += sizeof (le_len) + reclen; + shpp->sh_bof = shpp->sh_pool_create_len; + } + + mutex_exit(&spa->spa_history_lock); + nvlist_free(nvrecord); + kmem_free(record_packed, reclen); + dmu_buf_rele(dbp, FTAG); + + if (hap->ha_log_type == LOG_INTERNAL) { + kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN); + kmem_free(hap, sizeof (history_arg_t)); + } +} + +/* + * Write out a history event. + */ +int +spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) +{ + history_arg_t ha; + + ASSERT(what != LOG_INTERNAL); + + ha.ha_history_str = history_str; + ha.ha_log_type = what; + (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone)); + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, + spa, &ha, 0)); +} + +/* + * Read out the command history. + */ +int +spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) +{ + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + uint64_t read_len, phys_read_off, phys_eof; + uint64_t leftover = 0; + spa_history_phys_t *shpp; + int err; + + /* + * If the command history doesn't exist (older pool), + * that's ok, just return ENOENT. + */ + if (!spa->spa_history) + return (ENOENT); + + if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) + return (err); + shpp = dbp->db_data; + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + mutex_enter(&spa->spa_history_lock); + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + + if (*offp < shpp->sh_pool_create_len) { + /* read in just the zpool create history */ + phys_read_off = *offp; + read_len = MIN(*len, shpp->sh_pool_create_len - + phys_read_off); + } else { + /* + * Need to reset passed in offset to BOF if the passed in + * offset has since been overwritten. + */ + *offp = MAX(*offp, shpp->sh_bof); + phys_read_off = spa_history_log_to_phys(*offp, shpp); + + /* + * Read up to the minimum of what the user passed down or + * the EOF (physical or logical). If we hit physical EOF, + * use 'leftover' to read from the physical BOF. + */ + if (phys_read_off <= phys_eof) { + read_len = MIN(*len, phys_eof - phys_read_off); + } else { + read_len = MIN(*len, + shpp->sh_phys_max_off - phys_read_off); + if (phys_read_off + *len > shpp->sh_phys_max_off) { + leftover = MIN(*len - read_len, + phys_eof - shpp->sh_pool_create_len); + } + } + } + + /* offset for consumer to use next */ + *offp += read_len + leftover; + + /* tell the consumer how much you actually read */ + *len = read_len + leftover; + + if (read_len == 0) { + mutex_exit(&spa->spa_history_lock); + dmu_buf_rele(dbp, FTAG); + return (0); + } + + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); + if (leftover && err == 0) { + err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, + leftover, buf + read_len); + } + mutex_exit(&spa->spa_history_lock); + + dmu_buf_rele(dbp, FTAG); + return (err); +} + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +{ + history_arg_t *hap; + char *str; + va_list adx; + + /* + * If this is part of creating a pool, not everything is + * initialized yet, so don't bother logging the internal events. + */ + if (tx->tx_txg == TXG_INITIAL) + return; + + hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); + str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); + va_end(adx); + + hap->ha_log_type = LOG_INTERNAL; + hap->ha_history_str = str; + hap->ha_event = event; + hap->ha_zone[0] = '\0'; + + if (dmu_tx_is_syncing(tx)) { + spa_history_log_sync(spa, hap, cr, tx); + } else { + dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, + spa_history_log_sync, spa, hap, 0, tx); + } + /* spa_history_log_sync() will free hap and str */ +} diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c new file mode 100644 index 000000000..36046e6df --- /dev/null +++ b/module/zfs/spa_misc.c @@ -0,0 +1,1410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/fs/zfs.h> +#include <sys/metaslab_impl.h> +#include <sys/sunddi.h> +#include <sys/arc.h> +#include "zfs_prop.h" + +/* + * SPA locking + * + * There are four basic locks for managing spa_t structures: + * + * spa_namespace_lock (global mutex) + * + * This lock must be acquired to do any of the following: + * + * - Lookup a spa_t by name + * - Add or remove a spa_t from the namespace + * - Increase spa_refcount from non-zero + * - Check if spa_refcount is zero + * - Rename a spa_t + * - add/remove/attach/detach devices + * - Held for the duration of create/destroy/import/export + * + * It does not need to handle recursion. A create or destroy may + * reference objects (files or zvols) in other pools, but by + * definition they must have an existing reference, and will never need + * to lookup a spa_t by name. + * + * spa_refcount (per-spa refcount_t protected by mutex) + * + * This reference count keep track of any active users of the spa_t. The + * spa_t cannot be destroyed or freed while this is non-zero. Internally, + * the refcount is never really 'zero' - opening a pool implicitly keeps + * some references in the DMU. Internally we check against spa_minref, but + * present the image of a zero/non-zero value to consumers. + * + * spa_config_lock[] (per-spa array of rwlocks) + * + * This protects the spa_t from config changes, and must be held in + * the following circumstances: + * + * - RW_READER to perform I/O to the spa + * - RW_WRITER to change the vdev config + * + * The locking order is fairly straightforward: + * + * spa_namespace_lock -> spa_refcount + * + * The namespace lock must be acquired to increase the refcount from 0 + * or to check if it is zero. + * + * spa_refcount -> spa_config_lock[] + * + * There must be at least one valid reference on the spa_t to acquire + * the config lock. + * + * spa_namespace_lock -> spa_config_lock[] + * + * The namespace lock must always be taken before the config lock. + * + * + * The spa_namespace_lock can be acquired directly and is globally visible. + * + * The namespace is manipulated using the following functions, all of which + * require the spa_namespace_lock to be held. + * + * spa_lookup() Lookup a spa_t by name. + * + * spa_add() Create a new spa_t in the namespace. + * + * spa_remove() Remove a spa_t from the namespace. This also + * frees up any memory associated with the spa_t. + * + * spa_next() Returns the next spa_t in the system, or the + * first if NULL is passed. + * + * spa_evict_all() Shutdown and remove all spa_t structures in + * the system. + * + * spa_guid_exists() Determine whether a pool/device guid exists. + * + * The spa_refcount is manipulated using the following functions: + * + * spa_open_ref() Adds a reference to the given spa_t. Must be + * called with spa_namespace_lock held if the + * refcount is currently zero. + * + * spa_close() Remove a reference from the spa_t. This will + * not free the spa_t or remove it from the + * namespace. No locking is required. + * + * spa_refcount_zero() Returns true if the refcount is currently + * zero. Must be called with spa_namespace_lock + * held. + * + * The spa_config_lock[] is an array of rwlocks, ordered as follows: + * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. + * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). + * + * To read the configuration, it suffices to hold one of these locks as reader. + * To modify the configuration, you must hold all locks as writer. To modify + * vdev state without altering the vdev tree's topology (e.g. online/offline), + * you must hold SCL_STATE and SCL_ZIO as writer. + * + * We use these distinct config locks to avoid recursive lock entry. + * For example, spa_sync() (which holds SCL_CONFIG as reader) induces + * block allocations (SCL_ALLOC), which may require reading space maps + * from disk (dmu_read() -> zio_read() -> SCL_ZIO). + * + * The spa config locks cannot be normal rwlocks because we need the + * ability to hand off ownership. For example, SCL_ZIO is acquired + * by the issuing thread and later released by an interrupt thread. + * They do, however, obey the usual write-wanted semantics to prevent + * writer (i.e. system administrator) starvation. + * + * The lock acquisition rules are as follows: + * + * SCL_CONFIG + * Protects changes to the vdev tree topology, such as vdev + * add/remove/attach/detach. Protects the dirty config list + * (spa_config_dirty_list) and the set of spares and l2arc devices. + * + * SCL_STATE + * Protects changes to pool state and vdev state, such as vdev + * online/offline/fault/degrade/clear. Protects the dirty state list + * (spa_state_dirty_list) and global pool state (spa_state). + * + * SCL_ALLOC + * Protects changes to metaslab groups and classes. + * Held as reader by metaslab_alloc() and metaslab_claim(). + * + * SCL_ZIO + * Held by bp-level zios (those which have no io_vd upon entry) + * to prevent changes to the vdev tree. The bp-level zio implicitly + * protects all of its vdev child zios, which do not hold SCL_ZIO. + * + * SCL_FREE + * Protects changes to metaslab groups and classes. + * Held as reader by metaslab_free(). SCL_FREE is distinct from + * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free + * blocks in zio_done() while another i/o that holds either + * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. + * + * SCL_VDEV + * Held as reader to prevent changes to the vdev tree during trivial + * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the + * other locks, and lower than all of them, to ensure that it's safe + * to acquire regardless of caller context. + * + * In addition, the following rules apply: + * + * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. + * The lock ordering is SCL_CONFIG > spa_props_lock. + * + * (b) I/O operations on leaf vdevs. For any zio operation that takes + * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), + * or zio_write_phys() -- the caller must ensure that the config cannot + * cannot change in the interim, and that the vdev cannot be reopened. + * SCL_STATE as reader suffices for both. + * + * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). + * + * spa_vdev_enter() Acquire the namespace lock and the config lock + * for writing. + * + * spa_vdev_exit() Release the config lock, wait for all I/O + * to complete, sync the updated configs to the + * cache, and release the namespace lock. + * + * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). + * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual + * locking is, always, based on spa_namespace_lock and spa_config_lock[]. + * + * spa_rename() is also implemented within this file since is requires + * manipulation of the namespace. + */ + +static avl_tree_t spa_namespace_avl; +kmutex_t spa_namespace_lock; +static kcondvar_t spa_namespace_cv; +static int spa_active_count; +int spa_max_replication_override = SPA_DVAS_PER_BP; + +static kmutex_t spa_spare_lock; +static avl_tree_t spa_spare_avl; +static kmutex_t spa_l2cache_lock; +static avl_tree_t spa_l2cache_avl; + +kmem_cache_t *spa_buffer_pool; +int spa_mode; + +#ifdef ZFS_DEBUG +/* Everything except dprintf is on by default in debug builds */ +int zfs_flags = ~ZFS_DEBUG_DPRINTF; +#else +int zfs_flags = 0; +#endif + +/* + * zfs_recover can be set to nonzero to attempt to recover from + * otherwise-fatal errors, typically caused by on-disk corruption. When + * set, calls to zfs_panic_recover() will turn into warning messages. + */ +int zfs_recover = 0; + + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&scl->scl_count); + scl->scl_writer = NULL; + scl->scl_write_wanted = 0; + } +} + +static void +spa_config_lock_destroy(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_destroy(&scl->scl_lock); + cv_destroy(&scl->scl_cv); + refcount_destroy(&scl->scl_count); + ASSERT(scl->scl_writer == NULL); + ASSERT(scl->scl_write_wanted == 0); + } +} + +int +spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + if (scl->scl_writer || scl->scl_write_wanted) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks ^ (1 << i), tag); + return (0); + } + } else { + ASSERT(scl->scl_writer != curthread); + if (!refcount_is_zero(&scl->scl_count)) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks ^ (1 << i), tag); + return (0); + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + return (1); +} + +void +spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + while (scl->scl_writer || scl->scl_write_wanted) { + cv_wait(&scl->scl_cv, &scl->scl_lock); + } + } else { + ASSERT(scl->scl_writer != curthread); + while (!refcount_is_zero(&scl->scl_count)) { + scl->scl_write_wanted++; + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_write_wanted--; + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } +} + +void +spa_config_exit(spa_t *spa, int locks, void *tag) +{ + for (int i = SCL_LOCKS - 1; i >= 0; i--) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_writer == NULL || + scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + cv_broadcast(&scl->scl_cv); + } + mutex_exit(&scl->scl_lock); + } +} + +int +spa_config_held(spa_t *spa, int locks, krw_t rw) +{ + int locks_held = 0; + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || + (rw == RW_WRITER && scl->scl_writer == curthread)) + locks_held |= 1 << i; + } + + return (locks_held); +} + +/* + * ========================================================================== + * SPA namespace functions + * ========================================================================== + */ + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. + */ +spa_t * +spa_lookup(const char *name) +{ + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; + avl_index_t where; + char c; + char *cp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(name, "/@"); + if (cp) { + c = *cp; + *cp = '\0'; + } + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + spa = avl_find(&spa_namespace_avl, &search, &where); + + if (cp) + *cp = c; + + return (spa); +} + +/* + * Create an uninitialized spa_t with the given name. Requires + * spa_namespace_lock. The caller must ensure that the spa_t doesn't already + * exist by calling spa_lookup() first. + */ +spa_t * +spa_add(const char *name, const char *altroot) +{ + spa_t *spa; + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); + + mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + + cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); + spa->spa_state = POOL_STATE_UNINITIALIZED; + spa->spa_freeze_txg = UINT64_MAX; + spa->spa_final_txg = UINT64_MAX; + + refcount_create(&spa->spa_refcount); + spa_config_lock_init(spa); + + avl_add(&spa_namespace_avl, spa); + + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Set the alternate root, if there is one. + */ + if (altroot) { + spa->spa_root = spa_strdup(altroot); + spa_active_count++; + } + + /* + * Every pool starts with the default cachefile + */ + list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), + offsetof(spa_config_dirent_t, scd_link)); + + dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); + dp->scd_path = spa_strdup(spa_config_path); + list_insert_head(&spa->spa_config_list, dp); + + return (spa); +} + +/* + * Removes a spa_t from the namespace, freeing up any memory used. Requires + * spa_namespace_lock. This is called only after the spa_t has been closed and + * deactivated. + */ +void +spa_remove(spa_t *spa) +{ + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + avl_remove(&spa_namespace_avl, spa); + cv_broadcast(&spa_namespace_cv); + + if (spa->spa_root) { + spa_strfree(spa->spa_root); + spa_active_count--; + } + + while ((dp = list_head(&spa->spa_config_list)) != NULL) { + list_remove(&spa->spa_config_list, dp); + if (dp->scd_path != NULL) + spa_strfree(dp->scd_path); + kmem_free(dp, sizeof (spa_config_dirent_t)); + } + + list_destroy(&spa->spa_config_list); + + spa_config_set(spa, NULL); + + refcount_destroy(&spa->spa_refcount); + + spa_config_lock_destroy(spa); + + cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_async_root_cv); + cv_destroy(&spa->spa_scrub_io_cv); + cv_destroy(&spa->spa_suspend_cv); + + mutex_destroy(&spa->spa_async_lock); + mutex_destroy(&spa->spa_async_root_lock); + mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_errlist_lock); + mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_suspend_lock); + + kmem_free(spa, sizeof (spa_t)); +} + +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next(spa_t *prev) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + +/* + * ========================================================================== + * SPA refcount functions + * ========================================================================== + */ + +/* + * Add a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_open_ref(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || + MUTEX_HELD(&spa_namespace_lock)); + (void) refcount_add(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_close(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || + MUTEX_HELD(&spa_namespace_lock)); + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Check to see if the spa refcount is zero. Must be called with + * spa_namespace_lock held. We really compare against spa_minref, which is the + * number of references acquired when opening a pool + */ +boolean_t +spa_refcount_zero(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + return (refcount_count(&spa->spa_refcount) == spa->spa_minref); +} + +/* + * ========================================================================== + * SPA spare and l2cache tracking + * ========================================================================== + */ + +/* + * Hot spares and cache devices are tracked using the same code below, + * for 'auxiliary' devices. + */ + +typedef struct spa_aux { + uint64_t aux_guid; + uint64_t aux_pool; + avl_node_t aux_avl; + int aux_count; +} spa_aux_t; + +static int +spa_aux_compare(const void *a, const void *b) +{ + const spa_aux_t *sa = a; + const spa_aux_t *sb = b; + + if (sa->aux_guid < sb->aux_guid) + return (-1); + else if (sa->aux_guid > sb->aux_guid) + return (1); + else + return (0); +} + +void +spa_aux_add(vdev_t *vd, avl_tree_t *avl) +{ + avl_index_t where; + spa_aux_t search; + spa_aux_t *aux; + + search.aux_guid = vd->vdev_guid; + if ((aux = avl_find(avl, &search, &where)) != NULL) { + aux->aux_count++; + } else { + aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); + aux->aux_guid = vd->vdev_guid; + aux->aux_count = 1; + avl_insert(avl, aux, where); + } +} + +void +spa_aux_remove(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search; + spa_aux_t *aux; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + aux = avl_find(avl, &search, &where); + + ASSERT(aux != NULL); + + if (--aux->aux_count == 0) { + avl_remove(avl, aux); + kmem_free(aux, sizeof (spa_aux_t)); + } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { + aux->aux_pool = 0ULL; + } +} + +boolean_t +spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) +{ + spa_aux_t search, *found; + + search.aux_guid = guid; + found = avl_find(avl, &search, NULL); + + if (pool) { + if (found) + *pool = found->aux_pool; + else + *pool = 0ULL; + } + + if (refcnt) { + if (found) + *refcnt = found->aux_count; + else + *refcnt = 0; + } + + return (found != NULL); +} + +void +spa_aux_activate(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search, *found; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + found = avl_find(avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->aux_pool == 0ULL); + + found->aux_pool = spa_guid(vd->vdev_spa); +} + +/* + * Spares are tracked globally due to the following constraints: + * + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within + * another pool. + * - A spare in use in any pool can only be the source of a replacement if + * the target is a spare in the same pool. + * + * We keep track of all spares on the system through the use of a reference + * counted AVL tree. When a vdev is added as a spare, or used as a replacement + * spare, then we bump the reference count in the AVL tree. In addition, we set + * the 'vdev_isspare' member to indicate that the device is a spare (active or + * inactive). When a spare is made active (used to replace a device in the + * pool), we also keep track of which pool its been made a part of. + * + * The 'spa_spare_lock' protects the AVL tree. These functions are normally + * called under the spa_namespace lock as part of vdev reconfiguration. The + * separate spare lock exists for the status query path, which does not need to + * be completely consistent with respect to other vdev configuration changes. + */ + +static int +spa_spare_compare(const void *a, const void *b) +{ + return (spa_aux_compare(a, b)); +} + +void +spa_spare_add(vdev_t *vd) +{ + mutex_enter(&spa_spare_lock); + ASSERT(!vd->vdev_isspare); + spa_aux_add(vd, &spa_spare_avl); + vd->vdev_isspare = B_TRUE; + mutex_exit(&spa_spare_lock); +} + +void +spa_spare_remove(vdev_t *vd) +{ + mutex_enter(&spa_spare_lock); + ASSERT(vd->vdev_isspare); + spa_aux_remove(vd, &spa_spare_avl); + vd->vdev_isspare = B_FALSE; + mutex_exit(&spa_spare_lock); +} + +boolean_t +spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) +{ + boolean_t found; + + mutex_enter(&spa_spare_lock); + found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); + mutex_exit(&spa_spare_lock); + + return (found); +} + +void +spa_spare_activate(vdev_t *vd) +{ + mutex_enter(&spa_spare_lock); + ASSERT(vd->vdev_isspare); + spa_aux_activate(vd, &spa_spare_avl); + mutex_exit(&spa_spare_lock); +} + +/* + * Level 2 ARC devices are tracked globally for the same reasons as spares. + * Cache devices currently only support one pool per cache device, and so + * for these devices the aux reference count is currently unused beyond 1. + */ + +static int +spa_l2cache_compare(const void *a, const void *b) +{ + return (spa_aux_compare(a, b)); +} + +void +spa_l2cache_add(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(!vd->vdev_isl2cache); + spa_aux_add(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_TRUE; + mutex_exit(&spa_l2cache_lock); +} + +void +spa_l2cache_remove(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_remove(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_FALSE; + mutex_exit(&spa_l2cache_lock); +} + +boolean_t +spa_l2cache_exists(uint64_t guid, uint64_t *pool) +{ + boolean_t found; + + mutex_enter(&spa_l2cache_lock); + found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); + + return (found); +} + +void +spa_l2cache_activate(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_activate(vd, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); +} + +void +spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) +{ + vdev_space_update(vd, space, alloc, B_FALSE); +} + +/* + * ========================================================================== + * SPA vdev locking + * ========================================================================== + */ + +/* + * Lock the given spa_t for the purpose of adding or removing a vdev. + * Grabs the global spa_namespace_lock plus the spa config lock for writing. + * It returns the next transaction group for the spa_t. + */ +uint64_t +spa_vdev_enter(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + + return (spa_last_synced_txg(spa) + 1); +} + +/* + * Unlock the spa_t after adding or removing a vdev. Besides undoing the + * locking of spa_vdev_enter(), we also want make sure the transactions have + * synced to disk, and then update the global configuration cache with the new + * information. + */ +int +spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +{ + int config_changed = B_FALSE; + + ASSERT(txg > spa_last_synced_txg(spa)); + + spa->spa_pending_vdev = NULL; + + /* + * Reassess the DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + + /* + * If the config changed, notify the scrub thread that it must restart. + */ + if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { + dsl_pool_scrub_restart(spa->spa_dsl_pool); + config_changed = B_TRUE; + } + + spa_config_exit(spa, SCL_ALL, spa); + + /* + * Note: this txg_wait_synced() is important because it ensures + * that there won't be more than one config change per txg. + * This allows us to use the txg as the generation number. + */ + if (error == 0) + txg_wait_synced(spa->spa_dsl_pool, txg); + + if (vd != NULL) { + ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + vdev_free(vd); + } + + /* + * If the config changed, update the config cache. + */ + if (config_changed) + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + + return (error); +} + +/* + * Lock the given spa_t for the purpose of changing vdev state. + */ +void +spa_vdev_state_enter(spa_t *spa) +{ + spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); +} + +int +spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) +{ + if (vd != NULL) + vdev_state_dirty(vd->vdev_top); + + spa_config_exit(spa, SCL_STATE_ALL, spa); + + return (error); +} + +/* + * ========================================================================== + * Miscellaneous functions + * ========================================================================== + */ + +/* + * Rename a spa_t. + */ +int +spa_rename(const char *name, const char *newname) +{ + spa_t *spa; + int err; + + /* + * Lookup the spa_t and grab the config lock for writing. We need to + * actually open the pool so that we can sync out the necessary labels. + * It's OK to call spa_open() with the namespace lock held because we + * allow recursive calls for other reasons. + */ + mutex_enter(&spa_namespace_lock); + if ((err = spa_open(name, &spa, FTAG)) != 0) { + mutex_exit(&spa_namespace_lock); + return (err); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + avl_remove(&spa_namespace_avl, spa); + (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); + avl_add(&spa_namespace_avl, spa); + + /* + * Sync all labels to disk with the new names by marking the root vdev + * dirty and waiting for it to sync. It will pick up the new pool name + * during the sync. + */ + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, SCL_ALL, FTAG); + + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * Sync the updated config cache. + */ + spa_config_sync(spa, B_FALSE, B_TRUE); + + spa_close(spa, FTAG); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + + +/* + * Determine whether a pool with given pool_guid exists. If device_guid is + * non-zero, determine whether the pool exists *and* contains a device with the + * specified device_guid. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + spa_t *spa; + avl_tree_t *t = &spa_namespace_avl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + continue; + if (spa->spa_root_vdev == NULL) + continue; + if (spa_guid(spa) == pool_guid) { + if (device_guid == 0) + break; + + if (vdev_lookup_by_guid(spa->spa_root_vdev, + device_guid) != NULL) + break; + + /* + * Check any devices we may be in the process of adding. + */ + if (spa->spa_pending_vdev) { + if (vdev_lookup_by_guid(spa->spa_pending_vdev, + device_guid) != NULL) + break; + } + } + } + + return (spa != NULL); +} + +char * +spa_strdup(const char *s) +{ + size_t len; + char *new; + + len = strlen(s); + new = kmem_alloc(len + 1, KM_SLEEP); + bcopy(s, new, len); + new[len] = '\0'; + + return (new); +} + +void +spa_strfree(char *s) +{ + kmem_free(s, strlen(s) + 1); +} + +uint64_t +spa_get_random(uint64_t range) +{ + uint64_t r; + + ASSERT(range != 0); + + (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); + + return (r % range); +} + +void +sprintf_blkptr(char *buf, int len, const blkptr_t *bp) +{ + int d; + + if (bp == NULL) { + (void) snprintf(buf, len, "<NULL>"); + return; + } + + if (BP_IS_HOLE(bp)) { + (void) snprintf(buf, len, "<hole>"); + return; + } + + (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", + (u_longlong_t)BP_GET_LEVEL(bp), + dmu_ot[BP_GET_TYPE(bp)].ot_name, + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp)); + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + const dva_t *dva = &bp->blk_dva[d]; + (void) snprintf(buf + strlen(buf), len - strlen(buf), + "DVA[%d]=<%llu:%llx:%llx> ", d, + (u_longlong_t)DVA_GET_VDEV(dva), + (u_longlong_t)DVA_GET_OFFSET(dva), + (u_longlong_t)DVA_GET_ASIZE(dva)); + } + + (void) snprintf(buf + strlen(buf), len - strlen(buf), + "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, + zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", + BP_IS_GANG(bp) ? "gang" : "contiguous", + (u_longlong_t)bp->blk_birth, + (u_longlong_t)bp->blk_fill, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); +} + +void +spa_freeze(spa_t *spa) +{ + uint64_t freeze_txg = 0; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + if (spa->spa_freeze_txg == UINT64_MAX) { + freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; + spa->spa_freeze_txg = freeze_txg; + } + spa_config_exit(spa, SCL_ALL, FTAG); + if (freeze_txg != 0) + txg_wait_synced(spa_get_dsl(spa), freeze_txg); +} + +void +zfs_panic_recover(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); + va_end(adx); +} + +/* + * ========================================================================== + * Accessor functions + * ========================================================================== + */ + +boolean_t +spa_shutting_down(spa_t *spa) +{ + return (spa->spa_async_suspended); +} + +dsl_pool_t * +spa_get_dsl(spa_t *spa) +{ + return (spa->spa_dsl_pool); +} + +blkptr_t * +spa_get_rootblkptr(spa_t *spa) +{ + return (&spa->spa_ubsync.ub_rootbp); +} + +void +spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) +{ + spa->spa_uberblock.ub_rootbp = *bp; +} + +void +spa_altroot(spa_t *spa, char *buf, size_t buflen) +{ + if (spa->spa_root == NULL) + buf[0] = '\0'; + else + (void) strncpy(buf, spa->spa_root, buflen); +} + +int +spa_sync_pass(spa_t *spa) +{ + return (spa->spa_sync_pass); +} + +char * +spa_name(spa_t *spa) +{ + return (spa->spa_name); +} + +uint64_t +spa_guid(spa_t *spa) +{ + /* + * If we fail to parse the config during spa_load(), we can go through + * the error path (which posts an ereport) and end up here with no root + * vdev. We stash the original pool guid in 'spa_load_guid' to handle + * this case. + */ + if (spa->spa_root_vdev != NULL) + return (spa->spa_root_vdev->vdev_guid); + else + return (spa->spa_load_guid); +} + +uint64_t +spa_last_synced_txg(spa_t *spa) +{ + return (spa->spa_ubsync.ub_txg); +} + +uint64_t +spa_first_txg(spa_t *spa) +{ + return (spa->spa_first_txg); +} + +pool_state_t +spa_state(spa_t *spa) +{ + return (spa->spa_state); +} + +uint64_t +spa_freeze_txg(spa_t *spa) +{ + return (spa->spa_freeze_txg); +} + +/* + * Return how much space is allocated in the pool (ie. sum of all asize) + */ +uint64_t +spa_get_alloc(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_stat.vs_alloc); +} + +/* + * Return how much (raid-z inflated) space there is in the pool. + */ +uint64_t +spa_get_space(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_stat.vs_space); +} + +/* + * Return the amount of raid-z-deflated space in the pool. + */ +uint64_t +spa_get_dspace(spa_t *spa) +{ + if (spa->spa_deflate) + return (spa->spa_root_vdev->vdev_stat.vs_dspace); + else + return (spa->spa_root_vdev->vdev_stat.vs_space); +} + +/* ARGSUSED */ +uint64_t +spa_get_asize(spa_t *spa, uint64_t lsize) +{ + /* + * For now, the worst case is 512-byte RAID-Z blocks, in which + * case the space requirement is exactly 2x; so just assume that. + * Add to this the fact that we can have up to 3 DVAs per bp, and + * we have to multiply by a total of 6x. + */ + return (lsize * 6); +} + +/* + * Return the failure mode that has been set to this pool. The default + * behavior will be to block all I/Os when a complete failure occurs. + */ +uint8_t +spa_get_failmode(spa_t *spa) +{ + return (spa->spa_failmode); +} + +boolean_t +spa_suspended(spa_t *spa) +{ + return (spa->spa_suspended); +} + +uint64_t +spa_version(spa_t *spa) +{ + return (spa->spa_ubsync.ub_version); +} + +int +spa_max_replication(spa_t *spa) +{ + /* + * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to + * handle BPs with more than one DVA allocated. Set our max + * replication level accordingly. + */ + if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) + return (1); + return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); +} + +uint64_t +bp_get_dasize(spa_t *spa, const blkptr_t *bp) +{ + int sz = 0, i; + + if (!spa->spa_deflate) + return (BP_GET_ASIZE(bp)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (i = 0; i < SPA_DVAS_PER_BP; i++) { + vdev_t *vd = + vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); + if (vd) + sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> + SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; + } + spa_config_exit(spa, SCL_VDEV, FTAG); + return (sz); +} + +/* + * ========================================================================== + * Initialization and Termination + * ========================================================================== + */ + +static int +spa_name_compare(const void *a1, const void *a2) +{ + const spa_t *s1 = a1; + const spa_t *s2 = a2; + int s; + + s = strcmp(s1->spa_name, s2->spa_name); + if (s > 0) + return (1); + if (s < 0) + return (-1); + return (0); +} + +int +spa_busy(void) +{ + return (spa_active_count); +} + +void +spa_boot_init() +{ + spa_config_load(); +} + +void +spa_init(int mode) +{ + mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); + + avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), + offsetof(spa_t, spa_avl)); + + avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + spa_mode = mode; + + refcount_init(); + unique_init(); + zio_init(); + dmu_init(); + zil_init(); + vdev_cache_stat_init(); + zfs_prop_init(); + zpool_prop_init(); + spa_config_load(); + l2arc_start(); +} + +void +spa_fini(void) +{ + l2arc_stop(); + + spa_evict_all(); + + vdev_cache_stat_fini(); + zil_fini(); + dmu_fini(); + zio_fini(); + unique_fini(); + refcount_fini(); + + avl_destroy(&spa_namespace_avl); + avl_destroy(&spa_spare_avl); + avl_destroy(&spa_l2cache_avl); + + cv_destroy(&spa_namespace_cv); + mutex_destroy(&spa_namespace_lock); + mutex_destroy(&spa_spare_lock); + mutex_destroy(&spa_l2cache_lock); +} + +/* + * Return whether this pool has slogs. No locking needed. + * It's not a problem if the wrong answer is returned as it's only for + * performance and not correctness + */ +boolean_t +spa_has_slogs(spa_t *spa) +{ + return (spa->spa_log_class->mc_rotor != NULL); +} + +/* + * Return whether this pool is the root pool. + */ +boolean_t +spa_is_root(spa_t *spa) +{ + return (spa->spa_is_root); +} diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c new file mode 100644 index 000000000..0a1fd59ea --- /dev/null +++ b/module/zfs/space_map.c @@ -0,0 +1,506 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zio.h> +#include <sys/space_map.h> + +/* + * Space map routines. + * NOTE: caller is responsible for all locking. + */ +static int +space_map_seg_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + + if (s1->ss_start < s2->ss_start) { + if (s1->ss_end > s2->ss_start) + return (0); + return (-1); + } + if (s1->ss_start > s2->ss_start) { + if (s1->ss_start < s2->ss_end) + return (0); + return (1); + } + return (0); +} + +void +space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, + kmutex_t *lp) +{ + bzero(sm, sizeof (*sm)); + + avl_create(&sm->sm_root, space_map_seg_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); + + sm->sm_start = start; + sm->sm_size = size; + sm->sm_shift = shift; + sm->sm_lock = lp; +} + +void +space_map_destroy(space_map_t *sm) +{ + ASSERT(!sm->sm_loaded && !sm->sm_loading); + VERIFY3U(sm->sm_space, ==, 0); + avl_destroy(&sm->sm_root); +} + +void +space_map_add(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss_before, *ss_after, *ss; + uint64_t end = start + size; + int merge_before, merge_after; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY3U(start, >=, sm->sm_start); + VERIFY3U(end, <=, sm->sm_start + sm->sm_size); + VERIFY(sm->sm_space + size <= sm->sm_size); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { + zfs_panic_recover("zfs: allocating allocated segment" + "(offset=%llu size=%llu)\n", + (longlong_t)start, (longlong_t)size); + return; + } + + /* Make sure we don't overlap with either of our neighbors */ + VERIFY(ss == NULL); + + ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); + ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); + + merge_before = (ss_before != NULL && ss_before->ss_end == start); + merge_after = (ss_after != NULL && ss_after->ss_start == end); + + if (merge_before && merge_after) { + avl_remove(&sm->sm_root, ss_before); + ss_after->ss_start = ss_before->ss_start; + kmem_free(ss_before, sizeof (*ss_before)); + } else if (merge_before) { + ss_before->ss_end = end; + } else if (merge_after) { + ss_after->ss_start = start; + } else { + ss = kmem_alloc(sizeof (*ss), KM_SLEEP); + ss->ss_start = start; + ss->ss_end = end; + avl_insert(&sm->sm_root, ss, where); + } + + sm->sm_space += size; +} + +void +space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss, *newseg; + uint64_t end = start + size; + int left_over, right_over; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + /* Make sure we completely overlap with someone */ + if (ss == NULL) { + zfs_panic_recover("zfs: freeing free segment " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size); + return; + } + VERIFY3U(ss->ss_start, <=, start); + VERIFY3U(ss->ss_end, >=, end); + VERIFY(sm->sm_space - size <= sm->sm_size); + + left_over = (ss->ss_start != start); + right_over = (ss->ss_end != end); + + if (left_over && right_over) { + newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); + newseg->ss_start = end; + newseg->ss_end = ss->ss_end; + ss->ss_end = start; + avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + } else if (left_over) { + ss->ss_end = start; + } else if (right_over) { + ss->ss_start = end; + } else { + avl_remove(&sm->sm_root, ss); + kmem_free(ss, sizeof (*ss)); + } + + sm->sm_space -= size; +} + +int +space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss; + uint64_t end = start + size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); +} + +void +space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +{ + space_seg_t *ss; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + if (func != NULL) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); + kmem_free(ss, sizeof (*ss)); + } + sm->sm_space = 0; +} + +void +space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +{ + space_seg_t *ss; + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); +} + +void +space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, search; + uint64_t end = start + size; + uint64_t rm_start, rm_end; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + search.ss_start = start; + search.ss_end = start; + + for (;;) { + ss = avl_find(t, &search, &where); + + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + + if (ss == NULL || ss->ss_start >= end) + break; + + rm_start = MAX(ss->ss_start, start); + rm_end = MIN(ss->ss_end, end); + + space_map_remove(sm, rm_start, rm_end - rm_start); + } +} + +/* + * Replace smd with the union of smd and sms. + */ +void +space_map_union(space_map_t *smd, space_map_t *sms) +{ + avl_tree_t *t = &sms->sm_root; + space_seg_t *ss; + + ASSERT(MUTEX_HELD(smd->sm_lock)); + + /* + * For each source segment, remove any intersections with the + * destination, then add the source segment to the destination. + */ + for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { + space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); + space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); + } +} + +/* + * Wait for any in-progress space_map_load() to complete. + */ +void +space_map_load_wait(space_map_t *sm) +{ + ASSERT(MUTEX_HELD(sm->sm_lock)); + + while (sm->sm_loading) + cv_wait(&sm->sm_load_cv, sm->sm_lock); +} + +/* + * Note: space_map_load() will drop sm_lock across dmu_read() calls. + * The caller must be OK with this. + */ +int +space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, + space_map_obj_t *smo, objset_t *os) +{ + uint64_t *entry, *entry_map, *entry_map_end; + uint64_t bufsize, size, offset, end, space; + uint64_t mapstart = sm->sm_start; + int error = 0; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + space_map_load_wait(sm); + + if (sm->sm_loaded) + return (0); + + sm->sm_loading = B_TRUE; + end = smo->smo_objsize; + space = smo->smo_alloc; + + ASSERT(sm->sm_ops == NULL); + VERIFY3U(sm->sm_space, ==, 0); + + if (maptype == SM_FREE) { + space_map_add(sm, sm->sm_start, sm->sm_size); + space = sm->sm_size - space; + } + + bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; + entry_map = zio_buf_alloc(bufsize); + + mutex_exit(sm->sm_lock); + if (end > bufsize) + dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); + mutex_enter(sm->sm_lock); + + for (offset = 0; offset < end; offset += bufsize) { + size = MIN(end - offset, bufsize); + VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); + VERIFY(size != 0); + + dprintf("object=%llu offset=%llx size=%llx\n", + smo->smo_object, offset, size); + + mutex_exit(sm->sm_lock); + error = dmu_read(os, smo->smo_object, offset, size, entry_map); + mutex_enter(sm->sm_lock); + if (error != 0) + break; + + entry_map_end = entry_map + (size / sizeof (uint64_t)); + for (entry = entry_map; entry < entry_map_end; entry++) { + uint64_t e = *entry; + + if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ + continue; + + (SM_TYPE_DECODE(e) == maptype ? + space_map_add : space_map_remove)(sm, + (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, + SM_RUN_DECODE(e) << sm->sm_shift); + } + } + + if (error == 0) { + VERIFY3U(sm->sm_space, ==, space); + + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; + if (ops != NULL) + ops->smop_load(sm); + } else { + space_map_vacate(sm, NULL, NULL); + } + + zio_buf_free(entry_map, bufsize); + + sm->sm_loading = B_FALSE; + + cv_broadcast(&sm->sm_load_cv); + + return (error); +} + +void +space_map_unload(space_map_t *sm) +{ + ASSERT(MUTEX_HELD(sm->sm_lock)); + + if (sm->sm_loaded && sm->sm_ops != NULL) + sm->sm_ops->smop_unload(sm); + + sm->sm_loaded = B_FALSE; + sm->sm_ops = NULL; + + space_map_vacate(sm, NULL, NULL); +} + +uint64_t +space_map_alloc(space_map_t *sm, uint64_t size) +{ + uint64_t start; + + start = sm->sm_ops->smop_alloc(sm, size); + if (start != -1ULL) + space_map_remove(sm, start, size); + return (start); +} + +void +space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + sm->sm_ops->smop_claim(sm, start, size); + space_map_remove(sm, start, size); +} + +void +space_map_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + space_map_add(sm, start, size); + sm->sm_ops->smop_free(sm, start, size); +} + +/* + * Note: space_map_sync() will drop sm_lock across dmu_write() calls. + */ +void +space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + void *cookie = NULL; + space_seg_t *ss; + uint64_t bufsize, start, size, run_len; + uint64_t *entry, *entry_map, *entry_map_end; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + if (sm->sm_space == 0) + return; + + dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", + smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), + maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root), + sm->sm_space); + + if (maptype == SM_ALLOC) + smo->smo_alloc += sm->sm_space; + else + smo->smo_alloc -= sm->sm_space; + + bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); + bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); + entry_map = zio_buf_alloc(bufsize); + entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); + entry = entry_map; + + *entry++ = SM_DEBUG_ENCODE(1) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + size = ss->ss_end - ss->ss_start; + start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; + + sm->sm_space -= size; + size >>= sm->sm_shift; + + while (size) { + run_len = MIN(size, SM_RUN_MAX); + + if (entry == entry_map_end) { + mutex_exit(sm->sm_lock); + dmu_write(os, smo->smo_object, smo->smo_objsize, + bufsize, entry_map, tx); + mutex_enter(sm->sm_lock); + smo->smo_objsize += bufsize; + entry = entry_map; + } + + *entry++ = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + + start += run_len; + size -= run_len; + } + kmem_free(ss, sizeof (*ss)); + } + + if (entry != entry_map) { + size = (entry - entry_map) * sizeof (uint64_t); + mutex_exit(sm->sm_lock); + dmu_write(os, smo->smo_object, smo->smo_objsize, + size, entry_map, tx); + mutex_enter(sm->sm_lock); + smo->smo_objsize += size; + } + + zio_buf_free(entry_map, bufsize); + + VERIFY3U(sm->sm_space, ==, 0); +} + +void +space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) +{ + VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); + + smo->smo_objsize = 0; + smo->smo_alloc = 0; +} diff --git a/module/zfs/txg.c b/module/zfs/txg.c new file mode 100644 index 000000000..2bbf2f086 --- /dev/null +++ b/module/zfs/txg.c @@ -0,0 +1,627 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/txg_impl.h> +#include <sys/dmu_impl.h> +#include <sys/dsl_pool.h> +#include <sys/callb.h> + +/* + * Pool-wide transaction groups. + */ + +static void txg_sync_thread(dsl_pool_t *dp); +static void txg_quiesce_thread(dsl_pool_t *dp); + +int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ + +/* + * Prepare the txg subsystem. + */ +void +txg_init(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int c; + bzero(tx, sizeof (tx_state_t)); + + tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + + for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); + for (i = 0; i < TXG_SIZE; i++) { + cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, + NULL); + } + } + + rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); + mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + + tx->tx_open_txg = txg; +} + +/* + * Close down the txg subsystem. + */ +void +txg_fini(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + int c; + + ASSERT(tx->tx_threads == 0); + + rw_destroy(&tx->tx_suspend); + mutex_destroy(&tx->tx_sync_lock); + + for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_destroy(&tx->tx_cpu[c].tc_lock); + for (i = 0; i < TXG_SIZE; i++) + cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + } + + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); + + bzero(tx, sizeof (tx_state_t)); +} + +/* + * Start syncing transaction groups. + */ +void +txg_sync_start(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + + dprintf("pool %p\n", dp); + + ASSERT(tx->tx_threads == 0); + + tx->tx_threads = 2; + + tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + /* + * The sync thread can need a larger-than-default stack size on + * 32-bit x86. This is due in part to nested pools and + * scrub_visitbp() recursion. + */ + tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) +{ + CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); + mutex_enter(&tx->tx_sync_lock); +} + +static void +txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) +{ + ASSERT(*tpp != NULL); + *tpp = NULL; + tx->tx_threads--; + cv_broadcast(&tx->tx_exit_cv); + CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ + thread_exit(); +} + +static void +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) +{ + CALLB_CPR_SAFE_BEGIN(cpr); + + if (time) + (void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + time); + else + cv_wait(cv, &tx->tx_sync_lock); + + CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); +} + +/* + * Stop syncing transaction groups. + */ +void +txg_sync_stop(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + dprintf("pool %p\n", dp); + /* + * Finish off any work in progress. + */ + ASSERT(tx->tx_threads == 2); + txg_wait_synced(dp, 0); + + /* + * Wake all sync threads and wait for them to die. + */ + mutex_enter(&tx->tx_sync_lock); + + ASSERT(tx->tx_threads == 2); + + tx->tx_exiting = 1; + + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + cv_broadcast(&tx->tx_sync_more_cv); + + while (tx->tx_threads != 0) + cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); + + tx->tx_exiting = 0; + + mutex_exit(&tx->tx_sync_lock); +} + +uint64_t +txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) +{ + tx_state_t *tx = &dp->dp_tx; + tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; + uint64_t txg; + + mutex_enter(&tc->tc_lock); + + txg = tx->tx_open_txg; + tc->tc_count[txg & TXG_MASK]++; + + th->th_cpu = tc; + th->th_txg = txg; + + return (txg); +} + +void +txg_rele_to_quiesce(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + + mutex_exit(&tc->tc_lock); +} + +void +txg_rele_to_sync(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + ASSERT(tc->tc_count[g] != 0); + if (--tc->tc_count[g] == 0) + cv_broadcast(&tc->tc_cv[g]); + mutex_exit(&tc->tc_lock); + + th->th_cpu = NULL; /* defensive */ +} + +static void +txg_quiesce(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int g = txg & TXG_MASK; + int c; + + /* + * Grab all tx_cpu locks so nobody else can get into this txg. + */ + for (c = 0; c < max_ncpus; c++) + mutex_enter(&tx->tx_cpu[c].tc_lock); + + ASSERT(txg == tx->tx_open_txg); + tx->tx_open_txg++; + + /* + * Now that we've incremented tx_open_txg, we can let threads + * enter the next transaction group. + */ + for (c = 0; c < max_ncpus; c++) + mutex_exit(&tx->tx_cpu[c].tc_lock); + + /* + * Quiesce the transaction group by waiting for everyone to txg_exit(). + */ + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + mutex_enter(&tc->tc_lock); + while (tc->tc_count[g] != 0) + cv_wait(&tc->tc_cv[g], &tc->tc_lock); + mutex_exit(&tc->tc_lock); + } +} + +static void +txg_sync_thread(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + uint64_t start, delta; + + txg_thread_enter(tx, &cpr); + + start = delta = 0; + for (;;) { + uint64_t timer, timeout = zfs_txg_timeout * hz; + uint64_t txg; + + /* + * We sync when we're scrubbing, there's someone waiting + * on us, or the quiesce thread has handed off a txg to + * us, or we have reached our timeout. + */ + timer = (delta >= timeout ? 0 : timeout - delta); + while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || + spa_shutting_down(dp->dp_spa)) && + !tx->tx_exiting && timer > 0 && + tx->tx_synced_txg >= tx->tx_sync_txg_waiting && + tx->tx_quiesced_txg == 0) { + dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); + delta = lbolt - start; + timer = (delta > timeout ? 0 : timeout - delta); + } + + /* + * Wait until the quiesce thread hands off a txg to us, + * prompting it to do so if necessary. + */ + while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + + rw_enter(&tx->tx_suspend, RW_WRITER); + + /* + * Consume the quiesced txg which has been handed off to + * us. This may cause the quiescing thread to now be + * able to quiesce another txg, so we must signal it. + */ + txg = tx->tx_quiesced_txg; + tx->tx_quiesced_txg = 0; + tx->tx_syncing_txg = txg; + cv_broadcast(&tx->tx_quiesce_more_cv); + rw_exit(&tx->tx_suspend); + + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + + start = lbolt; + spa_sync(dp->dp_spa, txg); + delta = lbolt - start; + + mutex_enter(&tx->tx_sync_lock); + rw_enter(&tx->tx_suspend, RW_WRITER); + tx->tx_synced_txg = txg; + tx->tx_syncing_txg = 0; + rw_exit(&tx->tx_suspend); + cv_broadcast(&tx->tx_sync_done_cv); + } +} + +static void +txg_quiesce_thread(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We quiesce when there's someone waiting on us. + * However, we can only have one txg in "quiescing" or + * "quiesced, waiting to sync" state. So we wait until + * the "quiesced, waiting to sync" txg has been consumed + * by the sync thread. + */ + while (!tx->tx_exiting && + (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || + tx->tx_quiesced_txg != 0)) + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); + + txg = tx->tx_open_txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + txg_quiesce(dp, txg); + mutex_enter(&tx->tx_sync_lock); + + /* + * Hand this txg off to the sync thread. + */ + dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiesced_txg = txg; + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + } +} + +/* + * Delay this thread by 'ticks' if we are still in the open transaction + * group and there is already a waiting txg quiesing or quiesced. Abort + * the delay if this txg stalls or enters the quiesing state. + */ +void +txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) +{ + tx_state_t *tx = &dp->dp_tx; + int timeout = lbolt + ticks; + + /* don't delay if this txg could transition to quiesing immediately */ + if (tx->tx_open_txg > txg || + tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) + return; + + mutex_enter(&tx->tx_sync_lock); + if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { + mutex_exit(&tx->tx_sync_lock); + return; + } + + while (lbolt < timeout && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) + (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, + timeout); + + mutex_exit(&tx->tx_sync_lock); +} + +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + ASSERT(tx->tx_threads == 2); + if (txg == 0) + txg = tx->tx_open_txg; + if (tx->tx_sync_txg_waiting < txg) + tx->tx_sync_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_synced_txg < txg) { + dprintf("broadcasting sync more " + "tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + cv_broadcast(&tx->tx_sync_more_cv); + cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +void +txg_wait_open(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + ASSERT(tx->tx_threads == 2); + if (txg == 0) + txg = tx->tx_open_txg + 1; + if (tx->tx_quiesce_txg_waiting < txg) + tx->tx_quiesce_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_open_txg < txg) { + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +boolean_t +txg_stalled(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); +} + +boolean_t +txg_sync_waiting(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || + tx->tx_quiesced_txg != 0); +} + +void +txg_suspend(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + /* XXX some code paths suspend when they are already suspended! */ + rw_enter(&tx->tx_suspend, RW_READER); +} + +void +txg_resume(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + rw_exit(&tx->tx_suspend); +} + +/* + * Per-txg object lists. + */ +void +txg_list_create(txg_list_t *tl, size_t offset) +{ + int t; + + mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); + + tl->tl_offset = offset; + + for (t = 0; t < TXG_SIZE; t++) + tl->tl_head[t] = NULL; +} + +void +txg_list_destroy(txg_list_t *tl) +{ + int t; + + for (t = 0; t < TXG_SIZE; t++) + ASSERT(txg_list_empty(tl, t)); + + mutex_destroy(&tl->tl_lock); +} + +int +txg_list_empty(txg_list_t *tl, uint64_t txg) +{ + return (tl->tl_head[txg & TXG_MASK] == NULL); +} + +/* + * Add an entry to the list. + * Returns 0 if it's a new entry, 1 if it's already there. + */ +int +txg_list_add(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + int already_on_list; + + mutex_enter(&tl->tl_lock); + already_on_list = tn->tn_member[t]; + if (!already_on_list) { + tn->tn_member[t] = 1; + tn->tn_next[t] = tl->tl_head[t]; + tl->tl_head[t] = tn; + } + mutex_exit(&tl->tl_lock); + + return (already_on_list); +} + +/* + * Remove the head of the list and return it. + */ +void * +txg_list_remove(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn; + void *p = NULL; + + mutex_enter(&tl->tl_lock); + if ((tn = tl->tl_head[t]) != NULL) { + p = (char *)tn - tl->tl_offset; + tl->tl_head[t] = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + } + mutex_exit(&tl->tl_lock); + + return (p); +} + +/* + * Remove a specific item from the list and return it. + */ +void * +txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn, **tp; + + mutex_enter(&tl->tl_lock); + + for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { + if ((char *)tn - tl->tl_offset == p) { + *tp = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + mutex_exit(&tl->tl_lock); + return (p); + } + } + + mutex_exit(&tl->tl_lock); + + return (NULL); +} + +int +txg_list_member(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + return (tn->tn_member[t]); +} + +/* + * Walk a txg list -- only safe if you know it's not changing. + */ +void * +txg_list_head(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = tl->tl_head[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} + +void * +txg_list_next(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + tn = tn->tn_next[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c new file mode 100644 index 000000000..34d7e0c3a --- /dev/null +++ b/module/zfs/uberblock.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/uberblock_impl.h> +#include <sys/vdev_impl.h> + +int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) + byteswap_uint64_array(ub, sizeof (uberblock_t)); + + if (ub->ub_magic != UBERBLOCK_MAGIC) + return (EINVAL); + + return (0); +} + +/* + * Update the uberblock and return a boolean value indicating whether + * anything changed in this transaction group. + */ +int +uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) +{ + ASSERT(ub->ub_txg < txg); + + /* + * We explicitly do not set ub_version here, so that older versions + * continue to be written with the previous uberblock version. + */ + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_txg = txg; + ub->ub_guid_sum = rvd->vdev_guid_sum; + ub->ub_timestamp = gethrestime_sec(); + + return (ub->ub_rootbp.blk_birth == txg); +} diff --git a/module/zfs/unique.c b/module/zfs/unique.c new file mode 100644 index 000000000..fbe7b619a --- /dev/null +++ b/module/zfs/unique.c @@ -0,0 +1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/unique.h> + +static avl_tree_t unique_avl; +static kmutex_t unique_mtx; + +typedef struct unique { + avl_node_t un_link; + uint64_t un_value; +} unique_t; + +#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) + +static int +unique_compare(const void *a, const void *b) +{ + const unique_t *una = a; + const unique_t *unb = b; + + if (una->un_value < unb->un_value) + return (-1); + if (una->un_value > unb->un_value) + return (+1); + return (0); +} + +void +unique_init(void) +{ + avl_create(&unique_avl, unique_compare, + sizeof (unique_t), offsetof(unique_t, un_link)); + mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); +} + +void +unique_fini(void) +{ + avl_destroy(&unique_avl); + mutex_destroy(&unique_mtx); +} + +uint64_t +unique_create(void) +{ + uint64_t value = unique_insert(0); + unique_remove(value); + return (value); +} + +uint64_t +unique_insert(uint64_t value) +{ + avl_index_t idx; + unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); + + un->un_value = value; + + mutex_enter(&unique_mtx); + while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || + avl_find(&unique_avl, un, &idx)) { + mutex_exit(&unique_mtx); + (void) random_get_pseudo_bytes((void*)&un->un_value, + sizeof (un->un_value)); + un->un_value &= UNIQUE_MASK; + mutex_enter(&unique_mtx); + } + + avl_insert(&unique_avl, un, idx); + mutex_exit(&unique_mtx); + + return (un->un_value); +} + +void +unique_remove(uint64_t value) +{ + unique_t un_tofind; + unique_t *un; + + un_tofind.un_value = value; + mutex_enter(&unique_mtx); + un = avl_find(&unique_avl, &un_tofind, NULL); + if (un != NULL) { + avl_remove(&unique_avl, un); + kmem_free(un, sizeof (unique_t)); + } + mutex_exit(&unique_mtx); +} diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c new file mode 100644 index 000000000..16a27e514 --- /dev/null +++ b/module/zfs/vdev.c @@ -0,0 +1,2425 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/space_map.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/arc.h> + +/* + * Virtual device management. + */ + +static vdev_ops_t *vdev_ops_table[] = { + &vdev_root_ops, + &vdev_raidz_ops, + &vdev_mirror_ops, + &vdev_replacing_ops, + &vdev_spare_ops, + &vdev_disk_ops, + &vdev_file_ops, + &vdev_missing_ops, + NULL +}; + +/* maximum scrub/resilver I/O queue per leaf vdev */ +int zfs_scrub_limit = 10; + +/* + * Given a vdev type, return the appropriate ops vector. + */ +static vdev_ops_t * +vdev_getops(const char *type) +{ + vdev_ops_t *ops, **opspp; + + for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) + if (strcmp(ops->vdev_op_type, type) == 0) + break; + + return (ops); +} + +/* + * Default asize function: return the MAX of psize with the asize of + * all children. This is what's used by anything other than RAID-Z. + */ +uint64_t +vdev_default_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); + uint64_t csize; + uint64_t c; + + for (c = 0; c < vd->vdev_children; c++) { + csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + asize = MAX(asize, csize); + } + + return (asize); +} + +/* + * Get the replaceable or attachable device size. + * If the parent is a mirror or raidz, the replaceable size is the minimum + * psize of all its children. For the rest, just return our own psize. + * + * e.g. + * psize rsize + * root - - + * mirror/raidz - - + * disk1 20g 20g + * disk2 40g 20g + * disk3 80g 80g + */ +uint64_t +vdev_get_rsize(vdev_t *vd) +{ + vdev_t *pvd, *cvd; + uint64_t c, rsize; + + pvd = vd->vdev_parent; + + /* + * If our parent is NULL or the root, just return our own psize. + */ + if (pvd == NULL || pvd->vdev_parent == NULL) + return (vd->vdev_psize); + + rsize = 0; + + for (c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; + } + + return (rsize); +} + +vdev_t * +vdev_lookup_top(spa_t *spa, uint64_t vdev) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (vdev < rvd->vdev_children) { + ASSERT(rvd->vdev_child[vdev] != NULL); + return (rvd->vdev_child[vdev]); + } + + return (NULL); +} + +vdev_t * +vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) +{ + int c; + vdev_t *mvd; + + if (vd->vdev_guid == guid) + return (vd); + + for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != + NULL) + return (mvd); + + return (NULL); +} + +void +vdev_add_child(vdev_t *pvd, vdev_t *cvd) +{ + size_t oldsize, newsize; + uint64_t id = cvd->vdev_id; + vdev_t **newchild; + + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(cvd->vdev_parent == NULL); + + cvd->vdev_parent = pvd; + + if (pvd == NULL) + return; + + ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); + + oldsize = pvd->vdev_children * sizeof (vdev_t *); + pvd->vdev_children = MAX(pvd->vdev_children, id + 1); + newsize = pvd->vdev_children * sizeof (vdev_t *); + + newchild = kmem_zalloc(newsize, KM_SLEEP); + if (pvd->vdev_child != NULL) { + bcopy(pvd->vdev_child, newchild, oldsize); + kmem_free(pvd->vdev_child, oldsize); + } + + pvd->vdev_child = newchild; + pvd->vdev_child[id] = cvd; + + cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); + ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += cvd->vdev_guid_sum; + + if (cvd->vdev_ops->vdev_op_leaf) + cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; +} + +void +vdev_remove_child(vdev_t *pvd, vdev_t *cvd) +{ + int c; + uint_t id = cvd->vdev_id; + + ASSERT(cvd->vdev_parent == pvd); + + if (pvd == NULL) + return; + + ASSERT(id < pvd->vdev_children); + ASSERT(pvd->vdev_child[id] == cvd); + + pvd->vdev_child[id] = NULL; + cvd->vdev_parent = NULL; + + for (c = 0; c < pvd->vdev_children; c++) + if (pvd->vdev_child[c]) + break; + + if (c == pvd->vdev_children) { + kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); + pvd->vdev_child = NULL; + pvd->vdev_children = 0; + } + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum -= cvd->vdev_guid_sum; + + if (cvd->vdev_ops->vdev_op_leaf) + cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; +} + +/* + * Remove any holes in the child array. + */ +void +vdev_compact_children(vdev_t *pvd) +{ + vdev_t **newchild, *cvd; + int oldc = pvd->vdev_children; + int newc, c; + + ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + for (c = newc = 0; c < oldc; c++) + if (pvd->vdev_child[c]) + newc++; + + newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); + + for (c = newc = 0; c < oldc; c++) { + if ((cvd = pvd->vdev_child[c]) != NULL) { + newchild[newc] = cvd; + cvd->vdev_id = newc++; + } + } + + kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); + pvd->vdev_child = newchild; + pvd->vdev_children = newc; +} + +/* + * Allocate and minimally initialize a vdev_t. + */ +static vdev_t * +vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) +{ + vdev_t *vd; + + vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + + if (spa->spa_root_vdev == NULL) { + ASSERT(ops == &vdev_root_ops); + spa->spa_root_vdev = vd; + } + + if (guid == 0) { + if (spa->spa_root_vdev == vd) { + /* + * The root vdev's guid will also be the pool guid, + * which must be unique among all pools. + */ + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); + } else { + /* + * Any other vdev's guid must be unique within the pool. + */ + while (guid == 0 || + spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } + ASSERT(!spa_guid_exists(spa_guid(spa), guid)); + } + + vd->vdev_spa = spa; + vd->vdev_id = id; + vd->vdev_guid = guid; + vd->vdev_guid_sum = guid; + vd->vdev_ops = ops; + vd->vdev_state = VDEV_STATE_CLOSED; + + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); + space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + txg_list_create(&vd->vdev_ms_list, + offsetof(struct metaslab, ms_txg_node)); + txg_list_create(&vd->vdev_dtl_list, + offsetof(struct vdev, vdev_dtl_node)); + vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); + + return (vd); +} + +/* + * Allocate a new vdev. The 'alloctype' is used to control whether we are + * creating a new vdev or loading an existing one - the behavior is slightly + * different for each case. + */ +int +vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, + int alloctype) +{ + vdev_ops_t *ops; + char *type; + uint64_t guid = 0, islog, nparity; + vdev_t *vd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (EINVAL); + + if ((ops = vdev_getops(type)) == NULL) + return (EINVAL); + + /* + * If this is a load, get the vdev guid from the nvlist. + * Otherwise, vdev_alloc_common() will generate one for us. + */ + if (alloctype == VDEV_ALLOC_LOAD) { + uint64_t label_id; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || + label_id != id) + return (EINVAL); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); + } else if (alloctype == VDEV_ALLOC_SPARE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); + } else if (alloctype == VDEV_ALLOC_L2CACHE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); + } + + /* + * The first allocated vdev must be of type 'root'. + */ + if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) + return (EINVAL); + + /* + * Determine whether we're a log vdev. + */ + islog = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); + if (islog && spa_version(spa) < SPA_VERSION_SLOGS) + return (ENOTSUP); + + /* + * Set the nparity property for RAID-Z vdevs. + */ + nparity = -1ULL; + if (ops == &vdev_raidz_ops) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + /* + * Currently, we can only support 2 parity devices. + */ + if (nparity == 0 || nparity > 2) + return (EINVAL); + /* + * Older versions can only support 1 parity device. + */ + if (nparity == 2 && + spa_version(spa) < SPA_VERSION_RAID6) + return (ENOTSUP); + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= SPA_VERSION_RAID6) + return (EINVAL); + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + nparity = 1; + } + } else { + nparity = 0; + } + ASSERT(nparity != -1ULL); + + vd = vdev_alloc_common(spa, id, guid, ops); + + vd->vdev_islog = islog; + vd->vdev_nparity = nparity; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + + /* + * Set the whole_disk property. If it's not specified, leave the value + * as -1. + */ + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &vd->vdev_wholedisk) != 0) + vd->vdev_wholedisk = -1ULL; + + /* + * Look for the 'not present' flag. This will only be set if the device + * was not present at the time of import. + */ + if (!spa->spa_import_faulted) + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); + + /* + * Get the alignment requirement. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + + /* + * If we're a top-level vdev, try to load the allocation parameters. + */ + if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + &vd->vdev_ms_array); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + &vd->vdev_ms_shift); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, + &vd->vdev_asize); + } + + /* + * If we're a leaf vdev, try to load the DTL object and other state. + */ + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + if (alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl.smo_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + } + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, + &vd->vdev_offline); + + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } + } + + /* + * Add ourselves to the parent's list of children. + */ + vdev_add_child(parent, vd); + + *vdp = vd; + + return (0); +} + +void +vdev_free(vdev_t *vd) +{ + int c; + spa_t *spa = vd->vdev_spa; + + /* + * vdev_free() implies closing the vdev first. This is simpler than + * trying to ensure complicated semantics for all callers. + */ + vdev_close(vd); + + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + + /* + * Free all children. + */ + for (c = 0; c < vd->vdev_children; c++) + vdev_free(vd->vdev_child[c]); + + ASSERT(vd->vdev_child == NULL); + ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + + /* + * Discard allocation state. + */ + if (vd == vd->vdev_top) + vdev_metaslab_fini(vd); + + ASSERT3U(vd->vdev_stat.vs_space, ==, 0); + ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); + ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); + + /* + * Remove this vdev from its parent's child list. + */ + vdev_remove_child(vd->vdev_parent, vd); + + ASSERT(vd->vdev_parent == NULL); + + /* + * Clean up vdev structure. + */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + mutex_destroy(&vd->vdev_probe_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); +} + +/* + * Transfer top-level vdev state from svd to tvd. + */ +static void +vdev_top_transfer(vdev_t *svd, vdev_t *tvd) +{ + spa_t *spa = svd->vdev_spa; + metaslab_t *msp; + vdev_t *vd; + int t; + + ASSERT(tvd == tvd->vdev_top); + + tvd->vdev_ms_array = svd->vdev_ms_array; + tvd->vdev_ms_shift = svd->vdev_ms_shift; + tvd->vdev_ms_count = svd->vdev_ms_count; + + svd->vdev_ms_array = 0; + svd->vdev_ms_shift = 0; + svd->vdev_ms_count = 0; + + tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_ms = svd->vdev_ms; + + svd->vdev_mg = NULL; + svd->vdev_ms = NULL; + + if (tvd->vdev_mg != NULL) + tvd->vdev_mg->mg_vd = tvd; + + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; + tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; + tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; + + svd->vdev_stat.vs_alloc = 0; + svd->vdev_stat.vs_space = 0; + svd->vdev_stat.vs_dspace = 0; + + for (t = 0; t < TXG_SIZE; t++) { + while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_ms_list, msp, t); + while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); + if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) + (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); + } + + if (list_link_active(&svd->vdev_config_dirty_node)) { + vdev_config_clean(svd); + vdev_config_dirty(tvd); + } + + if (list_link_active(&svd->vdev_state_dirty_node)) { + vdev_state_clean(svd); + vdev_state_dirty(tvd); + } + + tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; + svd->vdev_deflate_ratio = 0; + + tvd->vdev_islog = svd->vdev_islog; + svd->vdev_islog = 0; +} + +static void +vdev_top_update(vdev_t *tvd, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + vd->vdev_top = tvd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_top_update(tvd, vd->vdev_child[c]); +} + +/* + * Add a mirror/replacing vdev above an existing vdev. + */ +vdev_t * +vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) +{ + spa_t *spa = cvd->vdev_spa; + vdev_t *pvd = cvd->vdev_parent; + vdev_t *mvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); + + mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_state = cvd->vdev_state; + + vdev_remove_child(pvd, cvd); + vdev_add_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_children; + vdev_add_child(mvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (mvd == mvd->vdev_top) + vdev_top_transfer(cvd, mvd); + + return (mvd); +} + +/* + * Remove a 1-way mirror/replacing vdev from the tree. + */ +void +vdev_remove_parent(vdev_t *cvd) +{ + vdev_t *mvd = cvd->vdev_parent; + vdev_t *pvd = mvd->vdev_parent; + + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + ASSERT(mvd->vdev_children == 1); + ASSERT(mvd->vdev_ops == &vdev_mirror_ops || + mvd->vdev_ops == &vdev_replacing_ops || + mvd->vdev_ops == &vdev_spare_ops); + cvd->vdev_ashift = mvd->vdev_ashift; + + vdev_remove_child(mvd, cvd); + vdev_remove_child(pvd, mvd); + /* + * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. + * Otherwise, we could have detached an offline device, and when we + * go to import the pool we'll think we have two top-level vdevs, + * instead of a different version of the same top-level vdev. + */ + if (mvd->vdev_top == mvd) + cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (cvd == cvd->vdev_top) + vdev_top_transfer(mvd, cvd); + + ASSERT(mvd->vdev_children == 0); + vdev_free(mvd); +} + +int +vdev_metaslab_init(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + metaslab_class_t *mc; + uint64_t m; + uint64_t oldc = vd->vdev_ms_count; + uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; + metaslab_t **mspp; + int error; + + if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + return (0); + + ASSERT(oldc <= newc); + + if (vd->vdev_islog) + mc = spa->spa_log_class; + else + mc = spa->spa_normal_class; + + if (vd->vdev_mg == NULL) + vd->vdev_mg = metaslab_group_create(mc, vd); + + mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + + if (oldc != 0) { + bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); + } + + vd->vdev_ms = mspp; + vd->vdev_ms_count = newc; + + for (m = oldc; m < newc; m++) { + space_map_obj_t smo = { 0, 0, 0 }; + if (txg == 0) { + uint64_t object = 0; + error = dmu_read(mos, vd->vdev_ms_array, + m * sizeof (uint64_t), sizeof (uint64_t), &object); + if (error) + return (error); + if (object != 0) { + dmu_buf_t *db; + error = dmu_bonus_hold(mos, object, FTAG, &db); + if (error) + return (error); + ASSERT3U(db->db_size, >=, sizeof (smo)); + bcopy(db->db_data, &smo, sizeof (smo)); + ASSERT3U(smo.smo_object, ==, object); + dmu_buf_rele(db, FTAG); + } + } + vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, + m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); + } + + return (0); +} + +void +vdev_metaslab_fini(vdev_t *vd) +{ + uint64_t m; + uint64_t count = vd->vdev_ms_count; + + if (vd->vdev_ms != NULL) { + for (m = 0; m < count; m++) + if (vd->vdev_ms[m] != NULL) + metaslab_fini(vd->vdev_ms[m]); + kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); + vd->vdev_ms = NULL; + } +} + +typedef struct vdev_probe_stats { + boolean_t vps_readable; + boolean_t vps_writeable; + int vps_flags; + zio_t *vps_root; + vdev_t *vps_vd; +} vdev_probe_stats_t; + +static void +vdev_probe_done(zio_t *zio) +{ + vdev_probe_stats_t *vps = zio->io_private; + vdev_t *vd = vps->vps_vd; + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT(zio->io_vd == vd); + if (zio->io_error == 0) + vps->vps_readable = 1; + if (zio->io_error == 0 && (spa_mode & FWRITE)) { + zio_nowait(zio_write_phys(vps->vps_root, vd, + zio->io_offset, zio->io_size, zio->io_data, + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); + } else { + zio_buf_free(zio->io_data, zio->io_size); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(zio->io_vd == vd); + if (zio->io_error == 0) + vps->vps_writeable = 1; + zio_buf_free(zio->io_data, zio->io_size); + } else if (zio->io_type == ZIO_TYPE_NULL) { + ASSERT(zio->io_vd == NULL); + ASSERT(zio == vps->vps_root); + + vd->vdev_cant_read |= !vps->vps_readable; + vd->vdev_cant_write |= !vps->vps_writeable; + + if (vdev_readable(vd) && + (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + zio->io_error = 0; + } else { + ASSERT(zio->io_error != 0); + zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + zio->io_spa, vd, NULL, 0, 0); + zio->io_error = ENXIO; + } + kmem_free(vps, sizeof (*vps)); + } +} + +/* + * Determine whether this device is accessible by reading and writing + * to several known locations: the pad regions of each vdev label + * but the first (which we leave alone in case it contains a VTOC). + */ +zio_t * +vdev_probe(vdev_t *vd, zio_t *pio) +{ + spa_t *spa = vd->vdev_spa; + vdev_probe_stats_t *vps; + zio_t *zio; + + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only transition + * from TRUE to FALSE when we have the SCL_ZIO lock as writer; + * otherwise they can only transition from FALSE to TRUE. + * This ensures that any zio looking at these values can + * assume that failures persist for the life of the I/O. + * That's important because when a device has intermittent + * connectivity problems, we want to ensure that they're + * ascribed to the device (ENXIO) and not the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both values + * so the probe can reevaluate from first principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + + vps->vps_root = zio; + vps->vps_vd = vd; + + for (int l = 1; l < VDEV_LABELS; l++) { + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_pad)), + VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); + } + + return (zio); +} + +/* + * Prepare a virtual device for access. + */ +int +vdev_open(vdev_t *vd) +{ + int error; + int c; + uint64_t osize = 0; + uint64_t asize, psize; + uint64_t ashift = 0; + + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || + vd->vdev_state == VDEV_STATE_CANT_OPEN || + vd->vdev_state == VDEV_STATE_OFFLINE); + + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); + return (ENXIO); + } + + error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, ENXIO); + + if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + vd->vdev_stat.vs_aux); + return (error); + } + + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } + + for (c = 0; c < vd->vdev_children; c++) + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_NONE); + break; + } + + osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + + if (vd->vdev_children == 0) { + if (osize < SPA_MINDEVSIZE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (EOVERFLOW); + } + psize = osize; + asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + } else { + if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (EOVERFLOW); + } + psize = 0; + asize = osize; + } + + vd->vdev_psize = psize; + + if (vd->vdev_asize == 0) { + /* + * This is the first-ever open, so use the computed values. + * For testing purposes, a higher ashift can be requested. + */ + vd->vdev_asize = asize; + vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); + } else { + /* + * Make sure the alignment requirement hasn't increased. + */ + if (ashift > vd->vdev_top->vdev_ashift) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + + /* + * Make sure the device hasn't shrunk. + */ + if (asize < vd->vdev_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + asize > vd->vdev_asize) { + vd->vdev_asize = asize; + } + } + + /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + if (vd->vdev_ops->vdev_op_leaf && + (error = zio_wait(vdev_probe(vd, NULL))) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_IO_FAILURE); + return (error); + } + + /* + * If this is a top-level vdev, compute the raidz-deflation + * ratio. Note, we hard-code in 128k (1<<17) because it is the + * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE + * changes, this algorithm must never change, or we will + * inconsistently account for existing bp's. + */ + if (vd->vdev_top == vd) { + vd->vdev_deflate_ratio = (1<<17) / + (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); + } + + /* + * If a leaf vdev has a DTL, and seems healthy, then kick off a + * resilver. But don't do this if we are doing a reopen for a + * scrub, since this would just restart the scrub we are already + * doing. + */ + if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { + mutex_enter(&vd->vdev_dtl_lock); + if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) + spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); + mutex_exit(&vd->vdev_dtl_lock); + } + + return (0); +} + +/* + * Called once the vdevs are all opened, this routine validates the label + * contents. This needs to be done before vdev_load() so that we don't + * inadvertently do repair I/Os to the wrong device. + * + * This function will only return failure if one of the vdevs indicates that it + * has since been destroyed or exported. This is only possible if + * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state + * will be updated but the function will return 0. + */ +int +vdev_validate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + int c; + nvlist_t *label; + uint64_t guid, top_guid; + uint64_t state; + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_validate(vd->vdev_child[c]) != 0) + return (EBADF); + + /* + * If the device has already failed, or was marked offline, don't do + * any further validation. Otherwise, label I/O will fail and we will + * overwrite the previous state. + */ + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + + if ((label = vdev_label_read_config(vd)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &guid) != 0 || guid != spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + /* + * If this vdev just became a top-level vdev because its + * sibling was detached, it will have adopted the parent's + * vdev guid -- but the label may or may not be on disk yet. + * Fortunately, either version of the label will have the + * same top guid, so if we're a top-level vdev, we can + * safely compare to that instead. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0 || + (vd->vdev_guid != guid && + (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + nvlist_free(label); + + if (spa->spa_load_state == SPA_LOAD_OPEN && + state != POOL_STATE_ACTIVE) + return (EBADF); + + /* + * If we were able to open and validate a vdev that was + * previously marked permanently unavailable, clear that state + * now. + */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + } + + return (0); +} + +/* + * Close a virtual device. + */ +void +vdev_close(vdev_t *vd) +{ + vd->vdev_ops->vdev_op_close(vd); + + vdev_cache_purge(vd); + + /* + * We record the previous state before we close it, so that if we are + * doing a reopen(), we don't generate FMA ereports if we notice that + * it's still faulted. + */ + vd->vdev_prevstate = vd->vdev_state; + + if (vd->vdev_offline) + vd->vdev_state = VDEV_STATE_OFFLINE; + else + vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; +} + +void +vdev_reopen(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + vdev_close(vd); + (void) vdev_open(vd); + + /* + * Call vdev_validate() here to make sure we have the same device. + * Otherwise, a device with an invalid label could be successfully + * opened in response to vdev_reopen(). + */ + if (vd->vdev_aux) { + (void) vdev_validate_aux(vd); + if (vdev_readable(vd) && vdev_writeable(vd) && + !l2arc_vdev_present(vd)) { + uint64_t size = vdev_get_rsize(vd); + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + } else { + (void) vdev_validate(vd); + } + + /* + * Reassess parent vdev's health. + */ + vdev_propagate_state(vd); +} + +int +vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) +{ + int error; + + /* + * Normally, partial opens (e.g. of a mirror) are allowed. + * For a create, however, we want to fail the request if + * there are any components we can't open. + */ + error = vdev_open(vd); + + if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { + vdev_close(vd); + return (error ? error : ENXIO); + } + + /* + * Recursively initialize all labels. + */ + if ((error = vdev_label_init(vd, txg, isreplacing ? + VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { + vdev_close(vd); + return (error); + } + + return (0); +} + +/* + * The is the latter half of vdev_create(). It is distinct because it + * involves initiating transactions in order to do metaslab creation. + * For creation, we want to try to create all vdevs at once and then undo it + * if anything fails; this is much harder if we have pending transactions. + */ +void +vdev_init(vdev_t *vd, uint64_t txg) +{ + /* + * Aim for roughly 200 metaslabs per vdev. + */ + vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); + vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); + + /* + * Initialize the vdev's metaslabs. This can't fail because + * there's nothing to read when creating all new metaslabs. + */ + VERIFY(vdev_metaslab_init(vd, txg) == 0); +} + +void +vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) +{ + ASSERT(vd == vd->vdev_top); + ASSERT(ISP2(flags)); + + if (flags & VDD_METASLAB) + (void) txg_list_add(&vd->vdev_ms_list, arg, txg); + + if (flags & VDD_DTL) + (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); + + (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); +} + +void +vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +{ + mutex_enter(sm->sm_lock); + if (!space_map_contains(sm, txg, size)) + space_map_add(sm, txg, size); + mutex_exit(sm->sm_lock); +} + +int +vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +{ + int dirty; + + /* + * Quick test without the lock -- covers the common case that + * there are no dirty time segments. + */ + if (sm->sm_space == 0) + return (0); + + mutex_enter(sm->sm_lock); + dirty = space_map_contains(sm, txg, size); + mutex_exit(sm->sm_lock); + + return (dirty); +} + +/* + * Reassess DTLs after a config change or scrub completion. + */ +void +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +{ + spa_t *spa = vd->vdev_spa; + int c; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + if (scrub_txg != 0 && + (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { + /* XXX should check scrub_done? */ + /* + * We completed a scrub up to scrub_txg. If we + * did it without rebooting, then the scrub dtl + * will be valid, so excise the old region and + * fold in the scrub dtl. Otherwise, leave the + * dtl as-is if there was an error. + */ + space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); + space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + } + if (scrub_done) + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + + if (txg != 0) + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); + return; + } + + /* + * Make sure the DTLs are always correct under the scrub lock. + */ + if (vd == spa->spa_root_vdev) + mutex_enter(&spa->spa_scrub_lock); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); + mutex_enter(&vd->vdev_dtl_lock); + space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); + space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + } + + if (vd == spa->spa_root_vdev) + mutex_exit(&spa->spa_scrub_lock); +} + +static int +vdev_dtl_load(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + space_map_obj_t *smo = &vd->vdev_dtl; + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *db; + int error; + + ASSERT(vd->vdev_children == 0); + + if (smo->smo_object == 0) + return (0); + + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) + return (error); + + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(db->db_data, smo, sizeof (*smo)); + dmu_buf_rele(db, FTAG); + + mutex_enter(&vd->vdev_dtl_lock); + error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); + mutex_exit(&vd->vdev_dtl_lock); + + return (error); +} + +void +vdev_dtl_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + space_map_obj_t *smo = &vd->vdev_dtl; + space_map_t *sm = &vd->vdev_dtl_map; + objset_t *mos = spa->spa_meta_objset; + space_map_t smsync; + kmutex_t smlock; + dmu_buf_t *db; + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + if (vd->vdev_detached) { + if (smo->smo_object != 0) { + int err = dmu_object_free(mos, smo->smo_object, tx); + ASSERT3U(err, ==, 0); + smo->smo_object = 0; + } + dmu_tx_commit(tx); + return; + } + + if (smo->smo_object == 0) { + ASSERT(smo->smo_objsize == 0); + ASSERT(smo->smo_alloc == 0); + smo->smo_object = dmu_object_alloc(mos, + DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, + DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); + ASSERT(smo->smo_object != 0); + vdev_config_dirty(vd->vdev_top); + } + + mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); + + space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, + &smlock); + + mutex_enter(&smlock); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_walk(sm, space_map_add, &smsync); + mutex_exit(&vd->vdev_dtl_lock); + + space_map_truncate(smo, mos, tx); + space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); + + space_map_destroy(&smsync); + + mutex_exit(&smlock); + mutex_destroy(&smlock); + + VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); +} + +/* + * Determine if resilver is needed, and if so the txg range. + */ +boolean_t +vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) +{ + boolean_t needed = B_FALSE; + uint64_t thismin = UINT64_MAX; + uint64_t thismax = 0; + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + space_seg_t *ss; + + ss = avl_first(&vd->vdev_dtl_map.sm_root); + thismin = ss->ss_start - 1; + ss = avl_last(&vd->vdev_dtl_map.sm_root); + thismax = ss->ss_end; + needed = B_TRUE; + } + mutex_exit(&vd->vdev_dtl_lock); + } else { + int c; + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + uint64_t cmin, cmax; + + if (vdev_resilver_needed(cvd, &cmin, &cmax)) { + thismin = MIN(thismin, cmin); + thismax = MAX(thismax, cmax); + needed = B_TRUE; + } + } + } + + if (needed && minp) { + *minp = thismin; + *maxp = thismax; + } + return (needed); +} + +void +vdev_load(vdev_t *vd) +{ + int c; + + /* + * Recursively load all children. + */ + for (c = 0; c < vd->vdev_children; c++) + vdev_load(vd->vdev_child[c]); + + /* + * If this is a top-level vdev, initialize its metaslabs. + */ + if (vd == vd->vdev_top && + (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || + vdev_metaslab_init(vd, 0) != 0)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + + /* + * If this is a leaf vdev, load its DTL. + */ + if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); +} + +/* + * The special vdev case is used for hot spares and l2cache devices. Its + * sole purpose it to set the vdev state for the associated vdev. To do this, + * we make sure that we can open the underlying device, then try to read the + * label, and make sure that the label is sane and that it hasn't been + * repurposed to another pool. + */ +int +vdev_validate_aux(vdev_t *vd) +{ + nvlist_t *label; + uint64_t guid, version; + uint64_t state; + + if (!vdev_readable(vd)) + return (0); + + if ((label = vdev_label_read_config(vd)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (-1); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || + version > SPA_VERSION || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || + guid != vd->vdev_guid || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (-1); + } + + /* + * We don't actually check the pool state here. If it's in fact in + * use by another pool, we update this fact on the fly when requested. + */ + nvlist_free(label); + return (0); +} + +void +vdev_sync_done(vdev_t *vd, uint64_t txg) +{ + metaslab_t *msp; + + while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + metaslab_sync_done(msp, txg); +} + +void +vdev_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *lvd; + metaslab_t *msp; + dmu_tx_t *tx; + + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { + ASSERT(vd == vd->vdev_top); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); + ASSERT(vd->vdev_ms_array != 0); + vdev_config_dirty(vd); + dmu_tx_commit(tx); + } + + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { + metaslab_sync(msp, txg); + (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); + } + + while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) + vdev_dtl_sync(lvd, txg); + + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); +} + +uint64_t +vdev_psize_to_asize(vdev_t *vd, uint64_t psize) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize)); +} + +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. + */ +int +vdev_fault(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted cause the top-level vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (vdev_readable(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) + return (spa_vdev_state_exit(spa, NULL, 0)); + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open. + */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + vd->vdev_offline = B_FALSE; + vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); + vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); + vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; + + (void) spa_vdev_state_exit(spa, vd, 0); + + VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); + + return (0); +} + +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * If the device isn't already offline, try to offline it. + */ + if (!vd->vdev_offline) { + /* + * If this device's top-level vdev has a non-empty DTL, + * don't allow the device to be offlined. + * + * XXX -- make this more precise by allowing the offline + * as long as the remaining devices don't have any DTL holes. + */ + if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + return (spa_vdev_state_exit(spa, NULL, EBUSY)); + + /* + * Offline this device and reopen its top-level vdev. + * If this action results in the top-level vdev becoming + * unusable, undo it and fail the request. + */ + vd->vdev_offline = B_TRUE; + vdev_reopen(vd->vdev_top); + if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + vd->vdev_offline = B_FALSE; + vdev_reopen(vd->vdev_top); + return (spa_vdev_state_exit(spa, NULL, EBUSY)); + } + } + + vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Clear the error counts associated with this vdev. Unlike vdev_online() and + * vdev_offline(), we assume the spa config is locked. We also clear all + * children. If 'vd' is NULL, then the user wants to clear all vdevs. + */ +void +vdev_clear(spa_t *spa, vdev_t *vd) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == NULL) + vd = rvd; + + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state or have experienced failed I/O, then + * clear the persistent state and attempt to reopen the device. We + * also mark the vdev config dirty, so that the new faulted state is + * written out to disk. + */ + if (vd->vdev_faulted || vd->vdev_degraded || + !vdev_readable(vd) || !vdev_writeable(vd)) { + + vd->vdev_faulted = vd->vdev_degraded = 0; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + + vdev_reopen(vd); + + if (vd != rvd) + vdev_state_dirty(vd->vdev_top); + + if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } +} + +boolean_t +vdev_is_dead(vdev_t *vd) +{ + return (vd->vdev_state < VDEV_STATE_DEGRADED); +} + +boolean_t +vdev_readable(vdev_t *vd) +{ + return (!vdev_is_dead(vd) && !vd->vdev_cant_read); +} + +boolean_t +vdev_writeable(vdev_t *vd) +{ + return (!vdev_is_dead(vd) && !vd->vdev_cant_write); +} + +boolean_t +vdev_allocatable(vdev_t *vd) +{ + /* + * We currently allow allocations from vdevs which maybe in the + * process of reopening (i.e. VDEV_STATE_CLOSED). If the device + * fails to reopen then we'll catch it later when we're holding + * the proper locks. + */ + return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) && + !vd->vdev_cant_write); +} + +boolean_t +vdev_accessible(vdev_t *vd, zio_t *zio) +{ + ASSERT(zio->io_vd == vd); + + if (vdev_is_dead(vd) || vd->vdev_remove_wanted) + return (B_FALSE); + + if (zio->io_type == ZIO_TYPE_READ) + return (!vd->vdev_cant_read); + + if (zio->io_type == ZIO_TYPE_WRITE) + return (!vd->vdev_cant_write); + + return (B_TRUE); +} + +/* + * Get statistics for the given vdev. + */ +void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + + mutex_enter(&vd->vdev_stat_lock); + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + vs->vs_rsize = vdev_get_rsize(vd); + mutex_exit(&vd->vdev_stat_lock); + + /* + * If we're getting stats on the root vdev, aggregate the I/O counts + * over all top-level vdevs (i.e. the direct children of the root). + */ + if (vd == rvd) { + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + vdev_stat_t *cvs = &cvd->vdev_stat; + + mutex_enter(&vd->vdev_stat_lock); + for (int t = 0; t < ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } + vs->vs_scrub_examined += cvs->vs_scrub_examined; + mutex_exit(&vd->vdev_stat_lock); + } + } +} + +void +vdev_clear_stats(vdev_t *vd) +{ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space = 0; + vd->vdev_stat.vs_dspace = 0; + vd->vdev_stat.vs_alloc = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void +vdev_stat_update(zio_t *zio, uint64_t psize) +{ + vdev_t *rvd = zio->io_spa->spa_root_vdev; + vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; + vdev_t *pvd; + uint64_t txg = zio->io_txg; + vdev_stat_t *vs = &vd->vdev_stat; + zio_type_t type = zio->io_type; + int flags = zio->io_flags; + + /* + * If this i/o is a gang leader, it didn't do any actual work. + */ + if (zio->io_gang_tree) + return; + + if (zio->io_error == 0) { + /* + * If this is a root i/o, don't count it -- we've already + * counted the top-level vdevs, and vdev_get_stats() will + * aggregate them when asked. This reduces contention on + * the root vdev_stat_lock and implicitly handles blocks + * that compress away to holes, for which there is no i/o. + * (Holes never create vdev children, so all the counters + * remain zero, which is what we want.) + * + * Note: this only applies to successful i/o (io_error == 0) + * because unlike i/o counts, errors are not additive. + * When reading a ditto block, for example, failure of + * one top-level vdev does not imply a root-level error. + */ + if (vd == rvd) + return; + + ASSERT(vd == zio->io_vd); + if (!(flags & ZIO_FLAG_IO_BYPASS)) { + mutex_enter(&vd->vdev_stat_lock); + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + mutex_exit(&vd->vdev_stat_lock); + } + if (flags & ZIO_FLAG_IO_REPAIR) { + ASSERT(zio->io_delegate_list == NULL); + mutex_enter(&vd->vdev_stat_lock); + if (flags & ZIO_FLAG_SCRUB_THREAD) + vs->vs_scrub_repaired += psize; + else + vs->vs_self_healed += psize; + mutex_exit(&vd->vdev_stat_lock); + } + return; + } + + if (flags & ZIO_FLAG_SPECULATIVE) + return; + + mutex_enter(&vd->vdev_stat_lock); + if (type == ZIO_TYPE_READ) { + if (zio->io_error == ECKSUM) + vs->vs_checksum_errors++; + else + vs->vs_read_errors++; + } + if (type == ZIO_TYPE_WRITE) + vs->vs_write_errors++; + mutex_exit(&vd->vdev_stat_lock); + + if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { + if (flags & ZIO_FLAG_SCRUB_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); + } + if (!(flags & ZIO_FLAG_IO_REPAIR)) { + if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + return; + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + } + } +} + +void +vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) +{ + int c; + vdev_stat_t *vs = &vd->vdev_stat; + + for (c = 0; c < vd->vdev_children; c++) + vdev_scrub_stat_update(vd->vdev_child[c], type, complete); + + mutex_enter(&vd->vdev_stat_lock); + + if (type == POOL_SCRUB_NONE) { + /* + * Update completion and end time. Leave everything else alone + * so we can report what happened during the previous scrub. + */ + vs->vs_scrub_complete = complete; + vs->vs_scrub_end = gethrestime_sec(); + } else { + vs->vs_scrub_type = type; + vs->vs_scrub_complete = 0; + vs->vs_scrub_examined = 0; + vs->vs_scrub_repaired = 0; + vs->vs_scrub_start = gethrestime_sec(); + vs->vs_scrub_end = 0; + } + + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Update the in-core space usage stats for this vdev and the root vdev. + */ +void +vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, + boolean_t update_root) +{ + int64_t dspace_delta = space_delta; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(vd == vd->vdev_top); + + /* + * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion + * factor. We must calculate this here and not at the root vdev + * because the root vdev's psize-to-asize is simply the max of its + * childrens', thus not accurate enough for us. + */ + ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space += space_delta; + vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&vd->vdev_stat_lock); + + if (update_root) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + /* + * Don't count non-normal (e.g. intent log) space as part of + * the pool's capacity. + */ + if (vd->vdev_mg->mg_class != spa->spa_normal_class) + return; + + mutex_enter(&rvd->vdev_stat_lock); + rvd->vdev_stat.vs_space += space_delta; + rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&rvd->vdev_stat_lock); + } +} + +/* + * Mark a top-level vdev's config as dirty, placing it on the dirty list + * so that it will be written out next time the vdev configuration is synced. + * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. + */ +void +vdev_config_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int c; + + /* + * If this is an aux vdev (as with l2cache devices), then we update the + * vdev config manually and set the sync flag. + */ + if (vd->vdev_aux != NULL) { + spa_aux_vdev_t *sav = vd->vdev_aux; + nvlist_t **aux; + uint_t naux; + + for (c = 0; c < sav->sav_count; c++) { + if (sav->sav_vdevs[c] == vd) + break; + } + + if (c == sav->sav_count) { + /* + * We're being removed. There's nothing more to do. + */ + ASSERT(sav->sav_sync == B_TRUE); + return; + } + + sav->sav_sync = B_TRUE; + + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + + ASSERT(c < naux); + + /* + * Setting the nvlist in the middle if the array is a little + * sketchy, but it will work. + */ + nvlist_free(aux[c]); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + + return; + } + + /* + * The dirty list is protected by the SCL_CONFIG lock. The caller + * must either hold SCL_CONFIG as writer, or must be the sync thread + * (which holds SCL_CONFIG as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_config_dirty(rvd->vdev_child[c]); + } else { + ASSERT(vd == vd->vdev_top); + + if (!list_link_active(&vd->vdev_config_dirty_node)) + list_insert_head(&spa->spa_config_dirty_list, vd); + } +} + +void +vdev_config_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_config_dirty_node)); + list_remove(&spa->spa_config_dirty_list, vd); +} + +/* + * Mark a top-level vdev's state as dirty, so that the next pass of + * spa_sync() can convert this into vdev_config_dirty(). We distinguish + * the state changes from larger config changes because they require + * much less locking, and are often needed for administrative actions. + */ +void +vdev_state_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd == vd->vdev_top); + + /* + * The state list is protected by the SCL_STATE lock. The caller + * must either hold SCL_STATE as writer, or must be the sync thread + * (which holds SCL_STATE as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + if (!list_link_active(&vd->vdev_state_dirty_node)) + list_insert_head(&spa->spa_state_dirty_list, vd); +} + +void +vdev_state_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_state_dirty_node)); + list_remove(&spa->spa_state_dirty_list, vd); +} + +/* + * Propagate vdev state up from children to parent. + */ +void +vdev_propagate_state(vdev_t *vd) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + int degraded = 0, faulted = 0; + int corrupted = 0; + int c; + vdev_t *child; + + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && (spa_mode & FWRITE))) { + /* + * Root special: if there is a top-level log + * device, treat the root vdev as if it were + * degraded. + */ + if (child->vdev_islog && vd == rvd) + degraded++; + else + faulted++; + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { + degraded++; + } + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } + + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a top-level vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); +} + +/* + * Set a vdev's state. If this is during an open, we don't update the parent + * state, because we're in the process of opening children depth-first. + * Otherwise, we propagate the change to the parent. + * + * If this routine places a device in a faulted state, an appropriate ereport is + * generated. + */ +void +vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) +{ + uint64_t save_state; + spa_t *spa = vd->vdev_spa; + + if (state == vd->vdev_state) { + vd->vdev_stat.vs_aux = aux; + return; + } + + save_state = vd->vdev_state; + + vd->vdev_state = state; + vd->vdev_stat.vs_aux = aux; + + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device. Otherwise, we keep accessible + * but invalid devices open forever. We don't call vdev_close() itself, + * because that implies some extra checks (offline, etc) that we don't + * want here. This is limited to leaf devices, because otherwise + * closing the device will affect other children. + */ + if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. + */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we fail to open a vdev during an import, we mark it as + * "not available", which signifies that it was never there to + * begin with. Failure to open such a device is not considered + * an error. + */ + if (spa->spa_load_state == SPA_LOAD_IMPORT && + !spa->spa_import_faulted && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_not_present = 1; + + /* + * Post the appropriate ereport. If the 'prevstate' field is + * set to something other than VDEV_STATE_UNKNOWN, it indicates + * that this is part of a vdev_reopen(). In this case, we don't + * want to post the ereport if the device was already in the + * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. + */ + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && + vd != spa->spa_root_vdev) { + const char *class; + + switch (aux) { + case VDEV_AUX_OPEN_FAILED: + class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; + break; + case VDEV_AUX_CORRUPT_DATA: + class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; + break; + case VDEV_AUX_NO_REPLICAS: + class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; + break; + case VDEV_AUX_BAD_GUID_SUM: + class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; + break; + case VDEV_AUX_TOO_SMALL: + class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; + break; + case VDEV_AUX_BAD_LABEL: + class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; + break; + case VDEV_AUX_IO_FAILURE: + class = FM_EREPORT_ZFS_IO_FAILURE; + break; + default: + class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; + } + + zfs_ereport_post(class, spa, vd, NULL, save_state, 0); + } + + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; + } + + if (!isopen) + vdev_propagate_state(vd); +} + +/* + * Check the vdev configuration to ensure that it's capable of supporting + * a root pool. Currently, we do not support RAID-Z or partial configuration. + * In addition, only a single top-level vdev is allowed and none of the leaves + * can be wholedisks. + */ +boolean_t +vdev_is_bootable(vdev_t *vd) +{ + int c; + + if (!vd->vdev_ops->vdev_op_leaf) { + char *vdev_type = vd->vdev_ops->vdev_op_type; + + if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && + vd->vdev_children > 1) { + return (B_FALSE); + } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { + return (B_FALSE); + } + } else if (vd->vdev_wholedisk == 1) { + return (B_FALSE); + } + + for (c = 0; c < vd->vdev_children; c++) { + if (!vdev_is_bootable(vd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); +} diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c new file mode 100644 index 000000000..5a7b59f6e --- /dev/null +++ b/module/zfs/vdev_cache.c @@ -0,0 +1,425 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/kstat.h> + +/* + * Virtual device read-ahead caching. + * + * This file implements a simple LRU read-ahead cache. When the DMU reads + * a given block, it will often want other, nearby blocks soon thereafter. + * We take advantage of this by reading a larger disk region and caching + * the result. In the best case, this can turn 128 back-to-back 512-byte + * reads into a single 64k read followed by 127 cache hits; this reduces + * latency dramatically. In the worst case, it can turn an isolated 512-byte + * read into a 64k read, which doesn't affect latency all that much but is + * terribly wasteful of bandwidth. A more intelligent version of the cache + * could keep track of access patterns and not do read-ahead unless it sees + * at least two temporally close I/Os to the same region. Currently, only + * metadata I/O is inflated. A futher enhancement could take advantage of + * more semantic information about the I/O. And it could use something + * faster than an AVL tree; that was chosen solely for convenience. + * + * There are five cache operations: allocate, fill, read, write, evict. + * + * (1) Allocate. This reserves a cache entry for the specified region. + * We separate the allocate and fill operations so that multiple threads + * don't generate I/O for the same cache miss. + * + * (2) Fill. When the I/O for a cache miss completes, the fill routine + * places the data in the previously allocated cache entry. + * + * (3) Read. Read data from the cache. + * + * (4) Write. Update cache contents after write completion. + * + * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry + * if the total cache size exceeds zfs_vdev_cache_size. + */ + +/* + * These tunables are for performance analysis. + */ +/* + * All i/os smaller than zfs_vdev_cache_max will be turned into + * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software + * track buffer). At most zfs_vdev_cache_size bytes will be kept in each + * vdev's vdev_cache. + */ +int zfs_vdev_cache_max = 1<<14; /* 16KB */ +int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */ +int zfs_vdev_cache_bshift = 16; + +#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ + +kstat_t *vdc_ksp = NULL; + +typedef struct vdc_stats { + kstat_named_t vdc_stat_delegations; + kstat_named_t vdc_stat_hits; + kstat_named_t vdc_stat_misses; +} vdc_stats_t; + +static vdc_stats_t vdc_stats = { + { "delegations", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 } +}; + +#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); + +static int +vdev_cache_offset_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = a1; + const vdev_cache_entry_t *ve2 = a2; + + if (ve1->ve_offset < ve2->ve_offset) + return (-1); + if (ve1->ve_offset > ve2->ve_offset) + return (1); + return (0); +} + +static int +vdev_cache_lastused_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = a1; + const vdev_cache_entry_t *ve2 = a2; + + if (ve1->ve_lastused < ve2->ve_lastused) + return (-1); + if (ve1->ve_lastused > ve2->ve_lastused) + return (1); + + /* + * Among equally old entries, sort by offset to ensure uniqueness. + */ + return (vdev_cache_offset_compare(a1, a2)); +} + +/* + * Evict the specified entry from the cache. + */ +static void +vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) +{ + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT(ve->ve_fill_io == NULL); + ASSERT(ve->ve_data != NULL); + + avl_remove(&vc->vc_lastused_tree, ve); + avl_remove(&vc->vc_offset_tree, ve); + zio_buf_free(ve->ve_data, VCBS); + kmem_free(ve, sizeof (vdev_cache_entry_t)); +} + +/* + * Allocate an entry in the cache. At the point we don't have the data, + * we're just creating a placeholder so that multiple threads don't all + * go off and read the same blocks. + */ +static vdev_cache_entry_t * +vdev_cache_allocate(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + uint64_t offset = P2ALIGN(zio->io_offset, VCBS); + vdev_cache_entry_t *ve; + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + + if (zfs_vdev_cache_size == 0) + return (NULL); + + /* + * If adding a new entry would exceed the cache size, + * evict the oldest entry (LRU). + */ + if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > + zfs_vdev_cache_size) { + ve = avl_first(&vc->vc_lastused_tree); + if (ve->ve_fill_io != NULL) + return (NULL); + ASSERT(ve->ve_hits != 0); + vdev_cache_evict(vc, ve); + } + + ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); + ve->ve_offset = offset; + ve->ve_lastused = lbolt; + ve->ve_data = zio_buf_alloc(VCBS); + + avl_add(&vc->vc_offset_tree, ve); + avl_add(&vc->vc_lastused_tree, ve); + + return (ve); +} + +static void +vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) +{ + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT(ve->ve_fill_io == NULL); + + if (ve->ve_lastused != lbolt) { + avl_remove(&vc->vc_lastused_tree, ve); + ve->ve_lastused = lbolt; + avl_add(&vc->vc_lastused_tree, ve); + } + + ve->ve_hits++; + bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); +} + +/* + * Fill a previously allocated cache entry with data. + */ +static void +vdev_cache_fill(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve = zio->io_private; + zio_t *dio; + + ASSERT(zio->io_size == VCBS); + + /* + * Add data to the cache. + */ + mutex_enter(&vc->vc_lock); + + ASSERT(ve->ve_fill_io == zio); + ASSERT(ve->ve_offset == zio->io_offset); + ASSERT(ve->ve_data == zio->io_data); + + ve->ve_fill_io = NULL; + + /* + * Even if this cache line was invalidated by a missed write update, + * any reads that were queued up before the missed update are still + * valid, so we can satisfy them from this line before we evict it. + */ + for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) + vdev_cache_hit(vc, ve, dio); + + if (zio->io_error || ve->ve_missed_update) + vdev_cache_evict(vc, ve); + + mutex_exit(&vc->vc_lock); + + while ((dio = zio->io_delegate_list) != NULL) { + zio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + dio->io_error = zio->io_error; + zio_execute(dio); + } +} + +/* + * Read data from the cache. Returns 0 on cache hit, errno on a miss. + */ +int +vdev_cache_read(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); + zio_t *fio; + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + if (zio->io_flags & ZIO_FLAG_DONT_CACHE) + return (EINVAL); + + if (zio->io_size > zfs_vdev_cache_max) + return (EOVERFLOW); + + /* + * If the I/O straddles two or more cache blocks, don't cache it. + */ + if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) + return (EXDEV); + + ASSERT(cache_phase + zio->io_size <= VCBS); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = cache_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); + + if (ve != NULL) { + if (ve->ve_missed_update) { + mutex_exit(&vc->vc_lock); + return (ESTALE); + } + + if ((fio = ve->ve_fill_io) != NULL) { + zio->io_delegate_next = fio->io_delegate_list; + fio->io_delegate_list = zio; + zio_vdev_io_bypass(zio); + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_delegations); + return (0); + } + + vdev_cache_hit(vc, ve, zio); + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + zio_execute(zio); + VDCSTAT_BUMP(vdc_stat_hits); + return (0); + } + + ve = vdev_cache_allocate(zio); + + if (ve == NULL) { + mutex_exit(&vc->vc_lock); + return (ENOMEM); + } + + fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, + ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, + ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); + + ve->ve_fill_io = fio; + fio->io_delegate_list = zio; + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + zio_nowait(fio); + VDCSTAT_BUMP(vdc_stat_misses); + + return (0); +} + +/* + * Update cache contents upon write completion. + */ +void +vdev_cache_write(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t io_start = zio->io_offset; + uint64_t io_end = io_start + zio->io_size; + uint64_t min_offset = P2ALIGN(io_start, VCBS); + uint64_t max_offset = P2ROUNDUP(io_end, VCBS); + avl_index_t where; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = min_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); + + if (ve == NULL) + ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); + + while (ve != NULL && ve->ve_offset < max_offset) { + uint64_t start = MAX(ve->ve_offset, io_start); + uint64_t end = MIN(ve->ve_offset + VCBS, io_end); + + if (ve->ve_fill_io != NULL) { + ve->ve_missed_update = 1; + } else { + bcopy((char *)zio->io_data + start - io_start, + ve->ve_data + start - ve->ve_offset, end - start); + } + ve = AVL_NEXT(&vc->vc_offset_tree, ve); + } + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_init(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_offset_node)); + + avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_lastused_node)); +} + +void +vdev_cache_fini(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + vdev_cache_purge(vd); + + avl_destroy(&vc->vc_offset_tree); + avl_destroy(&vc->vc_lastused_tree); + + mutex_destroy(&vc->vc_lock); +} + +void +vdev_cache_stat_init(void) +{ + vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (vdc_ksp != NULL) { + vdc_ksp->ks_data = &vdc_stats; + kstat_install(vdc_ksp); + } +} + +void +vdev_cache_stat_fini(void) +{ + if (vdc_ksp != NULL) { + kstat_delete(vdc_ksp); + vdc_ksp = NULL; + } +} diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c new file mode 100644 index 000000000..dc0e920bf --- /dev/null +++ b/module/zfs/vdev_file.c @@ -0,0 +1,188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_file.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> + +/* + * Virtual device vector for files. + */ + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_file_t *vf; + vnode_t *vp; + vattr_t vattr; + int error; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_vnode = vp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (vp->v_type != VREG) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (ENODEV); + } +#endif + /* + * Determine the physical size of the file. + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *psize = vattr.va_size; + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vf == NULL) + return; + + if (vf->vf_vnode != NULL) { + (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); + VN_RELE(vf->vf_vnode); + } + + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static int +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = ENXIO; + return (ZIO_PIPELINE_CONTINUE); + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + kcred, NULL); + break; + default: + zio->io_error = ENOTSUP; + } + + return (ZIO_PIPELINE_CONTINUE); + } + + zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? + UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, + zio->io_size, zio->io_offset, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, &resid); + + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + zio_interrupt(zio); + + return (ZIO_PIPELINE_STOP); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c new file mode 100644 index 000000000..bf930466f --- /dev/null +++ b/module/zfs/vdev_label.c @@ -0,0 +1,1078 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Virtual Device Labels + * --------------------- + * + * The vdev label serves several distinct purposes: + * + * 1. Uniquely identify this device as part of a ZFS pool and confirm its + * identity within the pool. + * + * 2. Verify that all the devices given in a configuration are present + * within the pool. + * + * 3. Determine the uberblock for the pool. + * + * 4. In case of an import operation, determine the configuration of the + * toplevel vdev of which it is a part. + * + * 5. If an import operation cannot find all the devices in the pool, + * provide enough information to the administrator to determine which + * devices are missing. + * + * It is important to note that while the kernel is responsible for writing the + * label, it only consumes the information in the first three cases. The + * latter information is only consumed in userland when determining the + * configuration to import a pool. + * + * + * Label Organization + * ------------------ + * + * Before describing the contents of the label, it's important to understand how + * the labels are written and updated with respect to the uberblock. + * + * When the pool configuration is altered, either because it was newly created + * or a device was added, we want to update all the labels such that we can deal + * with fatal failure at any point. To this end, each disk has two labels which + * are updated before and after the uberblock is synced. Assuming we have + * labels and an uberblock with the following transaction groups: + * + * L1 UB L2 + * +------+ +------+ +------+ + * | | | | | | + * | t10 | | t10 | | t10 | + * | | | | | | + * +------+ +------+ +------+ + * + * In this stable state, the labels and the uberblock were all updated within + * the same transaction group (10). Each label is mirrored and checksummed, so + * that we can detect when we fail partway through writing the label. + * + * In order to identify which labels are valid, the labels are written in the + * following manner: + * + * 1. For each vdev, update 'L1' to the new label + * 2. Update the uberblock + * 3. For each vdev, update 'L2' to the new label + * + * Given arbitrary failure, we can determine the correct label to use based on + * the transaction group. If we fail after updating L1 but before updating the + * UB, we will notice that L1's transaction group is greater than the uberblock, + * so L2 must be valid. If we fail after writing the uberblock but before + * writing L2, we will notice that L2's transaction group is less than L1, and + * therefore L1 is valid. + * + * Another added complexity is that not every label is updated when the config + * is synced. If we add a single device, we do not want to have to re-write + * every label for every device in the pool. This means that both L1 and L2 may + * be older than the pool uberblock, because the necessary information is stored + * on another vdev. + * + * + * On-disk Format + * -------------- + * + * The vdev label consists of two distinct parts, and is wrapped within the + * vdev_label_t structure. The label includes 8k of padding to permit legacy + * VTOC disk labels, but is otherwise ignored. + * + * The first half of the label is a packed nvlist which contains pool wide + * properties, per-vdev properties, and configuration information. It is + * described in more detail below. + * + * The latter half of the label consists of a redundant array of uberblocks. + * These uberblocks are updated whenever a transaction group is committed, + * or when the configuration is updated. When a pool is loaded, we scan each + * vdev for the 'best' uberblock. + * + * + * Configuration Information + * ------------------------- + * + * The nvlist describing the pool and vdev contains the following elements: + * + * version ZFS on-disk version + * name Pool name + * state Pool state + * txg Transaction group in which this label was written + * pool_guid Unique identifier for this pool + * vdev_tree An nvlist describing vdev tree. + * + * Each leaf device label also contains the following: + * + * top_guid Unique ID for top-level vdev in which this is contained + * guid Unique ID for the leaf vdev + * + * The 'vs' configuration follows the format described in 'spa_config.c'. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Basic routines to read and write from a vdev label. + * Used throughout the rest of this file. + */ +uint64_t +vdev_label_offset(uint64_t psize, int l, uint64_t offset) +{ + ASSERT(offset < sizeof (vdev_label_t)); + ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); + + return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Returns back the vdev label associated with the passed in offset. + */ +int +vdev_label_number(uint64_t psize, uint64_t offset) +{ + int l; + + if (offset >= psize - VDEV_LABEL_END_SIZE) { + offset -= psize - VDEV_LABEL_END_SIZE; + offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); + } + l = offset / sizeof (vdev_label_t); + return (l < VDEV_LABELS ? l : -1); +} + +static void +vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int flags) +{ + ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == + SCL_STATE_ALL); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); + + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); +} + +static void +vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int flags) +{ + ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || + (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE) && + dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); + + zio_nowait(zio_write_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); +} + +/* + * Generate the nvlist representing this vdev's config. + */ +nvlist_t * +vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, + boolean_t isspare, boolean_t isl2cache) +{ + nvlist_t *nv = NULL; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, + vd->vdev_ops->vdev_op_type) == 0); + if (!isspare && !isl2cache) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) + == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + + if (vd->vdev_path != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + + if (vd->vdev_devid != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + + if (vd->vdev_physpath != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + + if (vd->vdev_nparity != 0) { + ASSERT(strcmp(vd->vdev_ops->vdev_op_type, + VDEV_TYPE_RAIDZ) == 0); + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vd->vdev_nparity == 1 || + (vd->vdev_nparity == 2 && + spa_version(spa) >= SPA_VERSION_RAID6)); + + /* + * Note that we'll add the nparity tag even on storage pools + * that only support a single parity device -- older software + * will just ignore it. + */ + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, + vd->vdev_nparity) == 0); + } + + if (vd->vdev_wholedisk != -1ULL) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + vd->vdev_wholedisk) == 0); + + if (vd->vdev_not_present) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); + + if (vd->vdev_isspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); + + if (!isspare && !isl2cache && vd == vd->vdev_top) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + vd->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + vd->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, + vd->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, + vd->vdev_islog) == 0); + } + + if (vd->vdev_dtl.smo_object != 0) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, + vd->vdev_dtl.smo_object) == 0); + + if (getstats) { + vdev_stat_t vs; + vdev_get_stats(vd, &vs); + VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + } + + if (!vd->vdev_ops->vdev_op_leaf) { + nvlist_t **child; + int c; + + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), + KM_SLEEP); + + for (c = 0; c < vd->vdev_children; c++) + child[c] = vdev_config_generate(spa, vd->vdev_child[c], + getstats, isspare, isl2cache); + + VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + child, vd->vdev_children) == 0); + + for (c = 0; c < vd->vdev_children; c++) + nvlist_free(child[c]); + + kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); + + } else { + if (vd->vdev_offline && !vd->vdev_tmpoffline) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, + B_TRUE) == 0); + if (vd->vdev_faulted) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, + B_TRUE) == 0); + if (vd->vdev_degraded) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, + B_TRUE) == 0); + if (vd->vdev_removed) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, + B_TRUE) == 0); + if (vd->vdev_unspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, + B_TRUE) == 0); + } + + return (nv); +} + +nvlist_t * +vdev_label_read_config(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *config = NULL; + vdev_phys_t *vp; + zio_t *zio; + int flags = + ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (!vdev_readable(vd)) + return (NULL); + + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + + for (int l = 0; l < VDEV_LABELS; l++) { + + zio = zio_root(spa, NULL, NULL, flags); + + vdev_label_read(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL, flags); + + if (zio_wait(zio) == 0 && + nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + &config, 0) == 0) + break; + + if (config != NULL) { + nvlist_free(config); + config = NULL; + } + } + + zio_buf_free(vp, sizeof (vdev_phys_t)); + + return (config); +} + +/* + * Determine if a device is in use. The 'spare_guid' parameter will be filled + * in with the device guid if this spare is active elsewhere on the system. + */ +static boolean_t +vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, + uint64_t *spare_guid, uint64_t *l2cache_guid) +{ + spa_t *spa = vd->vdev_spa; + uint64_t state, pool_guid, device_guid, txg, spare_pool; + uint64_t vdtxg = 0; + nvlist_t *label; + + if (spare_guid) + *spare_guid = 0ULL; + if (l2cache_guid) + *l2cache_guid = 0ULL; + + /* + * Read the label, if any, and perform some basic sanity checks. + */ + if ((label = vdev_label_read_config(vd)) == NULL) + return (B_FALSE); + + (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + &vdtxg); + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &device_guid) != 0) { + nvlist_free(label); + return (B_FALSE); + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0)) { + nvlist_free(label); + return (B_FALSE); + } + + nvlist_free(label); + + /* + * Check to see if this device indeed belongs to the pool it claims to + * be a part of. The only way this is allowed is if the device is a hot + * spare (which we check for later on). + */ + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + !spa_guid_exists(pool_guid, device_guid) && + !spa_spare_exists(device_guid, NULL, NULL) && + !spa_l2cache_exists(device_guid, NULL)) + return (B_FALSE); + + /* + * If the transaction group is zero, then this an initialized (but + * unused) label. This is only an error if the create transaction + * on-disk is the same as the one we're using now, in which case the + * user has attempted to add the same vdev multiple times in the same + * transaction. + */ + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + txg == 0 && vdtxg == crtxg) + return (B_TRUE); + + /* + * Check to see if this is a spare device. We do an explicit check for + * spa_has_spare() here because it may be on our pending list of spares + * to add. We also check if it is an l2cache device. + */ + if (spa_spare_exists(device_guid, &spare_pool, NULL) || + spa_has_spare(spa, device_guid)) { + if (spare_guid) + *spare_guid = device_guid; + + switch (reason) { + case VDEV_LABEL_CREATE: + case VDEV_LABEL_L2CACHE: + return (B_TRUE); + + case VDEV_LABEL_REPLACE: + return (!spa_has_spare(spa, device_guid) || + spare_pool != 0ULL); + + case VDEV_LABEL_SPARE: + return (spa_has_spare(spa, device_guid)); + } + } + + /* + * Check to see if this is an l2cache device. + */ + if (spa_l2cache_exists(device_guid, NULL)) + return (B_TRUE); + + /* + * If the device is marked ACTIVE, then this device is in use by another + * pool on the system. + */ + return (state == POOL_STATE_ACTIVE); +} + +/* + * Initialize a vdev label. We check to make sure each leaf device is not in + * use, and writable. We put down an initial label which we will later + * overwrite with a complete label. Note that it's important to do this + * sequentially, not in parallel, so that we catch cases of multiple use of the + * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with + * itself. + */ +int +vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *label; + vdev_phys_t *vp; + vdev_boot_header_t *vb; + uberblock_t *ub; + zio_t *zio; + char *buf; + size_t buflen; + int error; + uint64_t spare_guid, l2cache_guid; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + for (int c = 0; c < vd->vdev_children; c++) + if ((error = vdev_label_init(vd->vdev_child[c], + crtxg, reason)) != 0) + return (error); + + if (!vd->vdev_ops->vdev_op_leaf) + return (0); + + /* + * Dead vdevs cannot be initialized. + */ + if (vdev_is_dead(vd)) + return (EIO); + + /* + * Determine if the vdev is in use. + */ + if (reason != VDEV_LABEL_REMOVE && + vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) + return (EBUSY); + + ASSERT(reason != VDEV_LABEL_REMOVE || + vdev_inuse(vd, crtxg, reason, NULL, NULL)); + + /* + * If this is a request to add or replace a spare or l2cache device + * that is in use elsewhere on the system, then we must update the + * guid (which was initialized to a random value) to reflect the + * actual GUID (which is shared between multiple pools). + */ + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && + spare_guid != 0ULL) { + uint64_t guid_delta = spare_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding a spare, then it's already + * labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_SPARE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && + l2cache_guid != 0ULL) { + uint64_t guid_delta = l2cache_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding an l2cache, then it's + * already labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_L2CACHE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + + /* + * Initialize its label. + */ + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + bzero(vp, sizeof (vdev_phys_t)); + + /* + * Generate a label describing the pool and our top-level vdev. + * We mark it as being from txg 0 to indicate that it's not + * really part of an active pool just yet. The labels will + * be written again with a meaningful txg by spa_sync(). + */ + if (reason == VDEV_LABEL_SPARE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + /* + * For inactive hot spares, we generate a special label that + * identifies as a mutually shared hot spare. We write the + * label if we are adding a hot spare, or if we are removing an + * active hot spare (in which case we want to revert the + * labels). + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_SPARE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else if (reason == VDEV_LABEL_L2CACHE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* + * For level 2 ARC devices, add a special label. + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_L2CACHE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else { + label = spa_config_generate(spa, vd, 0ULL, B_FALSE); + + /* + * Add our creation time. This allows us to detect multiple + * vdev uses as described above, and automatically expires if we + * fail. + */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + crtxg) == 0); + } + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error != 0) { + nvlist_free(label); + zio_buf_free(vp, sizeof (vdev_phys_t)); + /* EFAULT means nvlist_pack ran out of room */ + return (error == EFAULT ? ENAMETOOLONG : EINVAL); + } + + /* + * Initialize boot block header. + */ + vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); + bzero(vb, sizeof (vdev_boot_header_t)); + vb->vb_magic = VDEV_BOOT_MAGIC; + vb->vb_version = VDEV_BOOT_VERSION; + vb->vb_offset = VDEV_BOOT_OFFSET; + vb->vb_size = VDEV_BOOT_SIZE; + + /* + * Initialize uberblock template. + */ + ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + *ub = spa->spa_uberblock; + ub->ub_txg = 0; + + /* + * Write everything in parallel. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (int l = 0; l < VDEV_LABELS; l++) { + + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL, flags); + + vdev_label_write(zio, vd, l, vb, + offsetof(vdev_label_t, vl_boot_header), + sizeof (vdev_boot_header_t), NULL, NULL, flags); + + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_write(zio, vd, l, ub, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); + } + } + + error = zio_wait(zio); + + nvlist_free(label); + zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); + zio_buf_free(vb, sizeof (vdev_boot_header_t)); + zio_buf_free(vp, sizeof (vdev_phys_t)); + + /* + * If this vdev hasn't been previously identified as a spare, then we + * mark it as such only if a) we are labeling it as a spare, or b) it + * exists as a spare elsewhere in the system. Do the same for + * level 2 ARC devices. + */ + if (error == 0 && !vd->vdev_isspare && + (reason == VDEV_LABEL_SPARE || + spa_spare_exists(vd->vdev_guid, NULL, NULL))) + spa_spare_add(vd); + + if (error == 0 && !vd->vdev_isl2cache && + (reason == VDEV_LABEL_L2CACHE || + spa_l2cache_exists(vd->vdev_guid, NULL))) + spa_l2cache_add(vd); + + return (error); +} + +/* + * ========================================================================== + * uberblock load/sync + * ========================================================================== + */ + +/* + * Consider the following situation: txg is safely synced to disk. We've + * written the first uberblock for txg + 1, and then we lose power. When we + * come back up, we fail to see the uberblock for txg + 1 because, say, + * it was on a mirrored device and the replica to which we wrote txg + 1 + * is now offline. If we then make some changes and sync txg + 1, and then + * the missing replica comes back, then for a new seconds we'll have two + * conflicting uberblocks on disk with the same txg. The solution is simple: + * among uberblocks with equal txg, choose the one with the latest timestamp. + */ +static int +vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +{ + if (ub1->ub_txg < ub2->ub_txg) + return (-1); + if (ub1->ub_txg > ub2->ub_txg) + return (1); + + if (ub1->ub_timestamp < ub2->ub_timestamp) + return (-1); + if (ub1->ub_timestamp > ub2->ub_timestamp) + return (1); + + return (0); +} + +static void +vdev_uberblock_load_done(zio_t *zio) +{ + zio_t *rio = zio->io_private; + uberblock_t *ub = zio->io_data; + uberblock_t *ubbest = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + + if (zio->io_error == 0 && uberblock_verify(ub) == 0) { + mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, ubbest) > 0) + *ubbest = *ub; + mutex_exit(&rio->io_lock); + } + + zio_buf_free(zio->io_data, zio->io_size); +} + +void +vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int flags = + ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + + if (vd == rvd) { + ASSERT(zio == NULL); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, ubbest, flags); + bzero(ubbest, sizeof (uberblock_t)); + } + + ASSERT(zio != NULL); + + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_read(zio, vd, l, + zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_load_done, zio, flags); + } + } + } + + if (vd == rvd) { + (void) zio_wait(zio); + spa_config_exit(spa, SCL_ALL, FTAG); + } +} + +/* + * On success, increment root zio's count of good writes. + * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). + */ +static void +vdev_uberblock_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) + atomic_add_64(good_writes, 1); +} + +/* + * Write the uberblock to all labels of all leaves of the specified vdev. + */ +static void +vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) +{ + uberblock_t *ubbuf; + int n; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + + ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + *ubbuf = *ub; + + for (int l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ubbuf, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, zio->io_private, + flags | ZIO_FLAG_DONT_PROPAGATE); + + zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); +} + +int +vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) +{ + spa_t *spa = svd[0]->vdev_spa; + zio_t *zio; + uint64_t good_writes = 0; + + zio = zio_root(spa, NULL, &good_writes, flags); + + for (int v = 0; v < svdcount; v++) + vdev_uberblock_sync(zio, ub, svd[v], flags); + + (void) zio_wait(zio); + + /* + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) + zio_flush(zio, svd[v]); + + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); +} + +/* + * On success, increment the count of good writes for our top-level vdev. + */ +static void +vdev_label_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0) + atomic_add_64(good_writes, 1); +} + +/* + * If there weren't enough good writes, indicate failure to the parent. + */ +static void +vdev_label_sync_top_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (*good_writes == 0) + zio->io_error = EIO; + + kmem_free(good_writes, sizeof (uint64_t)); +} + +/* + * We ignore errors for log and cache devices, simply free the private data. + */ +static void +vdev_label_sync_ignore_done(zio_t *zio) +{ + kmem_free(zio->io_private, sizeof (uint64_t)); +} + +/* + * Write all even or odd labels to all leaves of the specified vdev. + */ +static void +vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) +{ + nvlist_t *label; + vdev_phys_t *vp; + char *buf; + size_t buflen; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + /* + * Generate a label describing the top-level config to which we belong. + */ + label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); + + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + bzero(vp, sizeof (vdev_phys_t)); + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { + for (; l < VDEV_LABELS; l += 2) { + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), + vdev_label_sync_done, zio->io_private, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + } + + zio_buf_free(vp, sizeof (vdev_phys_t)); + nvlist_free(label); +} + +int +vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) +{ + list_t *dl = &spa->spa_config_dirty_list; + vdev_t *vd; + zio_t *zio; + int error; + + /* + * Write the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { + uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), + KM_SLEEP); + zio_t *vio = zio_null(zio, spa, + (vd->vdev_islog || vd->vdev_aux != NULL) ? + vdev_label_sync_ignore_done : vdev_label_sync_top_done, + good_writes, flags); + vdev_label_sync(vio, vd, l, txg, flags); + zio_nowait(vio); + } + + error = zio_wait(zio); + + /* + * Flush the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) + zio_flush(zio, vd); + + (void) zio_wait(zio); + + return (error); +} + +/* + * Sync the uberblock and any changes to the vdev configuration. + * + * The order of operations is carefully crafted to ensure that + * if the system panics or loses power at any time, the state on disk + * is still transactionally consistent. The in-line comments below + * describe the failure semantics at each stage. + * + * Moreover, vdev_config_sync() is designed to be idempotent: if it fails + * at any time, you can just call it again, and it will resume its work. + */ +int +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +{ + spa_t *spa = svd[0]->vdev_spa; + uberblock_t *ub = &spa->spa_uberblock; + vdev_t *vd; + zio_t *zio; + int error; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + + ASSERT(ub->ub_txg <= txg); + + /* + * If this isn't a resync due to I/O errors, + * and nothing changed in this transaction group, + * and the vdev configuration hasn't changed, + * then there's nothing to do. + */ + if (ub->ub_txg < txg && + uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE && + list_is_empty(&spa->spa_config_dirty_list)) + return (0); + + if (txg > spa_freeze_txg(spa)) + return (0); + + ASSERT(txg <= spa->spa_final_txg); + + /* + * Flush the write cache of every disk that's been written to + * in this transaction group. This ensures that all blocks + * written in this txg will be committed to stable storage + * before any uberblock that references them. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) + zio_flush(zio, vd); + + (void) zio_wait(zio); + + /* + * Sync out the even labels (L0, L2) for every dirty vdev. If the + * system dies in the middle of this process, that's OK: all of the + * even labels that made it to disk will be newer than any uberblock, + * and will therefore be considered invalid. The odd labels (L1, L3), + * which have not yet been touched, will still be valid. We flush + * the new labels to disk to ensure that all even-label updates + * are committed to stable storage before the uberblock update. + */ + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) + return (error); + + /* + * Sync the uberblocks to all vdevs in svd[]. + * If the system dies in the middle of this step, there are two cases + * to consider, and the on-disk state is consistent either way: + * + * (1) If none of the new uberblocks made it to disk, then the + * previous uberblock will be the newest, and the odd labels + * (which had not yet been touched) will be valid with respect + * to that uberblock. + * + * (2) If one or more new uberblocks made it to disk, then they + * will be the newest, and the even labels (which had all + * been successfully committed) will be valid with respect + * to the new uberblocks. + */ + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) + return (error); + + /* + * Sync out odd labels for every dirty vdev. If the system dies + * in the middle of this process, the even labels and the new + * uberblocks will suffice to open the pool. The next time + * the pool is opened, the first thing we'll do -- before any + * user data is modified -- is mark every vdev dirty so that + * all labels will be brought up to date. We flush the new labels + * to disk to ensure that all odd-label updates are committed to + * stable storage before the next transaction group begins. + */ + return (vdev_label_sync_list(spa, 1, txg, flags)); +} diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c new file mode 100644 index 000000000..c4629ff45 --- /dev/null +++ b/module/zfs/vdev_mirror.c @@ -0,0 +1,480 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for mirroring. + */ + +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; +} mirror_child_t; + +typedef struct mirror_map { + int mm_children; + int mm_replacing; + int mm_preferred; + int mm_root; + mirror_child_t mm_child[1]; +} mirror_map_t; + +int vdev_mirror_shift = 21; + +static void +vdev_mirror_map_free(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + + kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); +} + +static mirror_map_t * +vdev_mirror_map_alloc(zio_t *zio) +{ + mirror_map_t *mm = NULL; + mirror_child_t *mc; + vdev_t *vd = zio->io_vd; + int c, d; + + if (vd == NULL) { + dva_t *dva = zio->io_bp->blk_dva; + spa_t *spa = zio->io_spa; + + c = BP_GET_NDVAS(zio->io_bp); + + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm->mm_children = c; + mm->mm_replacing = B_FALSE; + mm->mm_preferred = spa_get_random(c); + mm->mm_root = B_TRUE; + + /* + * Check the other, lower-index DVAs to see if they're on + * the same vdev as the child we picked. If they are, use + * them since they are likely to have been allocated from + * the primary metaslab in use at the time, and hence are + * more likely to have locality with single-copy data. + */ + for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { + if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) + mm->mm_preferred = d; + } + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); + mc->mc_offset = DVA_GET_OFFSET(&dva[c]); + } + } else { + c = vd->vdev_children; + + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm->mm_children = c; + mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + mm->mm_preferred = mm->mm_replacing ? 0 : + (zio->io_offset >> vdev_mirror_shift) % c; + mm->mm_root = B_FALSE; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vd->vdev_child[c]; + mc->mc_offset = zio->io_offset; + } + } + + zio->io_vsd = mm; + zio->io_vsd_free = vdev_mirror_map_free; + return (mm); +} + +static int +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + uint64_t c; + int numerrors = 0; + int ret, lasterror = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((ret = vdev_open(cvd)) != 0) { + lasterror = ret; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + if (numerrors == vd->vdev_children) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_mirror_close(vdev_t *vd) +{ + uint64_t c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_mirror_child_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +static void +vdev_mirror_scrub_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + if (zio->io_error == 0) { + zio_t *pio = zio->io_parent; + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + + zio_buf_free(zio->io_data, zio->io_size); + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +/* + * Try to find a child whose DTL doesn't contain the block we want to read. + * If we can't, try the read on any vdev we haven't already tried. + */ +static int +vdev_mirror_child_select(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + mirror_child_t *mc; + uint64_t txg = zio->io_txg; + int i, c; + + ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + + /* + * Try to find a child whose DTL doesn't contain the block to read. + * If a child is known to be completely inaccessible (indicated by + * vdev_readable() returning B_FALSE), don't even try. + */ + for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { + if (c >= mm->mm_children) + c = 0; + mc = &mm->mm_child[c]; + if (mc->mc_tried || mc->mc_skipped) + continue; + if (!vdev_readable(mc->mc_vd)) { + mc->mc_error = ENXIO; + mc->mc_tried = 1; /* don't even try */ + mc->mc_skipped = 1; + continue; + } + if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + return (c); + mc->mc_error = ESTALE; + mc->mc_skipped = 1; + mc->mc_speculative = 1; + } + + /* + * Every device is either missing or has this txg in its DTL. + * Look for any child we haven't already tried before giving up. + */ + for (c = 0; c < mm->mm_children; c++) + if (!mm->mm_child[c].mc_tried) + return (c); + + /* + * Every child failed. There's no place left to look. + */ + return (-1); +} + +static int +vdev_mirror_io_start(zio_t *zio) +{ + mirror_map_t *mm; + mirror_child_t *mc; + int c, children; + + mm = vdev_mirror_map_alloc(zio); + + if (zio->io_type == ZIO_TYPE_READ) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { + /* + * For scrubbing reads we need to allocate a read + * buffer for each child and issue reads to all + * children. If any child succeeds, it will copy its + * data into zio->io_data in vdev_mirror_scrub_done. + */ + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio_buf_alloc(zio->io_size), zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_mirror_scrub_done, mc)); + } + return (ZIO_PIPELINE_CONTINUE); + } + /* + * For normal reads just pick one child. + */ + c = vdev_mirror_child_select(zio); + children = (c >= 0); + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * If this is a resilvering I/O to a replacing vdev, + * only the last child should be written -- unless the + * first child happens to have a DTL entry here as well. + * All other writes go to all children. + */ + if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && + !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, + zio->io_txg, 1)) { + c = mm->mm_children - 1; + children = 1; + } else { + c = 0; + children = mm->mm_children; + } + } + + while (children--) { + mc = &mm->mm_child[c]; + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_mirror_child_done, mc)); + c++; + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +vdev_mirror_worst_error(mirror_map_t *mm) +{ + int error[2] = { 0, 0 }; + + for (int c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + int s = mc->mc_speculative; + error[s] = zio_worst_error(error[s], mc->mc_error); + } + + return (error[0] ? error[0] : error[1]); +} + +static void +vdev_mirror_io_done(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + mirror_child_t *mc; + int c; + int good_copies = 0; + int unexpected_errors = 0; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mc->mc_error) { + if (!mc->mc_skipped) + unexpected_errors++; + } else if (mc->mc_tried) { + good_copies++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as success. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + /* XXPOLICY */ + if (good_copies != mm->mm_children) { + /* + * Always require at least one good copy. + * + * For ditto blocks (io_vd == NULL), require + * all copies to be good. + * + * XXX -- for replacing vdevs, there's no great answer. + * If the old device is really dead, we may not even + * be able to access it -- so we only want to + * require good writes to the new device. But if + * the new device turns out to be flaky, we want + * to be able to detach it -- which requires all + * writes to the old device to have succeeded. + */ + if (good_copies == 0 || zio->io_vd == NULL) + zio->io_error = vdev_mirror_worst_error(mm); + } + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * If we don't have a good copy yet, keep trying other children. + */ + /* XXPOLICY */ + if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { + ASSERT(c >= 0 && c < mm->mm_children); + mc = &mm->mm_child[c]; + zio_vdev_io_redone(zio); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + ZIO_TYPE_READ, zio->io_priority, 0, + vdev_mirror_child_done, mc)); + return; + } + + /* XXPOLICY */ + if (good_copies == 0) { + zio->io_error = vdev_mirror_worst_error(mm); + ASSERT(zio->io_error != 0); + } + + if (good_copies && (spa_mode & FWRITE) && + (unexpected_errors || + (zio->io_flags & ZIO_FLAG_RESILVER) || + ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < mm->mm_children; c++) { + /* + * Don't rewrite known good children. + * Not only is it unnecessary, it could + * actually be harmful: if the system lost + * power while rewriting the only good copy, + * there would be no good copies left! + */ + mc = &mm->mm_child[c]; + + if (mc->mc_error == 0) { + if (mc->mc_tried) + continue; + if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + zio->io_txg, 1)) + continue; + mc->mc_error = ESTALE; + } + + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio->io_data, zio->io_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR, NULL, NULL)); + } + } +} + +static void +vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted == vd->vdev_children) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_mirror_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_MIRROR, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_replacing_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_REPLACING, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_spare_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_SPARE, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c new file mode 100644 index 000000000..731f7d3dc --- /dev/null +++ b/module/zfs/vdev_missing.c @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * The 'missing' vdev is a special vdev type used only during import. It + * signifies a placeholder in the root vdev for some vdev that we know is + * missing. We pass it down to the kernel to allow the rest of the + * configuration to parsed and an attempt made to open all available devices. + * Because its GUID is always 0, we know that the guid sum will mismatch and we + * won't be able to open the pool anyway. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> + +/* ARGSUSED */ +static int +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + /* + * Really this should just fail. But then the root vdev will be in the + * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is + * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we + * will fail the GUID sum check before ever trying to open the pool. + */ + *psize = SPA_MINDEVSIZE; + *ashift = SPA_MINBLOCKSHIFT; + return (0); +} + +/* ARGSUSED */ +static void +vdev_missing_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static int +vdev_missing_io_start(zio_t *zio) +{ + zio->io_error = ENOTSUP; + return (ZIO_PIPELINE_CONTINUE); +} + +/* ARGSUSED */ +static void +vdev_missing_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_missing_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + VDEV_TYPE_MISSING, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c new file mode 100644 index 000000000..46fca0e3b --- /dev/null +++ b/module/zfs/vdev_queue.c @@ -0,0 +1,308 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/avl.h> + +/* + * These tunables are for performance analysis. + */ +/* + * zfs_vdev_max_pending is the maximum number of i/os concurrently + * pending to each device. zfs_vdev_min_pending is the initial number + * of i/os pending to each device (before it starts ramping up to + * max_pending). + */ +int zfs_vdev_max_pending = 35; +int zfs_vdev_min_pending = 4; + +/* deadline = pri + (lbolt >> time_shift) */ +int zfs_vdev_time_shift = 6; + +/* exponential I/O issue ramp-up rate */ +int zfs_vdev_ramp_rate = 2; + +/* + * i/os will be aggregated into a single large i/o up to + * zfs_vdev_aggregation_limit bytes long. + */ +int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; + +/* + * Virtual device vector for disk I/O scheduling. + */ +int +vdev_queue_deadline_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_deadline < z2->io_deadline) + return (-1); + if (z1->io_deadline > z2->io_deadline) + return (1); + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +int +vdev_queue_offset_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +void +vdev_queue_init(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, + sizeof (zio_t), offsetof(struct zio, io_deadline_node)); + + avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); +} + +void +vdev_queue_fini(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + avl_destroy(&vq->vq_deadline_tree); + avl_destroy(&vq->vq_read_tree); + avl_destroy(&vq->vq_write_tree); + avl_destroy(&vq->vq_pending_tree); + + mutex_destroy(&vq->vq_lock); +} + +static void +vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) +{ + avl_add(&vq->vq_deadline_tree, zio); + avl_add(zio->io_vdev_tree, zio); +} + +static void +vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) +{ + avl_remove(&vq->vq_deadline_tree, zio); + avl_remove(zio->io_vdev_tree, zio); +} + +static void +vdev_queue_agg_io_done(zio_t *aio) +{ + zio_t *dio; + uint64_t offset = 0; + + while ((dio = aio->io_delegate_list) != NULL) { + if (aio->io_type == ZIO_TYPE_READ) + bcopy((char *)aio->io_data + offset, dio->io_data, + dio->io_size); + offset += dio->io_size; + aio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + dio->io_error = aio->io_error; + zio_execute(dio); + } + ASSERT3U(offset, ==, aio->io_size); + + zio_buf_free(aio->io_data, aio->io_size); +} + +#define IS_ADJACENT(io, nio) \ + ((io)->io_offset + (io)->io_size == (nio)->io_offset) + +static zio_t * +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) +{ + zio_t *fio, *lio, *aio, *dio; + avl_tree_t *tree; + uint64_t size; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || + avl_numnodes(&vq->vq_deadline_tree) == 0) + return (NULL); + + fio = lio = avl_first(&vq->vq_deadline_tree); + + tree = fio->io_vdev_tree; + size = fio->io_size; + + while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && + !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && + size + dio->io_size <= zfs_vdev_aggregation_limit) { + dio->io_delegate_next = fio; + fio = dio; + size += dio->io_size; + } + + while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && + !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && + size + dio->io_size <= zfs_vdev_aggregation_limit) { + lio->io_delegate_next = dio; + lio = dio; + size += dio->io_size; + } + + if (fio != lio) { + char *buf = zio_buf_alloc(size); + uint64_t offset = 0; + + ASSERT(size <= zfs_vdev_aggregation_limit); + + aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, + buf, size, fio->io_type, ZIO_PRIORITY_NOW, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + vdev_queue_agg_io_done, NULL); + + aio->io_delegate_list = fio; + + for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + ASSERT(dio->io_type == aio->io_type); + ASSERT(dio->io_vdev_tree == tree); + if (dio->io_type == ZIO_TYPE_WRITE) + bcopy(dio->io_data, buf + offset, dio->io_size); + offset += dio->io_size; + vdev_queue_io_remove(vq, dio); + zio_vdev_io_bypass(dio); + } + + ASSERT(offset == size); + + avl_add(&vq->vq_pending_tree, aio); + + return (aio); + } + + ASSERT(fio->io_vdev_tree == tree); + vdev_queue_io_remove(vq, fio); + + avl_add(&vq->vq_pending_tree, fio); + + return (fio); +} + +zio_t * +vdev_queue_io(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) + return (zio); + + zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vdev_tree = &vq->vq_read_tree; + else + zio->io_vdev_tree = &vq->vq_write_tree; + + mutex_enter(&vq->vq_lock); + + zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; + + vdev_queue_io_add(vq, zio); + + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); + + mutex_exit(&vq->vq_lock); + + if (nio == NULL) + return (NULL); + + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + return (NULL); + } + + return (nio); +} + +void +vdev_queue_io_done(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + + avl_remove(&vq->vq_pending_tree, zio); + + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { + zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); + if (nio == NULL) + break; + mutex_exit(&vq->vq_lock); + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + } else { + zio_vdev_io_reissue(nio); + zio_execute(nio); + } + mutex_enter(&vq->vq_lock); + } + + mutex_exit(&vq->vq_lock); +} diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c new file mode 100644 index 000000000..69e314468 --- /dev/null +++ b/module/zfs/vdev_raidz.c @@ -0,0 +1,1209 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> + +/* + * Virtual device vector for RAID-Z. + * + * This vdev supports both single and double parity. For single parity, we + * use a simple XOR of all the data columns. For double parity, we use both + * the simple XOR as well as a technique described in "The mathematics of + * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), + * over the integers expressable in a single byte. Briefly, the operations on + * the field are defined as follows: + * + * o addition (+) is represented by a bitwise XOR + * o subtraction (-) is therefore identical to addition: A + B = A - B + * o multiplication of A by 2 is defined by the following bitwise expression: + * (A * 2)_7 = A_6 + * (A * 2)_6 = A_5 + * (A * 2)_5 = A_4 + * (A * 2)_4 = A_3 + A_7 + * (A * 2)_3 = A_2 + A_7 + * (A * 2)_2 = A_1 + A_7 + * (A * 2)_1 = A_0 + * (A * 2)_0 = A_7 + * + * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * + * Observe that any number in the field (except for 0) can be expressed as a + * power of 2 -- a generator for the field. We store a table of the powers of + * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can + * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather + * than field addition). The inverse of a field element A (A^-1) is A^254. + * + * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, + * can be expressed by field operations: + * + * P = D_0 + D_1 + ... + D_n-2 + D_n-1 + * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 + * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * + * See the reconstruction code below for how P and Q can used individually or + * in concert to recover missing data columns. + */ + +typedef struct raidz_col { + uint64_t rc_devidx; /* child device index for I/O */ + uint64_t rc_offset; /* device offset */ + uint64_t rc_size; /* I/O size */ + void *rc_data; /* I/O data */ + int rc_error; /* I/O error for this device */ + uint8_t rc_tried; /* Did we attempt this I/O column? */ + uint8_t rc_skipped; /* Did we skip this I/O column? */ +} raidz_col_t; + +typedef struct raidz_map { + uint64_t rm_cols; /* Column count */ + uint64_t rm_bigcols; /* Number of oversized columns */ + uint64_t rm_asize; /* Actual total I/O size */ + uint64_t rm_missingdata; /* Count of missing data devices */ + uint64_t rm_missingparity; /* Count of missing parity devices */ + uint64_t rm_firstdatacol; /* First data column/parity count */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ +} raidz_map_t; + +#define VDEV_RAIDZ_P 0 +#define VDEV_RAIDZ_Q 1 + +#define VDEV_RAIDZ_MAXPARITY 2 + +#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) + +/* + * These two tables represent powers and logs of 2 in the Galois field defined + * above. These values were computed by repeatedly multiplying by 2 as above. + */ +static const uint8_t vdev_raidz_pow2[256] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; +static const uint8_t vdev_raidz_log2[256] = { + 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, +}; + +/* + * Multiply a given number by 2 raised to the given power. + */ +static uint8_t +vdev_raidz_exp2(uint_t a, int exp) +{ + if (a == 0) + return (0); + + ASSERT(exp >= 0); + ASSERT(vdev_raidz_log2[a] > 0 || a == 1); + + exp += vdev_raidz_log2[a]; + if (exp > 255) + exp -= 255; + + return (vdev_raidz_pow2[exp]); +} + +static void +vdev_raidz_map_free(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) + zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); +} + +static raidz_map_t * +vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, + uint64_t nparity) +{ + raidz_map_t *rm; + uint64_t b = zio->io_offset >> unit_shift; + uint64_t s = zio->io_size >> unit_shift; + uint64_t f = b % dcols; + uint64_t o = (b / dcols) << unit_shift; + uint64_t q, r, c, bc, col, acols, coff, devidx; + + q = s / (dcols - nparity); + r = s - q * (dcols - nparity); + bc = (r == 0 ? 0 : r + nparity); + + acols = (q == 0 ? bc : dcols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + + rm->rm_cols = acols; + rm->rm_bigcols = bc; + rm->rm_asize = 0; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + + for (c = 0; c < acols; c++) { + col = f + c; + coff = o; + if (col >= dcols) { + col -= dcols; + coff += 1ULL << unit_shift; + } + rm->rm_col[c].rc_devidx = col; + rm->rm_col[c].rc_offset = coff; + rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; + rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + rm->rm_asize += rm->rm_col[c].rc_size; + } + + rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + + rm->rm_col[c].rc_data = zio->io_data; + + for (c = c + 1; c < acols; c++) + rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + */ + ASSERT(rm->rm_cols >= 2); + ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + + if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rm->rm_col[0].rc_devidx; + o = rm->rm_col[0].rc_offset; + rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; + rm->rm_col[1].rc_devidx = devidx; + rm->rm_col[1].rc_offset = o; + } + + zio->io_vsd = rm; + zio->io_vsd_free = vdev_raidz_map_free; + return (rm); +} + +static void +vdev_raidz_generate_parity_p(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + for (i = 0; i < ccount; i++, p++, src++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + for (i = 0; i < ccount; i++, p++, src++) { + *p ^= *src; + } + } + } +} + +static void +vdev_raidz_generate_parity_pq(raidz_map_t *rm) +{ + uint64_t *q, *p, *src, pcount, ccount, mask, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount || ccount == 0); + for (i = 0; i < ccount; i++, p++, q++, src++) { + *q = *src; + *p = *src; + } + for (; i < pcount; i++, p++, q++, src++) { + *q = 0; + *p = 0; + } + } else { + ASSERT(ccount <= pcount); + + /* + * Rather than multiplying each byte individually (as + * described above), we are able to handle 8 at once + * by generating a mask based on the high bit in each + * byte and using that to conditionally XOR in 0x1d. + */ + for (i = 0; i < ccount; i++, p++, q++, src++) { + mask = *q & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + *q ^= *src; + *p ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + */ + for (; i < pcount; i++, q++) { + mask = *q & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + } + } + } +} + +static void +vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +{ + uint64_t *dst, *src, xcount, ccount, count, i; + int c; + + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); + ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); + ASSERT(xcount > 0); + + src = rm->rm_col[VDEV_RAIDZ_P].rc_data; + dst = rm->rm_col[x].rc_data; + for (i = 0; i < xcount; i++, dst++, src++) { + *dst = *src; + } + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + dst = rm->rm_col[x].rc_data; + + if (c == x) + continue; + + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + count = MIN(ccount, xcount); + + for (i = 0; i < count; i++, dst++, src++) { + *dst ^= *src; + } + } +} + +static void +vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +{ + uint64_t *dst, *src, xcount, ccount, count, mask, i; + uint8_t *b; + int c, j, exp; + + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); + ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + dst = rm->rm_col[x].rc_data; + + if (c == x) + ccount = 0; + else + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + count = MIN(ccount, xcount); + + if (c == rm->rm_firstdatacol) { + for (i = 0; i < count; i++, dst++, src++) { + *dst = *src; + } + for (; i < xcount; i++, dst++) { + *dst = 0; + } + + } else { + /* + * For an explanation of this, see the comment in + * vdev_raidz_generate_parity_pq() above. + */ + for (i = 0; i < count; i++, dst++, src++) { + mask = *dst & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + *dst ^= *src; + } + + for (; i < xcount; i++, dst++) { + mask = *dst & 0x8080808080808080ULL; + mask = (mask << 1) - (mask >> 7); + *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ + (mask & 0x1d1d1d1d1d1d1d1dULL); + } + } + } + + src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + dst = rm->rm_col[x].rc_data; + exp = 255 - (rm->rm_cols - 1 - x); + + for (i = 0; i < xcount; i++, dst++, src++) { + *dst ^= *src; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, exp); + } + } +} + +static void +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +{ + uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; + void *pdata, *qdata; + uint64_t xsize, ysize, i; + + ASSERT(x < y); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(y < rm->rm_cols); + + ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + + /* + * Move the parity data aside -- we're going to compute parity as + * though columns x and y were full of zeros -- Pxy and Qxy. We want to + * reuse the parity generation mechanism without trashing the actual + * parity so we make those columns appear to be full of zeros by + * setting their lengths to zero. + */ + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + xsize = rm->rm_col[x].rc_size; + ysize = rm->rm_col[y].rc_size; + + rm->rm_col[VDEV_RAIDZ_P].rc_data = + zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); + rm->rm_col[VDEV_RAIDZ_Q].rc_data = + zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + rm->rm_col[x].rc_size = 0; + rm->rm_col[y].rc_size = 0; + + vdev_raidz_generate_parity_pq(rm); + + rm->rm_col[x].rc_size = xsize; + rm->rm_col[y].rc_size = ysize; + + p = pdata; + q = qdata; + pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; + qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + xd = rm->rm_col[x].rc_data; + yd = rm->rm_col[y].rc_data; + + /* + * We now have: + * Pxy = P + D_x + D_y + * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y + * + * We can then solve for D_x: + * D_x = A * (P + Pxy) + B * (Q + Qxy) + * where + * A = 2^(x - y) * (2^(x - y) + 1)^-1 + * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 + * + * With D_x in hand, we can easily solve for D_y: + * D_y = P + Pxy + D_x + */ + + a = vdev_raidz_pow2[255 + x - y]; + b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + tmp = 255 - vdev_raidz_log2[a ^ 1]; + + aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; + bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; + + for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ + vdev_raidz_exp2(*q ^ *qxy, bexp); + + if (i < ysize) + *yd = *p ^ *pxy ^ *xd; + } + + zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, + rm->rm_col[VDEV_RAIDZ_P].rc_size); + zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + /* + * Restore the saved parity data. + */ + rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; +} + + +static int +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + uint64_t nparity = vd->vdev_nparity; + int c, error; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((error = vdev_open(cvd)) != 0) { + lasterror = error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + *asize *= vd->vdev_children; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_raidz_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + + asize = ((psize - 1) >> ashift) + 1; + asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +static void +vdev_raidz_child_done(zio_t *zio) +{ + raidz_col_t *rc = zio->io_private; + + rc->rc_error = zio->io_error; + rc->rc_tried = 1; + rc->rc_skipped = 0; +} + +static int +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + blkptr_t *bp = zio->io_bp; + raidz_map_t *rm; + raidz_col_t *rc; + int c; + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * Generate RAID parity in the first virtual columns. + */ + if (rm->rm_firstdatacol == 1) + vdev_raidz_generate_parity_p(rm); + else + vdev_raidz_generate_parity_pq(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity + * data. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_readable(cvd)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = ENXIO; + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = ESTALE; + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & ZIO_FLAG_SCRUB)) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * Report a checksum error for a child of a RAID-Z device. + */ +static void +raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +{ + vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); +} + +/* + * Generate the parity from the data columns. If we tried and were able to + * read the parity without error, verify that the generated parity matches the + * data we read. If it doesn't, we fire off a checksum error. Return the + * number such failures. + */ +static int +raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +{ + void *orig[VDEV_RAIDZ_MAXPARITY]; + int c, ret = 0; + raidz_col_t *rc; + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + orig[c] = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig[c], rc->rc_size); + } + + if (rm->rm_firstdatacol == 1) + vdev_raidz_generate_parity_p(rm); + else + vdev_raidz_generate_parity_pq(rm); + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + ret++; + } + zio_buf_free(orig[c], rc->rc_size); + } + + return (ret); +} + +static uint64_t raidz_corrected_p; +static uint64_t raidz_corrected_q; +static uint64_t raidz_corrected_pq; + +static int +vdev_raidz_worst_error(raidz_map_t *rm) +{ + int error = 0; + + for (int c = 0; c < rm->rm_cols; c++) + error = zio_worst_error(error, rm->rm_col[c].rc_error); + + return (error); +} + +static void +vdev_raidz_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc, *rc1; + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int total_errors = 0; + int n, c, c1; + + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + + ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); + ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rm->rm_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + + total_errors++; + } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + /* XXPOLICY */ + if (total_errors > rm->rm_firstdatacol) + zio->io_error = vdev_raidz_worst_error(rm); + + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * There are three potential phases for a read: + * 1. produce valid data from the columns read + * 2. read all disks and try again + * 3. perform combinatorial reconstruction + * + * Each phase is progressively both more expensive and less likely to + * occur. If we encounter more errors than we can repair or all phases + * fail, we have no choice but to return an error. + */ + + /* + * If the number of errors we saw was correctable -- less than or equal + * to the number of parity disks read -- attempt to produce data that + * has a valid checksum. Naturally, this case applies in the absence of + * any errors. + */ + if (total_errors <= rm->rm_firstdatacol - parity_untried) { + switch (data_errors) { + case 0: + if (zio_checksum_error(zio) == 0) { + /* + * If we read parity information (unnecessarily + * as it happens since no reconstruction was + * needed) regenerate and verify the parity. + * We also regenerate parity when resilvering + * so we can write it out to the failed device + * later. + */ + if (parity_errors + parity_untried < + rm->rm_firstdatacol || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + goto done; + } + break; + + case 1: + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rm->rm_firstdatacol); + + /* + * Find the column that reported the error. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) + break; + } + ASSERT(c != rm->rm_cols); + ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || + rc->rc_error == ESTALE); + + if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { + vdev_raidz_reconstruct_p(rm, c); + } else { + ASSERT(rm->rm_firstdatacol > 1); + vdev_raidz_reconstruct_q(rm, c); + } + + if (zio_checksum_error(zio) == 0) { + if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) + atomic_inc_64(&raidz_corrected_p); + else + atomic_inc_64(&raidz_corrected_q); + + /* + * If there's more than one parity disk that + * was successfully read, confirm that the + * other parity disk produced the correct data. + * This routine is suboptimal in that it + * regenerates both the parity we wish to test + * as well as the parity we just used to + * perform the reconstruction, but this should + * be a relatively uncommon case, and can be + * optimized if it becomes a problem. + * We also regenerate parity when resilvering + * so we can write it out to the failed device + * later. + */ + if (parity_errors < rm->rm_firstdatacol - 1 || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + + goto done; + } + break; + + case 2: + /* + * Two data column errors require double parity. + */ + ASSERT(rm->rm_firstdatacol == 2); + + /* + * Find the two columns that reported errors. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) + break; + } + ASSERT(c != rm->rm_cols); + ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || + rc->rc_error == ESTALE); + + for (c1 = c++; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) + break; + } + ASSERT(c != rm->rm_cols); + ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || + rc->rc_error == ESTALE); + + vdev_raidz_reconstruct_pq(rm, c1, c); + + if (zio_checksum_error(zio) == 0) { + atomic_inc_64(&raidz_corrected_pq); + goto done; + } + break; + + default: + ASSERT(rm->rm_firstdatacol <= 2); + ASSERT(0); + } + } + + /* + * This isn't a typical situation -- either we got a read error or + * a child silently returned bad data. Read every block so we can + * try again with as much data and parity as we can track down. If + * we've already been through once before, all children will be marked + * as tried so we'll proceed to combinatorial reconstruction. + */ + unexpected_errors = 1; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + + for (c = 0; c < rm->rm_cols; c++) { + if (rm->rm_col[c].rc_tried) + continue; + + zio_vdev_io_redone(zio); + do { + rc = &rm->rm_col[c]; + if (rc->rc_tried) + continue; + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } while (++c < rm->rm_cols); + + return; + } + + /* + * At this point we've attempted to reconstruct the data given the + * errors we detected, and we've attempted to read all columns. There + * must, therefore, be one or more additional problems -- silent errors + * resulting in invalid data rather than explicit I/O errors resulting + * in absent data. Before we attempt combinatorial reconstruction make + * sure we have a chance of coming up with the right answer. + */ + if (total_errors >= rm->rm_firstdatacol) { + zio->io_error = vdev_raidz_worst_error(rm); + /* + * If there were exactly as many device errors as parity + * columns, yet we couldn't reconstruct the data, then at + * least one device must have returned bad data silently. + */ + if (total_errors == rm->rm_firstdatacol) + zio->io_error = zio_worst_error(zio->io_error, ECKSUM); + goto done; + } + + if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { + /* + * Attempt to reconstruct the data from parity P. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + void *orig; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + vdev_raidz_reconstruct_p(rm, c); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + atomic_inc_64(&raidz_corrected_p); + + /* + * If this child didn't know that it returned + * bad data, inform it. + */ + if (rc->rc_tried && rc->rc_error == 0) + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + goto done; + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + } + + if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + /* + * Attempt to reconstruct the data from parity Q. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + void *orig; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + vdev_raidz_reconstruct_q(rm, c); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + atomic_inc_64(&raidz_corrected_q); + + /* + * If this child didn't know that it returned + * bad data, inform it. + */ + if (rc->rc_tried && rc->rc_error == 0) + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + goto done; + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + } + + if (rm->rm_firstdatacol > 1 && + rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && + rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + /* + * Attempt to reconstruct the data from both P and Q. + */ + for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { + void *orig, *orig1; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + + for (c1 = c + 1; c1 < rm->rm_cols; c1++) { + rc1 = &rm->rm_col[c1]; + + orig1 = zio_buf_alloc(rc1->rc_size); + bcopy(rc1->rc_data, orig1, rc1->rc_size); + + vdev_raidz_reconstruct_pq(rm, c, c1); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + zio_buf_free(orig1, rc1->rc_size); + atomic_inc_64(&raidz_corrected_pq); + + /* + * If these children didn't know they + * returned bad data, inform them. + */ + if (rc->rc_tried && rc->rc_error == 0) + raidz_checksum_error(zio, rc); + if (rc1->rc_tried && rc1->rc_error == 0) + raidz_checksum_error(zio, rc1); + + rc->rc_error = ECKSUM; + rc1->rc_error = ECKSUM; + + goto done; + } + + bcopy(orig1, rc1->rc_data, rc1->rc_size); + zio_buf_free(orig1, rc1->rc_size); + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + } + + /* + * All combinations failed to checksum. Generate checksum ereports for + * all children. + */ + zio->io_error = ECKSUM; + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, + rc->rc_offset, rc->rc_size); + } + } + +done: + zio_checksum_verified(zio); + + if (zio->io_error == 0 && (spa_mode & FWRITE) && + (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR, NULL, NULL)); + } + } +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_raidz_ops = { + vdev_raidz_open, + vdev_raidz_close, + vdev_raidz_asize, + vdev_raidz_io_start, + vdev_raidz_io_done, + vdev_raidz_state_change, + VDEV_TYPE_RAIDZ, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c new file mode 100644 index 000000000..88383f002 --- /dev/null +++ b/module/zfs/vdev_root.c @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for the pool's root vdev. + */ + +/* + * We should be able to tolerate one failure with absolutely no damage + * to our metadata. Two failures will take out space maps, a bunch of + * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy + * place to live. When we get smarter, we can liberalize this policy. + * e.g. If we haven't lost two consecutive top-level vdevs, then we are + * probably fine. Adding bean counters during alloc/free can make this + * future guesswork more accurate. + */ +static int +too_many_errors(vdev_t *vd, int numerrors) +{ + ASSERT3U(numerrors, <=, vd->vdev_children); + return (numerrors > 0); +} + +static int +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + int c; + int lasterror = 0; + int numerrors = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + int error; + + if ((error = vdev_open(cvd)) != 0 && + !cvd->vdev_islog) { + lasterror = error; + numerrors++; + continue; + } + } + + if (too_many_errors(vd, numerrors)) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + *asize = 0; + *ashift = 0; + + return (0); +} + +static void +vdev_root_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_root_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (too_many_errors(vd, faulted)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + } else if (degraded) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } +} + +vdev_ops_t vdev_root_ops = { + vdev_root_open, + vdev_root_close, + vdev_default_asize, + NULL, /* io_start - not applicable to the root */ + NULL, /* io_done - not applicable to the root */ + vdev_root_state_change, + VDEV_TYPE_ROOT, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/module/zfs/zap.c b/module/zfs/zap.c new file mode 100644 index 000000000..ca859ec35 --- /dev/null +++ b/module/zfs/zap.c @@ -0,0 +1,1136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * This file contains the top half of the zfs directory structure + * implementation. The bottom half is in zap_leaf.c. + * + * The zdir is an extendable hash data structure. There is a table of + * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are + * each a constant size and hold a variable number of directory entries. + * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. + * + * The pointer table holds a power of 2 number of pointers. + * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to + * by the pointer at index i in the table holds entries whose hash value + * has a zd_prefix_len - bit prefix + */ + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> + +int fzap_default_block_shift = 14; /* 16k blocksize */ + +static void zap_leaf_pageout(dmu_buf_t *db, void *vl); +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); + + +void +fzap_byteswap(void *vbuf, size_t size) +{ + uint64_t block_type; + + block_type = *(uint64_t *)vbuf; + + if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) + zap_leaf_byteswap(vbuf, size); + else { + /* it's a ptrtbl block */ + byteswap_uint64_array(vbuf, size); + } +} + +void +fzap_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + dmu_buf_t *db; + zap_leaf_t *l; + int i; + zap_phys_t *zp; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + zap->zap_ismicro = FALSE; + + (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, + &zap->zap_f.zap_phys, zap_evict); + + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); + zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; + + zp = zap->zap_f.zap_phys; + /* + * explicitly zero it since it might be coming from an + * initialized microzap + */ + bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + zp->zap_block_type = ZBT_HEADER; + zp->zap_magic = ZAP_MAGIC; + + zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); + + zp->zap_freeblk = 2; /* block 1 will be the first leaf */ + zp->zap_num_leafs = 1; + zp->zap_num_entries = 0; + zp->zap_salt = zap->zap_salt; + zp->zap_normflags = zap->zap_normflags; + + /* block 1 will be the first leaf */ + for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) + ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; + + /* + * set up block 1 - the first leaf + */ + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_dbuf = db; + l->l_phys = db->db_data; + + zap_leaf_init(l, zp->zap_normflags != 0); + + kmem_free(l, sizeof (zap_leaf_t)); + dmu_buf_rele(db, FTAG); +} + +static int +zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + return (1); + if (rw_tryupgrade(&zap->zap_rwlock)) { + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + return (0); +} + +/* + * Generic routines for dealing with the pointer & cookie tables. + */ + +static int +zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, + void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), + dmu_tx_t *tx) +{ + uint64_t b, newblk; + dmu_buf_t *db_old, *db_new; + int err; + int bs = FZAP_BLOCK_SHIFT(zap); + int hepb = 1<<(bs-4); + /* hepb = half the number of entries in a block */ + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + ASSERT(tbl->zt_numblks > 0); + + if (tbl->zt_nextblk != 0) { + newblk = tbl->zt_nextblk; + } else { + newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); + tbl->zt_nextblk = newblk; + ASSERT3U(tbl->zt_blks_copied, ==, 0); + dmu_prefetch(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs); + } + + /* + * Copy the ptrtbl from the old to new location. + */ + + b = tbl->zt_blks_copied; + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << bs, FTAG, &db_old); + if (err) + return (err); + + /* first half of entries in old[b] go to new[2*b+0] */ + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << bs, FTAG, &db_new)); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + /* second half of entries in old[b] go to new[2*b+1] */ + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << bs, FTAG, &db_new)); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + dmu_buf_rele(db_old, FTAG); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + tbl->zt_blks_copied, tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%lluk entries)\n", + tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } + + return (0); +} + +static int +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + int err; + uint64_t blk, off; + int bs = FZAP_BLOCK_SHIFT(zap); + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", val, idx); + + blk = idx >> (bs-3); + off = idx & ((1<<(bs-3))-1); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); + dmu_buf_will_dirty(db, tx); + + if (tbl->zt_nextblk != 0) { + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); + if (err) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); + } + + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); +} + +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) +{ + uint64_t blk, off; + int err; + dmu_buf_t *db; + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + blk = idx >> (bs-3); + off = idx & ((1<<(bs-3))-1); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. + */ + blk = (idx*2) >> (bs-3); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk) << bs, FTAG, &db); + dmu_buf_rele(db, FTAG); + } + return (err); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + int i; + for (i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2*i+0] = lb; + dst[2*i+1] = lb; + } +} + +static int +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + /* In case things go horribly wrong. */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) + return (ENOSPC); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + /* + * We are outgrowing the "embedded" ptrtbl (the one + * stored in the header block). Give it its own entire + * block, which will double the size of the ptrtbl. + */ + uint64_t newblk; + dmu_buf_t *db_new; + int err; + + ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); + + newblk = zap_allocate_blocks(zap, 1); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); + if (err) + return (err); + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + dmu_buf_rele(db_new, FTAG); + + zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << + (FZAP_BLOCK_SHIFT(zap)-3)); + + return (0); + } else { + return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + zap_ptrtbl_transfer, tx)); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); + zap->zap_f.zap_phys->zap_num_entries += delta; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +static uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks) +{ + uint64_t newblk; + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + newblk = zap->zap_f.zap_phys->zap_freeblk; + zap->zap_f.zap_phys->zap_freeblk += nblocks; + return (newblk); +} + +static zap_leaf_t * +zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + void *winner; + zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + rw_init(&l->l_rwlock, 0, 0, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = zap_allocate_blocks(zap, 1); + l->l_dbuf = NULL; + l->l_phys = NULL; + + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); + winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + ASSERT(winner == NULL); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l, zap->zap_normflags != 0); + + zap->zap_f.zap_phys->zap_num_leafs++; + + return (l); +} + +int +fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap->zap_f.zap_phys->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +void +zap_put_leaf(zap_leaf_t *l) +{ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf, NULL); +} + +_NOTE(ARGSUSED(0)) +static void +zap_leaf_pageout(dmu_buf_t *db, void *vl) +{ + zap_leaf_t *l = vl; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + zap_leaf_t *l, *winner; + + ASSERT(blkid != 0); + + l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + rw_init(&l->l_rwlock, 0, 0, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_bs = highbit(db->db_size)-1; + l->l_dbuf = db; + l->l_phys = NULL; + + winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + + rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_pageout(NULL, l); + l = winner; + } + + /* + * lhr_pad was previously used for the next leaf in the leaf + * chain. There should be no chained leafs (as we have removed + * support for them). + */ + ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0); + + /* + * There should be more hash entries than there can be + * chunks to put in the hash table + */ + ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); + + /* The chunks should begin at the end of the hash table */ + ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, + &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + + /* The chunks should end at the end of the block */ + ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - + (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); + + return (l); +} + +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) +{ + dmu_buf_t *db; + zap_leaf_t *l; + int bs = FZAP_BLOCK_SHIFT(zap); + int err; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + blkid << bs, NULL, &db); + if (err) + return (err); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << bs); + ASSERT3U(db->db_size, ==, 1 << bs); + ASSERT(blkid != 0); + + l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise l->l_phys could change, + * causing ASSERT below to fail. + */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); + ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + *lp = l; + return (0); +} + +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); + } else { + return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, valp)); + } +} + +static int +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { + ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); + } else { + return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, blk, tx)); + } +} + +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) +{ + uint64_t idx, blk; + int err; + + ASSERT(zap->zap_dbuf == NULL || + zap->zap_f.zap_phys == zap->zap_dbuf->db_data); + ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); + idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); + + ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == + (*lp)->l_phys->l_hdr.lh_prefix); + return (err); +} + +static int +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) +{ + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; + zap_leaf_t *nl; + int prefix_diff, i, err; + uint64_t sibling; + int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + + ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + l->l_phys->l_hdr.lh_prefix); + + if (zap_tryupgradedir(zap, tx) == 0 || + old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + /* We failed to upgrade, or need to grow the pointer table */ + objset_t *os = zap->zap_objset; + uint64_t object = zap->zap_object; + + zap_put_leaf(l); + zap_unlockdir(zap); + err = zap_lockdir(os, object, tx, RW_WRITER, + FALSE, FALSE, &zn->zn_zap); + zap = zn->zn_zap; + if (err) + return (err); + ASSERT(!zap->zap_ismicro); + + while (old_prefix_len == + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + err = zap_grow_ptrtbl(zap, tx); + if (err) + return (err); + } + + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err) + return (err); + + if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { + /* it split while our locks were down */ + *lp = l; + return (0); + } + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + l->l_phys->l_hdr.lh_prefix); + + prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - + (old_prefix_len + 1); + sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ + for (i = 0; i < (1ULL<<prefix_diff); i++) { + uint64_t blk; + err = zap_idx_to_blk(zap, sibling+i, &blk); + if (err) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + nl = zap_create_leaf(zap, tx); + zap_leaf_split(l, nl, zap->zap_normflags != 0); + + /* set sibling pointers */ + for (i = 0; i < (1ULL<<prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ + } + + if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + *lp = nl; + } else { + zap_put_leaf(nl); + *lp = l; + } + + return (0); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && + l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + + zap_put_leaf(l); + + if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { + int err; + + /* + * We are in the middle of growing the pointer table, or + * this leaf will soon make us grow it. + */ + if (zap_tryupgradedir(zap, tx) == 0) { + objset_t *os = zap->zap_objset; + uint64_t zapobj = zap->zap_object; + + zap_unlockdir(zap); + err = zap_lockdir(os, zapobj, tx, + RW_WRITER, FALSE, FALSE, &zn->zn_zap); + zap = zn->zn_zap; + if (err) + return; + } + + /* could have finished growing while our locks were down */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) + (void) zap_grow_ptrtbl(zap, tx); + } +} + + +static int +fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) +{ + if (name && strlen(name) > ZAP_MAXNAMELEN) + return (E2BIG); + + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (EINVAL); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (E2BIG); + + return (0); +} + +/* + * Routines for manipulating attributes. + */ +int +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + if (err != 0) + return (err); + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + err = zap_entry_read(&zeh, integer_size, num_integers, buf); + (void) zap_entry_read_name(&zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } + + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT(fzap_checksize(zn->zn_name_orij, + integer_size, num_integers) == 0); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + err = EEXIST; + goto out; + } + if (err != ENOENT) + goto out; + + err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + +out: + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + return (err); +} + +int +fzap_add(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + if (err != 0) + return (err); + + return (fzap_add_cd(zn, integer_size, num_integers, + val, ZAP_MAXCD, tx)); +} + +int +fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err, create; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + if (err != 0) + return (err); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + if (create) { + err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, + ZAP_MAXCD, integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + return (err); +} + +int +fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + goto out; + + if (integer_size) + *integer_size = zeh.zeh_integer_size; + if (num_integers) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + zap_increment_num_entries(zn->zn_zap, -1, tx); + } + zap_put_leaf(l); + return (err); +} + +/* + * Helper functions for consumers. + */ + +int +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, + char *name) +{ + zap_cursor_t zc; + zap_attribute_t *za; + int err; + + if (mask == 0) + mask = -1ULL; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, os, zapobj); + (err = zap_cursor_retrieve(&zc, za)) == 0; + zap_cursor_advance(&zc)) { + if ((za->za_first_integer & mask) == (value & mask)) { + (void) strcpy(name, za->za_name); + break; + } + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + return (err); +} + +int +zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + err = zap_add(os, intoobj, za.za_name, + 8, 1, &za.za_first_integer, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int +zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_remove(os, obj, name, tx)); +} + +int +zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_lookup(os, obj, name, 8, 1, &value)); +} + +/* + * Routines for iterating over the attributes. + */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err = ENOENT; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + + if (zc->zc_leaf && + (ZAP_HASH_IDX(zc->zc_hash, + zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != + zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + +again: + if (zc->zc_leaf == NULL) { + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); + } else { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + } + l = zc->zc_leaf; + + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + uint64_t nocare = + (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + goto again; + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(&zeh, + sizeof (za->za_name), za->za_name); + ASSERT(err == 0); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); + } + rw_exit(&zc->zc_leaf->l_rwlock); + return (err); +} + + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + int i, err; + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have lastblk. + */ + for (i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_leaf_stats(zap, l, zs); + zap_put_leaf(l); + } + } +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + zs->zs_blocksize = 1ULL << bs; + + /* + * Set zap_phys_t fields + */ + zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; + zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; + zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; + zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type; + zs->zs_magic = zap->zap_f.zap_phys->zap_magic; + zs->zs_salt = zap->zap_f.zap_phys->zap_salt; + + /* + * Set zap_ptrtbl fields + */ + zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_blks_copied = + zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. */ + zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); + } else { + int b; + + dmu_prefetch(zap->zap_objset, zap->zap_object, + zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); + + for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + int err; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } + } + } +} diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c new file mode 100644 index 000000000..da498b6bc --- /dev/null +++ b/module/zfs/zap_leaf.c @@ -0,0 +1,853 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The 512-byte leaf is broken into 32 16-byte chunks. + * chunk number n means l_chunk[n], even though the header precedes it. + * the names are stored null-terminated. + */ + +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/spa.h> +#include <sys/dmu.h> + +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); + +#define CHAIN_END 0xffff /* end of the chunk chain */ + +/* half the (current) minimum block size */ +#define MAX_ARRAY_BYTES (8<<10) + +#define LEAF_HASH(l, h) \ + ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ + ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) + +#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) + + +static void +zap_memset(void *a, int c, size_t n) +{ + char *cp = a; + char *cpend = cp + n; + + while (cp < cpend) + *cp++ = c; +} + +static void +stv(int len, void *addr, uint64_t value) +{ + switch (len) { + case 1: + *(uint8_t *)addr = value; + return; + case 2: + *(uint16_t *)addr = value; + return; + case 4: + *(uint32_t *)addr = value; + return; + case 8: + *(uint64_t *)addr = value; + return; + } + ASSERT(!"bad int len"); +} + +static uint64_t +ldv(int len, const void *addr) +{ + switch (len) { + case 1: + return (*(uint8_t *)addr); + case 2: + return (*(uint16_t *)addr); + case 4: + return (*(uint32_t *)addr); + case 8: + return (*(uint64_t *)addr); + } + ASSERT(!"bad int len"); + return (0xFEEDFACEDEADBEEFULL); +} + +void +zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +{ + int i; + zap_leaf_t l; + l.l_bs = highbit(size)-1; + l.l_phys = buf; + + buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); + buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); + buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); + buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); + buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); + buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); + buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); + + for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); + + for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); + struct zap_leaf_entry *le; + + switch (lc->l_free.lf_type) { + case ZAP_CHUNK_ENTRY: + le = &lc->l_entry; + + le->le_type = BSWAP_8(le->le_type); + le->le_int_size = BSWAP_8(le->le_int_size); + le->le_next = BSWAP_16(le->le_next); + le->le_name_chunk = BSWAP_16(le->le_name_chunk); + le->le_name_length = BSWAP_16(le->le_name_length); + le->le_value_chunk = BSWAP_16(le->le_value_chunk); + le->le_value_length = BSWAP_16(le->le_value_length); + le->le_cd = BSWAP_32(le->le_cd); + le->le_hash = BSWAP_64(le->le_hash); + break; + case ZAP_CHUNK_FREE: + lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); + lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); + break; + case ZAP_CHUNK_ARRAY: + lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); + lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); + /* la_array doesn't need swapping */ + break; + default: + ASSERT(!"bad leaf type"); + } + } +} + +void +zap_leaf_init(zap_leaf_t *l, boolean_t sort) +{ + int i; + + l->l_bs = highbit(l->l_dbuf->db_size)-1; + zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); + zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; + ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; + } + ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; + l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; + l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + if (sort) + l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; +} + +/* + * Routines which manipulate leaf chunks (l_chunk[]). + */ + +static uint16_t +zap_leaf_chunk_alloc(zap_leaf_t *l) +{ + int chunk; + + ASSERT(l->l_phys->l_hdr.lh_nfree > 0); + + chunk = l->l_phys->l_hdr.lh_freelist; + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); + + l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; + + l->l_phys->l_hdr.lh_nfree--; + + return (chunk); +} + +static void +zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) +{ + struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; + ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); + + zlf->lf_type = ZAP_CHUNK_FREE; + zlf->lf_next = l->l_phys->l_hdr.lh_freelist; + bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ + l->l_phys->l_hdr.lh_freelist = chunk; + + l->l_phys->l_hdr.lh_nfree++; +} + +/* + * Routines which manipulate leaf arrays (zap_leaf_array type chunks). + */ + +static uint16_t +zap_leaf_array_create(zap_leaf_t *l, const char *buf, + int integer_size, int num_integers) +{ + uint16_t chunk_head; + uint16_t *chunkp = &chunk_head; + int byten = 0; + uint64_t value; + int shift = (integer_size-1)*8; + int len = num_integers; + + ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); + + while (len > 0) { + uint16_t chunk = zap_leaf_chunk_alloc(l); + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int i; + + la->la_type = ZAP_CHUNK_ARRAY; + for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { + if (byten == 0) + value = ldv(integer_size, buf); + la->la_array[i] = value >> shift; + value <<= 8; + if (++byten == integer_size) { + byten = 0; + buf += integer_size; + if (--len == 0) + break; + } + } + + *chunkp = chunk; + chunkp = &la->la_next; + } + *chunkp = CHAIN_END; + + return (chunk_head); +} + +static void +zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) +{ + uint16_t chunk = *chunkp; + + *chunkp = CHAIN_END; + + while (chunk != CHAIN_END) { + int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, + ZAP_CHUNK_ARRAY); + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + } +} + +/* array_len and buf_len are in integers, not bytes */ +static void +zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, + int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, + char *buf) +{ + int len = MIN(array_len, buf_len); + int byten = 0; + uint64_t value = 0; + + ASSERT3U(array_int_len, <=, buf_int_len); + + /* Fast path for one 8-byte integer */ + if (array_int_len == 8 && buf_int_len == 8 && len == 1) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + uint8_t *ip = la->la_array; + uint64_t *buf64 = (uint64_t *)buf; + + *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | + (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | + (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | + (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; + return; + } + + /* Fast path for an array of 1-byte integers (eg. the entry name) */ + if (array_int_len == 1 && buf_int_len == 1 && + buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { + while (chunk != CHAIN_END) { + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES); + buf += ZAP_LEAF_ARRAY_BYTES; + chunk = la->la_next; + } + return; + } + + while (len > 0) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int i; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + value = (value << 8) | la->la_array[i]; + byten++; + if (byten == array_int_len) { + stv(buf_int_len, buf, value); + byten = 0; + len--; + if (len == 0) + return; + buf += buf_int_len; + } + } + chunk = la->la_next; + } +} + +/* + * Only to be used on 8-bit arrays. + * array_len is actual len in bytes (not encoded le_value_length). + * namenorm is null-terminated. + */ +static boolean_t +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len) +{ + int bseen = 0; + + if (zn->zn_matchtype == MT_FIRST) { + char *thisname = kmem_alloc(array_len, KM_SLEEP); + boolean_t match; + + zap_leaf_array_read(l, chunk, 1, array_len, 1, + array_len, thisname); + match = zap_match(zn, thisname); + kmem_free(thisname, array_len); + return (match); + } + + /* Fast path for exact matching */ + while (bseen < array_len) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread)) + break; + chunk = la->la_next; + bseen += toread; + } + return (bseen == array_len); +} + +/* + * Routines which manipulate leaf entries. + */ + +int +zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) +{ + uint16_t *chunkp; + struct zap_leaf_entry *le; + + ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + +again: + for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); + *chunkp != CHAIN_END; chunkp = &le->le_next) { + uint16_t chunk = *chunkp; + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_hash != zn->zn_hash) + continue; + + /* + * NB: the entry chain is always sorted by cd on + * normalized zap objects, so this will find the + * lowest-cd match for MT_FIRST. + */ + ASSERT(zn->zn_matchtype == MT_EXACT || + (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + if (zap_leaf_array_match(l, zn, le->le_name_chunk, + le->le_name_length)) { + zeh->zeh_num_integers = le->le_value_length; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + zeh->zeh_leaf = l; + return (0); + } + } + + /* + * NB: we could of course do this in one pass, but that would be + * a pain. We'll see if MT_BEST is even used much. + */ + if (zn->zn_matchtype == MT_BEST) { + zn->zn_matchtype = MT_FIRST; + goto again; + } + + return (ENOENT); +} + +/* Return (h1,cd1 >= h2,cd2) */ +#define HCD_GTEQ(h1, cd1, h2, cd2) \ + ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) + +int +zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) +{ + uint16_t chunk; + uint64_t besth = -1ULL; + uint32_t bestcd = ZAP_MAXCD; + uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; + uint16_t lh; + struct zap_leaf_entry *le; + + ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (chunk = l->l_phys->l_hash[lh]; + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && + HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { + ASSERT3U(bestlh, >=, lh); + bestlh = lh; + besth = le->le_hash; + bestcd = le->le_cd; + + zeh->zeh_num_integers = le->le_value_length; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_fakechunk = chunk; + zeh->zeh_chunkp = &zeh->zeh_fakechunk; + zeh->zeh_leaf = l; + } + } + } + + return (bestcd == ZAP_MAXCD ? ENOENT : 0); +} + +int +zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_int_size > integer_size) + return (EINVAL); + + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size, + le->le_value_length, integer_size, num_integers, buf); + + if (zeh->zeh_num_integers > num_integers) + return (EOVERFLOW); + return (0); + +} + +int +zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_length, 1, buflen, buf); + if (le->le_name_length > buflen) + return (EOVERFLOW); + return (0); +} + +int +zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf) +{ + int delta_chunks; + zap_leaf_t *l = zeh->zeh_leaf; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); + + delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size); + + if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) + return (EAGAIN); + + /* + * We should search other chained leaves (via + * zap_entry_remove,create?) otherwise returning EAGAIN will + * just send us into an infinite loop if we have to chain + * another leaf block, rather than being able to split this + * block. + */ + + zap_leaf_array_free(l, &le->le_value_chunk); + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_length = num_integers; + le->le_int_size = integer_size; + return (0); +} + +void +zap_entry_remove(zap_entry_handle_t *zeh) +{ + uint16_t entry_chunk; + struct zap_leaf_entry *le; + zap_leaf_t *l = zeh->zeh_leaf; + + ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); + + entry_chunk = *zeh->zeh_chunkp; + le = ZAP_LEAF_ENTRY(l, entry_chunk); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + zap_leaf_array_free(l, &le->le_name_chunk); + zap_leaf_array_free(l, &le->le_value_chunk); + + *zeh->zeh_chunkp = le->le_next; + zap_leaf_chunk_free(l, entry_chunk); + + l->l_phys->l_hdr.lh_nentries--; +} + +int +zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh) +{ + uint16_t chunk; + uint16_t *chunkp; + struct zap_leaf_entry *le; + uint64_t namelen, valuelen; + int numchunks; + + valuelen = integer_size * num_integers; + namelen = strlen(name) + 1; + ASSERT(namelen >= 2); + + numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) + + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) + return (E2BIG); + + if (cd == ZAP_MAXCD) { + /* find the lowest unused cd */ + if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + cd = 0; + + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_cd > cd) + break; + if (le->le_hash == h) { + ASSERT3U(cd, ==, le->le_cd); + cd++; + } + } + } else { + /* old unsorted format; do it the O(n^2) way */ + for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } + } + /* + * we would run out of space in a block before we could + * have ZAP_MAXCD entries + */ + ASSERT3U(cd, <, ZAP_MAXCD); + } + + if (l->l_phys->l_hdr.lh_nfree < numchunks) + return (EAGAIN); + + /* make the entry */ + chunk = zap_leaf_chunk_alloc(l); + le = ZAP_LEAF_ENTRY(l, chunk); + le->le_type = ZAP_CHUNK_ENTRY; + le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen); + le->le_name_length = namelen; + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_length = num_integers; + le->le_int_size = integer_size; + le->le_hash = h; + le->le_cd = cd; + + /* link it into the hash chain */ + /* XXX if we did the search above, we could just use that */ + chunkp = zap_leaf_rehash_entry(l, chunk); + + l->l_phys->l_hdr.lh_nentries++; + + zeh->zeh_leaf = l; + zeh->zeh_num_integers = num_integers; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + + return (0); +} + +/* + * Determine if there is another entry with the same normalized form. + * For performance purposes, either zn or name must be provided (the + * other can be NULL). Note, there usually won't be any hash + * conflicts, in which case we don't need the concatenated/normalized + * form of the name. But all callers have one of these on hand anyway, + * so might as well take advantage. A cleaner but slower interface + * would accept neither argument, and compute the normalized name as + * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + */ +boolean_t +zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, + const char *name, zap_t *zap) +{ + uint64_t chunk; + struct zap_leaf_entry *le; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + + for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); + if (le->le_hash != zeh->zeh_hash) + continue; + if (le->le_cd == zeh->zeh_cd) + continue; + + if (zn == NULL) { + zn = zap_name_alloc(zap, name, MT_FIRST); + allocdzn = B_TRUE; + } + if (zap_leaf_array_match(zeh->zeh_leaf, zn, + le->le_name_chunk, le->le_name_length)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for transferring entries between leafs. + */ + +static uint16_t * +zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +{ + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); + struct zap_leaf_entry *le2; + uint16_t *chunkp; + + /* + * keep the entry chain sorted by cd + * NB: this will not cause problems for unsorted leafs, though + * it is unnecessary there. + */ + for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); + *chunkp != CHAIN_END; chunkp = &le2->le_next) { + le2 = ZAP_LEAF_ENTRY(l, *chunkp); + if (le2->le_cd > le->le_cd) + break; + } + + le->le_next = *chunkp; + *chunkp = entry; + return (chunkp); +} + +static uint16_t +zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) +{ + uint16_t new_chunk; + uint16_t *nchunkp = &new_chunk; + + while (chunk != CHAIN_END) { + uint16_t nchunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_array *nla = + &ZAP_LEAF_CHUNK(nl, nchunk).l_array; + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + int nextchunk = la->la_next; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); + + *nla = *la; /* structure assignment */ + + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + *nchunkp = nchunk; + nchunkp = &nla->la_next; + } + *nchunkp = CHAIN_END; + return (new_chunk); +} + +static void +zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +{ + struct zap_leaf_entry *le, *nle; + uint16_t chunk; + + le = ZAP_LEAF_ENTRY(l, entry); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + chunk = zap_leaf_chunk_alloc(nl); + nle = ZAP_LEAF_ENTRY(nl, chunk); + *nle = *le; /* structure assignment */ + + (void) zap_leaf_rehash_entry(nl, chunk); + + nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); + nle->le_value_chunk = + zap_leaf_transfer_array(l, le->le_value_chunk, nl); + + zap_leaf_chunk_free(l, entry); + + l->l_phys->l_hdr.lh_nentries--; + nl->l_phys->l_hdr.lh_nentries++; +} + +/* + * Transfer the entries whose hash prefix ends in 1 to the new leaf. + */ +void +zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) +{ + int i; + int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; + + /* set new prefix and prefix_len */ + l->l_phys->l_hdr.lh_prefix <<= 1; + l->l_phys->l_hdr.lh_prefix_len++; + nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; + nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + + /* break existing hash chains */ + zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + + if (sort) + l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + + /* + * Transfer entries whose hash bit 'bit' is set to nl; rehash + * the remaining entries + * + * NB: We could find entries via the hashtable instead. That + * would be O(hashents+numents) rather than O(numblks+numents), + * but this accesses memory more sequentially, and when we're + * called, the block is usually pretty full. + */ + for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); + if (le->le_type != ZAP_CHUNK_ENTRY) + continue; + + if (le->le_hash & (1ULL << bit)) + zap_leaf_transfer_entry(l, i, nl); + else + (void) zap_leaf_rehash_entry(l, i); + } +} + +void +zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) +{ + int i, n; + + n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - + l->l_phys->l_hdr.lh_prefix_len; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_leafs_with_2n_pointers[n]++; + + + n = l->l_phys->l_hdr.lh_nentries/5; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_with_n5_entries[n]++; + + n = ((1<<FZAP_BLOCK_SHIFT(zap)) - + l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / + (1<<FZAP_BLOCK_SHIFT(zap)); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_n_tenths_full[n]++; + + for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + int nentries = 0; + int chunk = l->l_phys->l_hash[i]; + + while (chunk != CHAIN_END) { + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(l, chunk); + + n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * + le->le_int_size); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_entries_using_n_chunks[n]++; + + chunk = le->le_next; + nentries++; + } + + n = nentries; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_buckets_with_n_entries[n]++; + } +} diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c new file mode 100644 index 000000000..abba42775 --- /dev/null +++ b/module/zfs/zap_micro.c @@ -0,0 +1,1069 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/avl.h> + +#ifdef _KERNEL +#include <sys/sunddi.h> +#endif + +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); + + +static uint64_t +zap_hash(zap_t *zap, const char *normname) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = zap->zap_salt; + + /* NB: name must already be normalized, if necessary */ + + ASSERT(crc != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + } + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the ones that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} + +static int +zap_normalize(zap_t *zap, const char *name, char *namenorm) +{ + size_t inlen, outlen; + int err; + + inlen = strlen(name) + 1; + outlen = ZAP_MAXNAMELEN; + + err = 0; + (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, + &err); + + return (err); +} + +boolean_t +zap_match(zap_name_t *zn, const char *matchname) +{ + if (zn->zn_matchtype == MT_FIRST) { + char norm[ZAP_MAXNAMELEN]; + + if (zap_normalize(zn->zn_zap, matchname, norm) != 0) + return (B_FALSE); + + return (strcmp(zn->zn_name_norm, norm) == 0); + } else { + /* MT_BEST or MT_EXACT */ + return (strcmp(zn->zn_name_orij, matchname) == 0); + } +} + +void +zap_name_free(zap_name_t *zn) +{ + kmem_free(zn, sizeof (zap_name_t)); +} + +/* XXX combine this with zap_lockdir()? */ +zap_name_t * +zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + zn->zn_zap = zap; + zn->zn_name_orij = name; + zn->zn_matchtype = mt; + if (zap->zap_normflags) { + if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_name_norm = zn->zn_normbuf; + } else { + if (mt != MT_EXACT) { + zap_name_free(zn); + return (NULL); + } + zn->zn_name_norm = zn->zn_name_orij; + } + + zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + return (zn); +} + +static void +mzap_byteswap(mzap_phys_t *buf, size_t size) +{ + int i, max; + buf->mz_block_type = BSWAP_64(buf->mz_block_type); + buf->mz_salt = BSWAP_64(buf->mz_salt); + buf->mz_normflags = BSWAP_64(buf->mz_normflags); + max = (size / MZAP_ENT_LEN) - 1; + for (i = 0; i < max; i++) { + buf->mz_chunk[i].mze_value = + BSWAP_64(buf->mz_chunk[i].mze_value); + buf->mz_chunk[i].mze_cd = + BSWAP_32(buf->mz_chunk[i].mze_cd); + } +} + +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type; + + block_type = *(uint64_t *)buf; + + if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + } else { + fzap_byteswap(buf, size); + } +} + +static int +mze_compare(const void *arg1, const void *arg2) +{ + const mzap_ent_t *mze1 = arg1; + const mzap_ent_t *mze2 = arg2; + + if (mze1->mze_hash > mze2->mze_hash) + return (+1); + if (mze1->mze_hash < mze2->mze_hash) + return (-1); + if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + return (+1); + if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + return (-1); + return (0); +} + +static void +mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +{ + mzap_ent_t *mze; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(mzep->mze_cd < ZAP_MAXCD); + + mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mze->mze_chunkid = chunkid; + mze->mze_hash = hash; + mze->mze_phys = *mzep; + avl_add(&zap->zap_m.zap_avl, mze); +} + +static mzap_ent_t * +mze_find(zap_name_t *zn) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + + ASSERT(zn->zn_zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); + + if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) + return (NULL); + + mze_tofind.mze_hash = zn->zn_hash; + mze_tofind.mze_phys.mze_cd = 0; + +again: + mze = avl_find(avl, &mze_tofind, &idx); + if (mze == NULL) + mze = avl_nearest(avl, idx, AVL_AFTER); + for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + if (zap_match(zn, mze->mze_phys.mze_name)) + return (mze); + } + if (zn->zn_matchtype == MT_BEST) { + zn->zn_matchtype = MT_FIRST; + goto again; + } + return (NULL); +} + +static uint32_t +mze_find_unused_cd(zap_t *zap, uint64_t hash) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + uint32_t cd; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_phys.mze_cd = 0; + + cd = 0; + for (mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (mze->mze_phys.mze_cd != cd) + break; + cd++; + } + + return (cd); +} + +static void +mze_remove(zap_t *zap, mzap_ent_t *mze) +{ + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + avl_remove(&zap->zap_m.zap_avl, mze); + kmem_free(mze, sizeof (mzap_ent_t)); +} + +static void +mze_destroy(zap_t *zap) +{ + mzap_ent_t *mze; + void *avlcookie = NULL; + + while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) + kmem_free(mze, sizeof (mzap_ent_t)); + avl_destroy(&zap->zap_m.zap_avl); +} + +static zap_t * +mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +{ + zap_t *winner; + zap_t *zap; + int i; + + ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); + + zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + rw_init(&zap->zap_rwlock, 0, 0, 0); + rw_enter(&zap->zap_rwlock, RW_WRITER); + zap->zap_objset = os; + zap->zap_object = obj; + zap->zap_dbuf = db; + + if (*(uint64_t *)db->db_data != ZBT_MICRO) { + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); + zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; + } else { + zap->zap_ismicro = TRUE; + } + + /* + * Make sure that zap_ismicro is set before we let others see + * it, because zap_lockdir() checks zap_ismicro without the lock + * held. + */ + winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); + + if (winner != NULL) { + rw_exit(&zap->zap_rwlock); + rw_destroy(&zap->zap_rwlock); + if (!zap->zap_ismicro) + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + kmem_free(zap, sizeof (zap_t)); + return (winner); + } + + if (zap->zap_ismicro) { + zap->zap_salt = zap->zap_m.zap_phys->mz_salt; + zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; + zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + avl_create(&zap->zap_m.zap_avl, mze_compare, + sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); + + for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = + &zap->zap_m.zap_phys->mz_chunk[i]; + if (mze->mze_name[0]) { + zap_name_t *zn; + + zap->zap_m.zap_num_entries++; + zn = zap_name_alloc(zap, mze->mze_name, + MT_EXACT); + mze_insert(zap, i, zn->zn_hash, mze); + zap_name_free(zn); + } + } + } else { + zap->zap_salt = zap->zap_f.zap_phys->zap_salt; + zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; + + ASSERT3U(sizeof (struct zap_leaf_header), ==, + 2*ZAP_LEAF_CHUNKSIZE); + + /* + * The embedded pointer table should not overlap the + * other members. + */ + ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, + &zap->zap_f.zap_phys->zap_salt); + + /* + * The embedded pointer table should end at the end of + * the block + */ + ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, + 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - + (uintptr_t)zap->zap_f.zap_phys, ==, + zap->zap_dbuf->db_size); + } + rw_exit(&zap->zap_rwlock); + return (zap); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) +{ + zap_t *zap; + dmu_buf_t *db; + krw_t lt; + int err; + + *zapp = NULL; + + err = dmu_buf_hold(os, obj, 0, NULL, &db); + if (err) + return (err); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + } +#endif + + zap = dmu_buf_get_user(db); + if (zap == NULL) + zap = mzap_open(os, obj, db); + + /* + * We're checking zap_ismicro without the lock held, in order to + * tell what type of lock we want. Once we have some sort of + * lock, see if it really is the right type. In practice this + * can only be different if it was upgraded from micro to fat, + * and micro wanted WRITER but fat only needs READER. + */ + lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + rw_enter(&zap->zap_rwlock, lt); + if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { + /* it was upgraded, now we only need reader */ + ASSERT(lt == RW_WRITER); + ASSERT(RW_READER == + (!zap->zap_ismicro && fatreader) ? RW_READER : lti); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && adding && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > MZAP_MAX_BLKSZ) { + dprintf("upgrading obj %llu: num_entries=%u\n", + obj, zap->zap_m.zap_num_entries); + *zapp = zap; + return (mzap_upgrade(zapp, tx)); + } + err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); + ASSERT3U(err, ==, 0); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + } + + *zapp = zap; + return (0); +} + +void +zap_unlockdir(zap_t *zap) +{ + rw_exit(&zap->zap_rwlock); + dmu_buf_rele(zap->zap_dbuf, NULL); +} + +static int +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) +{ + mzap_phys_t *mzp; + int i, sz, nchunks, err; + zap_t *zap = *zapp; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + sz = zap->zap_dbuf->db_size; + mzp = kmem_alloc(sz, KM_SLEEP); + bcopy(zap->zap_dbuf->db_data, mzp, sz); + nchunks = zap->zap_m.zap_num_chunks; + + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err) { + kmem_free(mzp, sz); + return (err); + } + + dprintf("upgrading obj=%llu with %u chunks\n", + zap->zap_object, nchunks); + /* XXX destroy the avl later, so we can use the stored hash value */ + mze_destroy(zap); + + fzap_upgrade(zap, tx); + + for (i = 0; i < nchunks; i++) { + int err; + mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; + zap_name_t *zn; + if (mze->mze_name[0] == 0) + continue; + dprintf("adding %s=%llu\n", + mze->mze_name, mze->mze_value); + zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); + err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); + zap = zn->zn_zap; /* fzap_add_cd() may change zap */ + zap_name_free(zn); + if (err) + break; + } + kmem_free(mzp, sz); + *zapp = zap; + return (err); +} + +static void +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) +{ + dmu_buf_t *db; + mzap_phys_t *zp; + + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + } +#endif + + dmu_buf_will_dirty(db, tx); + zp = db->db_data; + zp->mz_block_type = ZBT_MICRO; + zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; + zp->mz_normflags = normflags; + dmu_buf_rele(db, FTAG); +} + +int +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_norm(os, obj, + 0, ot, bonustype, bonuslen, tx)); +} + +int +zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int err; + + err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + if (err != 0) + return (err); + mzap_create_impl(os, obj, normflags, tx); + return (0); +} + +uint64_t +zap_create(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); +} + +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + mzap_create_impl(os, obj, normflags, tx); + return (obj); +} + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). + */ + + return (dmu_object_free(os, zapobj, tx)); +} + +_NOTE(ARGSUSED(0)) +void +zap_evict(dmu_buf_t *db, void *vzap) +{ + zap_t *zap = vzap; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) + mze_destroy(zap); + else + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + + kmem_free(zap, sizeof (zap_t)); +} + +int +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap); + return (err); +} + +/* + * zn may be NULL; if not specified, it will be computed if needed. + * See also the comment above zap_entry_normalization_conflict(). + */ +static boolean_t +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +{ + mzap_ent_t *other; + int direction = AVL_BEFORE; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + +again: + for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + other && other->mze_hash == mze->mze_hash; + other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + + if (zn == NULL) { + zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + MT_FIRST); + allocdzn = B_TRUE; + } + if (zap_match(zn, other->mze_phys.mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + + if (direction == AVL_BEFORE) { + direction = AVL_AFTER; + goto again; + } + + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for manipulating attributes. + */ + +int +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, MT_EXACT, NULL, 0, NULL)); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + if (!zap->zap_ismicro) { + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + } else { + if (num_integers < 1) { + err = EOVERFLOW; + } else if (integer_size != 8) { + err = EINVAL; + } else { + *(uint64_t *)buf = mze->mze_phys.mze_value; + (void) strlcpy(realname, + mze->mze_phys.mze_name, rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze); + } + } + } + } + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + if (!zap->zap_ismicro) { + err = fzap_length(zn, integer_size, num_integers); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +static void +mzap_addent(zap_name_t *zn, uint64_t value) +{ + int i; + zap_t *zap = zn->zn_zap; + int start = zap->zap_m.zap_alloc_next; + uint32_t cd; + + dprintf("obj=%llu %s=%llu\n", zap->zap_object, + zn->zn_name_orij, value); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + +#ifdef ZFS_DEBUG + for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); + } +#endif + + cd = mze_find_unused_cd(zap, zn->zn_hash); + /* given the limited size of the microzap, this can't happen */ + ASSERT(cd != ZAP_MAXCD); + +again: + for (i = start; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + if (mze->mze_name[0] == 0) { + mze->mze_value = value; + mze->mze_cd = cd; + (void) strcpy(mze->mze_name, zn->zn_name_orij); + zap->zap_m.zap_num_entries++; + zap->zap_m.zap_alloc_next = i+1; + if (zap->zap_m.zap_alloc_next == + zap->zap_m.zap_num_chunks) + zap->zap_m.zap_alloc_next = 0; + mze_insert(zap, i, zn->zn_hash, mze); + return; + } + } + if (start != 0) { + start = 0; + goto again; + } + ASSERT(!"out of entries!"); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + const uint64_t *intval = val; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + if (!zap->zap_ismicro) { + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + err = mzap_upgrade(&zn->zn_zap, tx); + if (err == 0) + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else { + mze = mze_find(zn); + if (mze != NULL) { + err = EEXIST; + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); + return (err); +} + +int +zap_update(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + mzap_ent_t *mze; + const uint64_t *intval = val; + zap_name_t *zn; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + if (!zap->zap_ismicro) { + err = fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + err = mzap_upgrade(&zn->zn_zap, tx); + if (err == 0) + err = fzap_update(zn, integer_size, num_integers, + val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else { + mze = mze_find(zn); + if (mze != NULL) { + mze->mze_phys.mze_value = *intval; + zap->zap_m.zap_phys->mz_chunk + [mze->mze_chunkid].mze_value = *intval; + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); + return (err); +} + +int +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) +{ + return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + if (!zap->zap_ismicro) { + err = fzap_remove(zn, tx); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + } else { + zap->zap_m.zap_num_entries--; + bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + sizeof (mzap_ent_phys_t)); + mze_remove(zap, mze); + } + } + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +/* + * Routines for iterating over the attributes. + */ + +/* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So use a small hash value so + * we can fit 4 bits of cd into the 32-bit cursor. + * + * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] + */ +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zc->zc_objset = os; + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + zc->zc_zapobj = zapobj; + if (serialized == -1ULL) { + zc->zc_hash = -1ULL; + zc->zc_cd = 0; + } else { + zc->zc_hash = serialized << (64-ZAP_HASHBITS); + zc->zc_cd = serialized >> ZAP_HASHBITS; + if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ + zc->zc_cd = 0; + } +} + +void +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_serialized(zc, os, zapobj, 0); +} + +void +zap_cursor_fini(zap_cursor_t *zc) +{ + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlockdir(zc->zc_zap); + zc->zc_zap = NULL; + } + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + zc->zc_objset = NULL; +} + +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return (-1ULL); + ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); + ASSERT(zc->zc_cd < ZAP_MAXCD); + return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | + ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); +} + +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) +{ + int err; + avl_index_t idx; + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + + if (zc->zc_hash == -1ULL) + return (ENOENT); + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + } + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_retrieve(zc->zc_zap, zc, za); + } else { + err = ENOENT; + + mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_phys.mze_cd = zc->zc_cd; + + mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + if (mze == NULL) { + mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, + idx, AVL_AFTER); + } + if (mze) { + ASSERT(0 == bcmp(&mze->mze_phys, + &zc->zc_zap->zap_m.zap_phys->mz_chunk + [mze->mze_chunkid], sizeof (mze->mze_phys))); + + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, mze); + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = mze->mze_phys.mze_value; + (void) strcpy(za->za_name, mze->mze_phys.mze_name); + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_phys.mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + } + } + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); +} + +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; + if (zc->zc_cd >= ZAP_MAXCD) { + zc->zc_cd = 0; + zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); + if (zc->zc_hash == 0) /* EOF */ + zc->zc_hash = -1ULL; + } +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + int err; + zap_t *zap; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + + bzero(zs, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlockdir(zap); + return (0); +} diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c new file mode 100644 index 000000000..341dc4dfe --- /dev/null +++ b/module/zfs/zfs_acl.c @@ -0,0 +1,2680 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/sid.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/zap.h> +#include "fs/fs_subr.h" +#include <acl/acl_common.h> + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +static zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = zfs_acl_fuid_ops; + else + aclp->z_ops = zfs_acl_v0_ops; + return (aclp); +} + +static zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while (aclnode = list_head(&aclp->z_acl)) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops.ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops.ace_flags_get(acep); + *type = aclp->z_ops.ace_type_get(acep); + *access_mask = aclp->z_ops.ace_mask_get(acep); + *who = aclp->z_ops.ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +int +zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, + zfs_ace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + if (!aclp->z_has_fuids) + aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); + aceptr->z_fuid = (uint64_t)acep->a_who; + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (EINVAL); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops.ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (EINVAL); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while (cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type)) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, + newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops.ace_mask_set(acep, access_mask); + aclp->z_ops.ace_type_set(acep, access_type); + aclp->z_ops.ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops.ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. + * Also, create FUIDs for any User/Group ACEs + */ +static uint64_t +zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + + mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type)) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over owner@, group@ or everyone@ inherit only ACEs + */ + if ((iflags & ACE_INHERIT_ONLY_ACE) && + (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + entry_type == OWNING_GROUP)) + continue; + + if (entry_type == ACE_OWNER) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } + /* + * Now handle FUID create for user/group ACEs + */ + if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { + aclp->z_ops.ace_who_set(acep, + zfs_fuid_create(zp->z_zfsvfs, who, cr, + (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, + tx, fuidp)); + } + } + return (mode); +} + +static zfs_acl_t * +zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + + /* + * Version 0 to 1 znode_acl_phys has the size/count fields swapped. + * Version 0 didn't have a size field, only a count. + */ + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; + aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); + } else { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; + } + + aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0); + aclnode->z_ace_count = aclp->z_acl_count; + if (will_modify) { + bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, + aclp->z_acl_bytes); + } else { + aclnode->z_size = aclp->z_acl_bytes; + aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; + } + + list_insert_head(&aclp->z_acl, aclnode); + + return (aclp); +} + +/* + * Read an external acl object. + */ +static int +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) +{ + uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; + zfs_acl_t *aclp; + size_t aclsize; + size_t acl_count; + zfs_acl_node_t *aclnode; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { + *aclpp = zfs_acl_node_read_internal(zp, will_modify); + return (0); + } + + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + zfs_acl_phys_v0_t *zacl0 = + (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; + + aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); + acl_count = zacl0->z_acl_count; + } else { + aclsize = zp->z_phys->zp_acl.z_acl_size; + acl_count = zp->z_phys->zp_acl.z_acl_count; + if (aclsize == 0) + aclsize = acl_count * sizeof (zfs_ace_t); + } + aclnode = zfs_acl_node_alloc(aclsize); + list_insert_head(&aclp->z_acl, aclnode); + error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, + aclsize, aclnode->z_acldata); + aclnode->z_ace_count = acl_count; + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + if (error != 0) { + zfs_acl_free(aclp); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; + return (error); + } + + *aclpp = aclp; + return (0); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +{ + int error; + znode_phys_t *zphys = zp->z_phys; + zfs_acl_phys_t *zacl = &zphys->zp_acl; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; + uint64_t off = 0; + dmu_object_type_t otype; + zfs_acl_node_t *aclnode; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + dmu_buf_will_dirty(zp->z_dbuf, tx); + + zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + + /* + * Decide which opbject type to use. If we are forced to + * use old ACL format than transform ACL into zfs_oldace_t + * layout. + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && aclp->z_version != zacl->z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, + aclp->z_acl_bytes, 0, tx); + } + zphys->zp_acl.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = zacl->z_ace_data; + /* + * Migrating back embedded? + */ + if (zphys->zp_acl.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + if (error) + return (error); + zphys->zp_acl.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + zphys->zp_acl.z_acl_size = aclp->z_acl_count; + zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; + } else { + zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; + zphys->zp_acl.z_acl_count = aclp->z_acl_count; + } + + zphys->zp_acl.z_acl_version = aclp->z_version; + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_phys->zp_flags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; + + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + return (0); +} + +/* + * Update access mask for prepended ACE + * + * This applies the "groupmask" value for aclmode property. + */ +static void +zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, + mode_t mode, uint64_t owner) +{ + int rmask, wmask, xmask; + int user_ace; + uint16_t aceflags; + uint32_t origmask, acepmask; + uint64_t fuid; + + aceflags = aclp->z_ops.ace_flags_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + origmask = aclp->z_ops.ace_mask_get(origacep); + acepmask = aclp->z_ops.ace_mask_get(acep); + + user_ace = (!(aceflags & + (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); + + if (user_ace && (fuid == owner)) { + rmask = S_IRUSR; + wmask = S_IWUSR; + xmask = S_IXUSR; + } else { + rmask = S_IRGRP; + wmask = S_IWGRP; + xmask = S_IXGRP; + } + + if (origmask & ACE_READ_DATA) { + if (mode & rmask) { + acepmask &= ~ACE_READ_DATA; + } else { + acepmask |= ACE_READ_DATA; + } + } + + if (origmask & ACE_WRITE_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_WRITE_DATA; + } else { + acepmask |= ACE_WRITE_DATA; + } + } + + if (origmask & ACE_APPEND_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_APPEND_DATA; + } else { + acepmask |= ACE_APPEND_DATA; + } + } + + if (origmask & ACE_EXECUTE) { + if (mode & xmask) { + acepmask &= ~ACE_EXECUTE; + } else { + acepmask |= ACE_EXECUTE; + } + } + aclp->z_ops.ace_mask_set(acep, acepmask); +} + +/* + * Apply mode to canonical six ACEs. + */ +static void +zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) +{ + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + void *acep; + int maskoff = aclp->z_ops.ace_mask_off(); + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + + ASSERT(aclnode != NULL); + + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (abstract_size * 6)); + + /* + * Fixup final ACEs to match the mode + */ + + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0700) >> 6); /* owner@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0070) >> 3); /* group@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + adjust_ace_pair_common(acep, maskoff, + abstract_size, mode); /* everyone@ */ +} + + +static int +zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, + int entry_type, int accessmask) +{ + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + uint16_t type = aclp->z_ops.ace_type_get(acep); + uint16_t flags = aclp->z_ops.ace_flags_get(acep); + + return (mask == accessmask && type == allow_deny && + ((flags & ACE_TYPE_FLAGS) == entry_type)); +} + +/* + * Can prepended ACE be reused? + */ +static int +zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) +{ + int okay_masks; + uint16_t prevtype; + uint16_t prevflags; + uint16_t flags; + uint32_t mask, prevmask; + + if (prevacep == NULL) + return (B_FALSE); + + prevtype = aclp->z_ops.ace_type_get(prevacep); + prevflags = aclp->z_ops.ace_flags_get(prevacep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + prevmask = aclp->z_ops.ace_mask_get(prevacep); + + if (prevtype != DENY) + return (B_FALSE); + + if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) + return (B_FALSE); + + okay_masks = (mask & OKAY_MASK_BITS); + + if (prevmask & ~okay_masks) + return (B_FALSE); + + return (B_TRUE); +} + + +/* + * Insert new ACL node into chain of zfs_acl_node_t's + * + * This will result in two possible results. + * 1. If the ACL is currently just a single zfs_acl_node and + * we are prepending the entry then current acl node will have + * a new node inserted above it. + * + * 2. If we are inserting in the middle of current acl node then + * the current node will be split in two and new node will be inserted + * in between the two split nodes. + */ +static zfs_acl_node_t * +zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) +{ + zfs_acl_node_t *newnode; + zfs_acl_node_t *trailernode = NULL; + zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp); + int curr_idx = aclp->z_curr_node->z_ace_idx; + int trailer_count; + size_t oldsize; + + newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep)); + newnode->z_ace_count = 1; + + oldsize = currnode->z_size; + + if (curr_idx != 1) { + trailernode = zfs_acl_node_alloc(0); + trailernode->z_acldata = acep; + + trailer_count = currnode->z_ace_count - curr_idx + 1; + currnode->z_ace_count = curr_idx - 1; + currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata; + trailernode->z_size = oldsize - currnode->z_size; + trailernode->z_ace_count = trailer_count; + } + + aclp->z_acl_count += 1; + aclp->z_acl_bytes += aclp->z_ops.ace_size(acep); + + if (curr_idx == 1) + list_insert_before(&aclp->z_acl, currnode, newnode); + else + list_insert_after(&aclp->z_acl, currnode, newnode); + if (trailernode) { + list_insert_after(&aclp->z_acl, newnode, trailernode); + aclp->z_curr_node = trailernode; + trailernode->z_ace_idx = 1; + } + + return (newnode); +} + +/* + * Prepend deny ACE + */ +static void * +zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, + mode_t mode) +{ + zfs_acl_node_t *aclnode; + void *newacep; + uint64_t fuid; + uint16_t flags; + + aclnode = zfs_acl_ace_insert(aclp, acep); + newacep = aclnode->z_acldata; + fuid = aclp->z_ops.ace_who_get(acep); + flags = aclp->z_ops.ace_flags_get(acep); + zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + + return (newacep); +} + +/* + * Split an inherited ACE into inherit_only ACE + * and original ACE with inheritance flags stripped off. + */ +static void +zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep) +{ + zfs_acl_node_t *aclnode; + zfs_acl_node_t *currnode; + void *newacep; + uint16_t type, flags; + uint32_t mask; + uint64_t fuid; + + type = aclp->z_ops.ace_type_get(acep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + + aclnode = zfs_acl_ace_insert(aclp, acep); + newacep = aclnode->z_acldata; + + aclp->z_ops.ace_type_set(newacep, type); + aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE); + aclp->z_ops.ace_mask_set(newacep, mask); + aclp->z_ops.ace_type_set(newacep, type); + aclp->z_ops.ace_who_set(newacep, fuid); + aclp->z_next_ace = acep; + flags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, flags); + currnode = zfs_acl_curr_node(aclp); + ASSERT(currnode->z_ace_idx >= 1); + currnode->z_ace_idx -= 1; +} + +/* + * Are ACES started at index i, the canonical six ACES? + */ +static int +zfs_have_canonical_six(zfs_acl_t *aclp) +{ + void *acep; + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + int i = 0; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + + ASSERT(aclnode != NULL); + + if (aclnode->z_ace_count < 6) + return (0); + + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6)); + + if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + DENY, ACE_OWNER, 0) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, + OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep + + (abstract_size * i++), + ALLOW, OWNING_GROUP, 0) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) { + return (1); + } else { + return (0); + } +} + + +/* + * Apply step 1g, to group entries + * + * Need to deal with corner case where group may have + * greater permissions than owner. If so then limit + * group permissions, based on what extra permissions + * group has. + */ +static void +zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, + mode_t mode) +{ + uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep); + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep); + mode_t extramode = (mode >> 3) & 07; + mode_t ownermode = (mode >> 6); + + if (prevflags & ACE_IDENTIFIER_GROUP) { + + extramode &= ~ownermode; + + if (extramode) { + if (extramode & S_IROTH) { + prevmask &= ~ACE_READ_DATA; + mask &= ~ACE_READ_DATA; + } + if (extramode & S_IWOTH) { + prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + } + if (extramode & S_IXOTH) { + prevmask &= ~ACE_EXECUTE; + mask &= ~ACE_EXECUTE; + } + } + } + aclp->z_ops.ace_mask_set(acep, mask); + aclp->z_ops.ace_mask_set(prevacep, prevmask); +} + +/* + * Apply the chmod algorithm as described + * in PSARC/2002/240 + */ +static void +zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + void *acep = NULL, *prevacep = NULL; + uint64_t who; + int i; + int entry_type; + int reuse_deny; + int need_canonical_six = 1; + uint16_t iflags, type; + uint32_t access_mask; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT(MUTEX_HELD(&zp->z_lock)); + + aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + + /* + * If discard then just discard all ACL nodes which + * represent the ACEs. + * + * New owner@/group@/everone@ ACEs will be added + * later. + */ + if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + zfs_acl_release_nodes(aclp); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { + + entry_type = (iflags & ACE_TYPE_FLAGS); + iflags = (iflags & ALL_INHERIT); + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + if (iflags) + aclp->z_hints |= ZFS_INHERIT_ACE; + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + goto nextace; + } + + /* + * Need to split ace into two? + */ + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) && + (!(iflags & ACE_INHERIT_ONLY_ACE))) { + zfs_acl_split_ace(aclp, acep); + aclp->z_hints |= ZFS_INHERIT_ACE; + goto nextace; + } + + if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + (entry_type == OWNING_GROUP)) { + access_mask &= ~OGE_CLEAR; + aclp->z_ops.ace_mask_set(acep, access_mask); + goto nextace; + } else { + reuse_deny = B_TRUE; + if (type == ALLOW) { + + /* + * Check preceding ACE if any, to see + * if we need to prepend a DENY ACE. + * This is only applicable when the acl_mode + * property == groupmask. + */ + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { + + reuse_deny = zfs_reuse_deny(aclp, acep, + prevacep); + + if (!reuse_deny) { + prevacep = + zfs_acl_prepend_deny(zp, + aclp, acep, mode); + } else { + zfs_acl_prepend_fixup( + aclp, prevacep, + acep, mode, + zp->z_phys->zp_uid); + } + zfs_fixup_group_entries(aclp, acep, + prevacep, mode); + + } + } + } +nextace: + prevacep = acep; + } + + /* + * Check out last six aces, if we have six. + */ + + if (aclp->z_acl_count >= 6) { + if (zfs_have_canonical_six(aclp)) { + need_canonical_six = 0; + } + } + + if (need_canonical_six) { + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + zfs_acl_node_t *aclnode = + zfs_acl_node_alloc(abstract_size * 6); + + aclnode->z_size = abstract_size * 6; + aclnode->z_ace_count = 6; + aclp->z_acl_bytes += aclnode->z_size; + list_insert_tail(&aclp->z_acl, aclnode); + + zacep = aclnode->z_acldata; + + i = 0; + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + 0, DENY, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + DENY, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + ALLOW, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); + aclp->z_acl_count += 6; + } + + zfs_acl_fixup_canonical_six(aclp, mode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error; + + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + *aclp = NULL; + error = zfs_acl_node_read(zp, aclp, B_TRUE); + if (error == 0) + zfs_acl_chmod(zp, mode, *aclp); + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + return (error); +} + +/* + * strip off write_owner and write_acl + */ +static void +zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) +{ + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + + if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && + (aclp->z_ops.ace_type_get(acep) == ALLOW)) { + mask &= ~RESTRICTED_CLEAR; + aclp->z_ops.ace_mask_set(acep, mask); + } +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +{ + int vtype = ZTOV(zp)->v_type; + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, + boolean_t *need_chmod) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + void *pacep; + void *acep, *acep2; + zfs_acl_node_t *aclnode, *aclnode2; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + boolean_t vdir = ZTOV(zp)->v_type == VDIR; + boolean_t vreg = ZTOV(zp)->v_type == VREG; + boolean_t passthrough, passthrough_x, noallow; + + passthrough_x = + zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X; + passthrough = passthrough_x || + zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH; + noallow = + zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW; + + *need_chmod = B_TRUE; + pacep = NULL; + aclp = zfs_acl_alloc(paclp->z_version); + if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD) + return (aclp); + while (pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type)) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (noallow && type == ALLOW) + continue; + + ace_size = aclp->z_ops.ace_size(pacep); + + if (!zfs_ace_can_use(zp, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. + */ + if (passthrough && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == + OWNING_GROUP)) && (vreg || (vdir && (iflags & + ACE_DIRECTORY_INHERIT_ACE)))) { + *need_chmod = B_FALSE; + + if (!vdir && passthrough_x && + ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + } + + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops.ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops.ace_flags_get(acep); + + if (vdir) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) { + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + zfs_restricted_update(zfsvfs, aclp, acep); + continue; + } + + ASSERT(vdir); + + newflags = aclp->z_ops.ace_flags_get(acep); + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) != + ACE_FILE_INHERIT_ACE) { + aclnode2 = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode2); + acep2 = aclnode2->z_acldata; + zfs_set_ace(aclp, acep2, + access_mask, type, who, + iflags|ACE_INHERITED_ACE); + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, newflags); + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep2, + newflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = aclp->z_ops.ace_data(acep, + &data1)) != 0) { + VERIFY((data2sz = + aclp->z_ops.ace_data(acep2, + &data2)) == data1sz); + bcopy(data1, data2, data1sz); + } + aclp->z_acl_count++; + aclnode2->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + zfs_restricted_update(zfsvfs, aclp, acep2); + } else { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + */ +void +zfs_perm_init(znode_t *zp, znode_t *parent, int flag, + vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) +{ + uint64_t mode, fuid, fgid; + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp = NULL; + zfs_acl_t *paclp; + xvattr_t *xvap = (xvattr_t *)vap; + gid_t gid; + boolean_t need_chmod = B_TRUE; + + if (setaclp) + aclp = setaclp; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); + + /* + * Determine uid and gid. + */ + if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, + ZFS_OWNER, tx, fuidp); + fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, + ZFS_GROUP, tx, fuidp); + gid = vap->va_gid; + } else { + fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); + fgid = 0; + if (vap->va_mask & AT_GID) { + fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, + ZFS_GROUP, tx, fuidp); + gid = vap->va_gid; + if (fgid != parent->z_phys->zp_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + fgid = 0; + } + if (fgid == 0) { + if (parent->z_phys->zp_mode & S_ISGID) { + fgid = parent->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, fgid, + cr, ZFS_GROUP); + } else { + fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, tx, cr, fuidp); + gid = crgetgid(cr); + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { + mode |= S_ISGID; + } else { + if ((mode & S_ISGID) && + secpolicy_vnode_setids_setgids(cr, gid) != 0) + mode &= ~S_ISGID; + } + + zp->z_phys->zp_uid = fuid; + zp->z_phys->zp_gid = fgid; + zp->z_phys->zp_mode = mode; + + if (aclp == NULL) { + mutex_enter(&parent->z_lock); + if ((ZTOV(parent)->v_type == VDIR && + (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(zp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&parent->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); + mutex_exit(&parent->z_acl_lock); + aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod); + zfs_acl_free(paclp); + } else { + aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + } + mutex_exit(&parent->z_lock); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + if (need_chmod) + zfs_acl_chmod(zp, mode, aclp); + } else { + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + } + + /* Force auto_inherit on all new directory objects */ + if (vap->va_type == VDIR) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + + error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); + + /* Set optional attributes if any */ + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(zp, xvap); + + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + ASSERT3U(error, ==, 0); + + if (aclp != setaclp) + zfs_acl_free(aclp); +} + +/* + * Retrieve a files ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) + return (error); + + if (mask == 0) + return (ENOSYS); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && + !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, + count * sizeof (ace_t)); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, + vsecattr_t *vsecp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (EINVAL); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a files ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + + if (mask == 0) + return (ENOSYS); + + if (zp->z_phys->zp_flags & ZFS_IMMUTABLE) + return (EPERM); + + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + } +top: + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { + zfs_acl_free(aclp); + return (error); + } + + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + + if (zp->z_phys->zp_acl.z_acl_extern_obj) { + /* Are we upgrading ACL? */ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + zp->z_phys->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + zp->z_phys->zp_acl.z_acl_extern_obj, + 0, DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + zp->z_phys->zp_acl.z_acl_extern_obj, + 0, aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + if (aclp->z_has_fuids) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + ASSERT(error == 0); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + zfs_acl_free(aclp); + dmu_tx_commit(tx); +done: + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + return (error); +} + +/* + * working_mode returns the permissions that were not granted + */ +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t fowner; + uid_t gowner; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0) + return (0); + + *check_privs = B_TRUE; + + if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + *working_mode = 0; + return (0); + } + + *working_mode = v4_mode; + + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)))) { + *check_privs = B_FALSE; + return (EROFS); + } + + /* + * Only check for READONLY on non-directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + (((ZTOV(zp)->v_type != VDIR) && + (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (ZTOV(zp)->v_type == VDIR && + (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { + *check_privs = B_FALSE; + return (EPERM); + } + + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + *check_privs = B_FALSE; + return (EPERM); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { + *check_privs = B_FALSE; + return (EACCES); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + zfs_acl_free(aclp); + mutex_exit(&zp->z_acl_lock); + return (EIO); + } + } + + if (checkit) { + uint32_t mask_matched = (access_mask & *working_mode); + + if (mask_matched) { + if (type == DENY) + deny_mask |= mask_matched; + + *working_mode &= ~mask_matched; + } + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + zfs_acl_free(aclp); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (EACCES); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (EACCES); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +/* + * Determine whether Access should be granted/denied, invoking least + * priv subsytem when a deny is determined. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + boolean_t check_privs; + znode_t *xzp; + znode_t *check_zp = zp; + + is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zp)->v_type == VDIR)); + + /* + * If attribute then validate against base file + */ + if (is_attr) { + if ((error = zfs_zget(zp->z_zfsvfs, + zp->z_phys->zp_parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (0); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + uid_t owner; + mode_t checkmode = 0; + + owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, + ZFS_OWNER); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + if (checkmode) + error = secpolicy_vnode_access(cr, ZTOV(check_zp), + owner, checkmode); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(cr, B_TRUE); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(cr, B_FALSE); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = EACCES; + } + } + } + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t missing_perms, cred_t *cr) +{ + int error; + uid_t downer; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/deny, without + * consulting least priv subsystem. + * + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Permit | Permit | + * | DELETE_CHILD | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Permit | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + mode_t missing_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; + + /* + * We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:joe:write_data:deny,user:joe:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. + */ + + if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (EPERM); + + /* + * First row + * If the directory permissions allow the delete, we are done. + */ + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + /* + * If target object has delete permission then we are done + */ + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + + /* + * Second row + * + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. + */ + + if (dzp_error == EACCES) + return (secpolicy_vnode_remove(cr)); + + /* + * Third Row + * only need to see if we have write/execute on directory. + */ + + if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (zfs_sticky_remove_access(dzp, zp, cr)); + + if (!dzpcheck_privs) + return (dzp_error); + + /* + * Fourth row + */ + + missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0; + missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0; + + ASSERT(missing_perms); + + return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); + +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + */ + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if (error = zfs_zaccess_delete(sdzp, szp, cr)) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp) { + if (error = zfs_zaccess_delete(tdzp, tzp, cr)) + return (error); + } + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/zfs/zfs_byteswap.c b/module/zfs/zfs_byteswap.c new file mode 100644 index 000000000..ab97f83eb --- /dev/null +++ b/module/zfs/zfs_byteswap.c @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_acl.h> + +void +zfs_oldace_byteswap(ace_t *ace, int ace_cnt) +{ + int i; + + for (i = 0; i != ace_cnt; i++, ace++) { + ace->a_who = BSWAP_32(ace->a_who); + ace->a_access_mask = BSWAP_32(ace->a_access_mask); + ace->a_flags = BSWAP_16(ace->a_flags); + ace->a_type = BSWAP_16(ace->a_type); + } +} + +/* + * swap ace_t and ace_oject_t + */ +void +zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) +{ + caddr_t end; + caddr_t ptr; + zfs_ace_t *zacep; + ace_t *acep; + uint16_t entry_type; + size_t entry_size; + int ace_type; + + end = (caddr_t)buf + size; + ptr = buf; + + while (ptr < end) { + if (zfs_layout) { + zacep = (zfs_ace_t *)ptr; + zacep->z_hdr.z_access_mask = + BSWAP_32(zacep->z_hdr.z_access_mask); + zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); + ace_type = zacep->z_hdr.z_type = + BSWAP_16(zacep->z_hdr.z_type); + entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; + } else { + acep = (ace_t *)ptr; + acep->a_access_mask = BSWAP_32(acep->a_access_mask); + acep->a_flags = BSWAP_16(acep->a_flags); + ace_type = acep->a_type = BSWAP_16(acep->a_type); + acep->a_who = BSWAP_32(acep->a_who); + entry_type = acep->a_flags & ACE_TYPE_FLAGS; + } + switch (entry_type) { + case ACE_OWNER: + case ACE_EVERYONE: + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + entry_size = zfs_layout ? + sizeof (zfs_ace_hdr_t) : sizeof (ace_t); + break; + case ACE_IDENTIFIER_GROUP: + default: + if (zfs_layout) { + zacep->z_fuid = BSWAP_64(zacep->z_fuid); + } + switch (ace_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + entry_size = zfs_layout ? + sizeof (zfs_object_ace_t) : + sizeof (ace_object_t); + break; + default: + entry_size = zfs_layout ? sizeof (zfs_ace_t) : + sizeof (ace_t); + break; + } + } + ptr = ptr + entry_size; + } +} + +/* ARGSUSED */ +void +zfs_oldacl_byteswap(void *buf, size_t size) +{ + int cnt; + + /* + * Arggh, since we don't know how many ACEs are in + * the array, we have to swap the entire block + */ + + cnt = size / sizeof (ace_t); + + zfs_oldace_byteswap((ace_t *)buf, cnt); +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + zfs_ace_byteswap(buf, size, B_TRUE); +} + +void +zfs_znode_byteswap(void *buf, size_t size) +{ + znode_phys_t *zp = buf; + + ASSERT(size >= sizeof (znode_phys_t)); + + zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); + zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); + zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); + zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); + zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); + zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); + zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); + zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); + zp->zp_gen = BSWAP_64(zp->zp_gen); + zp->zp_mode = BSWAP_64(zp->zp_mode); + zp->zp_size = BSWAP_64(zp->zp_size); + zp->zp_parent = BSWAP_64(zp->zp_parent); + zp->zp_links = BSWAP_64(zp->zp_links); + zp->zp_xattr = BSWAP_64(zp->zp_xattr); + zp->zp_rdev = BSWAP_64(zp->zp_rdev); + zp->zp_flags = BSWAP_64(zp->zp_flags); + zp->zp_uid = BSWAP_64(zp->zp_uid); + zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_zap = BSWAP_64(zp->zp_zap); + zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); + zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); + zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); + + zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); + zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); + zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); + zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); + if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { + zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], + ZFS_ACE_SPACE); + } else + zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], + ACE_SLOT_CNT); +} diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c new file mode 100644 index 000000000..208fc3629 --- /dev/null +++ b/module/zfs/zfs_ctldir.c @@ -0,0 +1,1159 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>' + * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. + */ + +#include <fs/fs_subr.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/vfs_opreg.h> +#include <sys/gfs.h> +#include <sys/stat.h> +#include <sys/dmu.h> +#include <sys/dsl_deleg.h> +#include <sys/mount.h> +#include <sys/sunddi.h> + +#include "zfs_namecheck.h" + +typedef struct zfsctl_node { + gfs_dir_t zc_gfs_private; + uint64_t zc_id; + timestruc_t zc_cmtime; /* ctime and mtime, always the same */ +} zfsctl_node_t; + +typedef struct zfsctl_snapdir { + zfsctl_node_t sd_node; + kmutex_t sd_lock; + avl_tree_t sd_snaps; +} zfsctl_snapdir_t; + +typedef struct { + char *se_name; + vnode_t *se_root; + avl_node_t se_node; +} zfs_snapentry_t; + +static int +snapentry_compare(const void *a, const void *b) +{ + const zfs_snapentry_t *sa = a; + const zfs_snapentry_t *sb = b; + int ret = strcmp(sa->se_name, sb->se_name); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +vnodeops_t *zfsctl_ops_root; +vnodeops_t *zfsctl_ops_snapdir; +vnodeops_t *zfsctl_ops_snapshot; + +static const fs_operation_def_t zfsctl_tops_root[]; +static const fs_operation_def_t zfsctl_tops_snapdir[]; +static const fs_operation_def_t zfsctl_tops_snapshot[]; + +static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); +static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); + +static gfs_opsvec_t zfsctl_opsvec[] = { + { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, + { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, + { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, + { NULL } +}; + +/* + * Root directory elements. We have only a single static entry, 'snapshot'. + */ +static gfs_dirent_t zfsctl_root_entries[] = { + { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { NULL } +}; + +/* include . and .. in the calculation */ +#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ + sizeof (gfs_dirent_t)) + 1) + + +/* + * Initialize the various GFS pieces we'll need to create and manipulate .zfs + * directories. This is called from the ZFS init routine, and initializes the + * vnode ops vectors that we'll be using. + */ +void +zfsctl_init(void) +{ + VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0); +} + +void +zfsctl_fini(void) +{ + /* + * Remove vfsctl vnode ops + */ + if (zfsctl_ops_root) + vn_freevnodeops(zfsctl_ops_root); + if (zfsctl_ops_snapdir) + vn_freevnodeops(zfsctl_ops_snapdir); + if (zfsctl_ops_snapshot) + vn_freevnodeops(zfsctl_ops_snapshot); + + zfsctl_ops_root = NULL; + zfsctl_ops_snapdir = NULL; + zfsctl_ops_snapshot = NULL; +} + +/* + * Return the inode number associated with the 'snapshot' directory. + */ +/* ARGSUSED */ +static ino64_t +zfsctl_root_inode_cb(vnode_t *vp, int index) +{ + ASSERT(index == 0); + return (ZFSCTL_INO_SNAPDIR); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the vfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. + */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + vnode_t *vp, *rvp; + zfsctl_node_t *zcp; + + ASSERT(zfsvfs->z_ctldir == NULL); + + vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, + zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, + zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); + zcp = vp->v_data; + zcp->zc_id = ZFSCTL_INO_ROOT; + + VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0); + ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); + VN_RELE(rvp); + + /* + * We're only faking the fact that we have a root of a filesystem for + * the sake of the GFS interfaces. Undo the flag manipulation it did + * for us. + */ + vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT); + + zfsvfs->z_ctldir = vp; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * There might still be more references if we were force unmounted, but only + * new zfs_inactive() calls can occur and they don't reference .zfs + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + VN_RELE(zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +vnode_t * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + VN_HOLD(zp->z_zfsvfs->z_ctldir); + return (zp->z_zfsvfs->z_ctldir); +} + +/* + * Common open routine. Disallow any write access. + */ +/* ARGSUSED */ +static int +zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct) +{ + if (flags & FWRITE) + return (EACCES); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off, + cred_t *cr, caller_context_t *ct) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +/* ARGSUSED */ +static int +zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + if (mode & VWRITE) + return (EACCES); + + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + zfsctl_node_t *zcp = vp->v_data; + timestruc_t now; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purly virtual object, so we have no + * blocksize or allocated blocks. + */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + vap->va_type = VDIR; + /* + * We live in the now (for atime). + */ + gethrestime(&now); + vap->va_atime = now; + vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; +} + +/*ARGSUSED*/ +static int +zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_node_t *zcp = vp->v_data; + uint64_t object = zcp->zc_id; + zfid_short_t *zfid; + int i; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len < SHORT_FID_LEN) { + fidp->fid_len = SHORT_FID_LEN; + ZFS_EXIT(zfsvfs); + return (ENOSPC); + } + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs znodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/<snap> objectid(snap) + */ + +#define ZFSCTL_INO_SNAP(id) (id) + +/* + * Get root directory attributes. + */ +/* ARGSUSED */ +static int +zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ZFS_ENTER(zfsvfs); + vap->va_nodeid = ZFSCTL_INO_ROOT; + vap->va_nlink = vap->va_size = NROOT_ENTRIES; + + zfsctl_common_getattr(vp, vap); + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * Special case the handling of "..". + */ +/* ARGSUSED */ +int +zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + + if (strcmp(nm, "..") == 0) { + err = VFS_ROOT(dvp->v_vfsp, vpp); + } else { + err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp); + } + + ZFS_EXIT(zfsvfs); + + return (err); +} + +static const fs_operation_def_t zfsctl_tops_root[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, + { NULL } +}; + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + if (snapshot_namecheck(name, NULL, NULL) != 0) + return (EILSEQ); + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (ENAMETOOLONG); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) +{ + vnode_t *svp = sep->se_root; + int error; + + ASSERT(vn_ismntpt(svp)); + + /* this will be dropped by dounmount() */ + if ((error = vn_vfswlock(svp)) != 0) + return (error); + + VN_HOLD(svp); + error = dounmount(vn_mountedvfs(svp), fflags, cr); + if (error) { + VN_RELE(svp); + return (error); + } + VFS_RELE(svp->v_vfsp); + /* + * We can't use VN_RELE(), as that will try to invoke + * zfsctl_snapdir_inactive(), which would cause us to destroy + * the sd_lock mutex held by our caller. + */ + ASSERT(svp->v_count == 1); + gfs_vop_inactive(svp, cr, NULL); + + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + + return (0); +} + +static void +zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) +{ + avl_index_t where; + vfs_t *vfsp; + refstr_t *pathref; + char newpath[MAXNAMELEN]; + char *tail; + + ASSERT(MUTEX_HELD(&sdp->sd_lock)); + ASSERT(sep != NULL); + + vfsp = vn_mountedvfs(sep->se_root); + ASSERT(vfsp != NULL); + + vfs_lock_wait(vfsp); + + /* + * Change the name in the AVL tree. + */ + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strcpy(sep->se_name, nm); + VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); + avl_insert(&sdp->sd_snaps, sep, where); + + /* + * Change the current mountpoint info: + * - update the tail of the mntpoint path + * - update the tail of the resource path + */ + pathref = vfs_getmntpoint(vfsp); + (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); + VERIFY((tail = strrchr(newpath, '/')) != NULL); + *(tail+1) = '\0'; + ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); + (void) strcat(newpath, nm); + refstr_rele(pathref); + vfs_setmntpoint(vfsp, newpath); + + pathref = vfs_getresource(vfsp); + (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); + VERIFY((tail = strrchr(newpath, '@')) != NULL); + *(tail+1) = '\0'; + ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); + (void) strcat(newpath, nm); + refstr_rele(pathref); + vfs_setresource(vfsp, newpath); + + vfs_unlock(vfsp); +} + +/*ARGSUSED*/ +static int +zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, + cred_t *cr, caller_context_t *ct, int flags) +{ + zfsctl_snapdir_t *sdp = sdvp->v_data; + zfs_snapentry_t search, *sep; + zfsvfs_t *zfsvfs; + avl_index_t where; + char from[MAXNAMELEN], to[MAXNAMELEN]; + char real[MAXNAMELEN]; + int err; + + zfsvfs = sdvp->v_vfsp->vfs_data; + ZFS_ENTER(zfsvfs); + + if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, + MAXNAMELEN, NULL); + if (err == 0) { + snm = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + ZFS_EXIT(zfsvfs); + + err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); + if (!err) + err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); + if (!err) + err = zfs_secpolicy_rename_perms(from, to, cr); + if (err) + return (err); + + /* + * Cannot move snapshots out of the snapdir. + */ + if (sdvp != tdvp) + return (EINVAL); + + if (strcmp(snm, tnm) == 0) + return (0); + + mutex_enter(&sdp->sd_lock); + + search.se_name = (char *)snm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { + mutex_exit(&sdp->sd_lock); + return (ENOENT); + } + + err = dmu_objset_rename(from, to, B_FALSE); + if (err == 0) + zfsctl_rename_snap(sdp, sep, tnm); + + mutex_exit(&sdp->sd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + zfs_snapentry_t *sep; + zfs_snapentry_t search; + zfsvfs_t *zfsvfs; + char snapname[MAXNAMELEN]; + char real[MAXNAMELEN]; + int err; + + zfsvfs = dvp->v_vfsp->vfs_data; + ZFS_ENTER(zfsvfs); + + if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + + err = dmu_snapshot_realname(zfsvfs->z_os, name, real, + MAXNAMELEN, NULL); + if (err == 0) { + name = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + ZFS_EXIT(zfsvfs); + + err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); + if (!err) + err = zfs_secpolicy_destroy_perms(snapname, cr); + if (err) + return (err); + + mutex_enter(&sdp->sd_lock); + + search.se_name = name; + sep = avl_find(&sdp->sd_snaps, &search, NULL); + if (sep) { + avl_remove(&sdp->sd_snaps, sep); + err = zfsctl_unmount_snap(sep, MS_FORCE, cr); + if (err) + avl_add(&sdp->sd_snaps, sep); + else + err = dmu_objset_destroy(snapname); + } else { + err = ENOENT; + } + + mutex_exit(&sdp->sd_lock); + + return (err); +} + +/* + * This creates a snapshot under '.zfs/snapshot'. + */ +/* ARGSUSED */ +static int +zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + char name[MAXNAMELEN]; + int err; + static enum symfollow follow = NO_FOLLOW; + static enum uio_seg seg = UIO_SYSSPACE; + + if (snapshot_namecheck(dirname, NULL, NULL) != 0) + return (EILSEQ); + + dmu_objset_name(zfsvfs->z_os, name); + + *vpp = NULL; + + err = zfs_secpolicy_snapshot_perms(name, cr); + if (err) + return (err); + + if (err == 0) { + err = dmu_objset_snapshot(name, dirname, B_FALSE); + if (err) + return (err); + err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); + } + + return (err); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + */ +/* ARGSUSED */ +static int +zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + objset_t *snap; + char snapname[MAXNAMELEN]; + char real[MAXNAMELEN]; + char *mountpoint; + zfs_snapentry_t *sep, search; + struct mounta margs; + vfs_t *vfsp; + size_t mountpoint_len; + avl_index_t where; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) + return (0); + + /* + * If we get a recursive call, that means we got called + * from the domount() code while it was trying to look up the + * spec (which looks like a local path for zfs). We need to + * add some flag to domount() to tell it not to do this lookup. + */ + if (MUTEX_HELD(&sdp->sd_lock)) + return (ENOENT); + + ZFS_ENTER(zfsvfs); + + if (flags & FIGNORECASE) { + boolean_t conflict = B_FALSE; + + err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, + MAXNAMELEN, &conflict); + if (err == 0) { + nm = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + if (realpnp) + (void) strlcpy(realpnp->pn_buf, nm, + realpnp->pn_bufsize); + if (conflict && direntflags) + *direntflags = ED_CASE_CONFLICT; + } + + mutex_enter(&sdp->sd_lock); + search.se_name = (char *)nm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { + *vpp = sep->se_root; + VN_HOLD(*vpp); + err = traverse(vpp); + if (err) { + VN_RELE(*vpp); + *vpp = NULL; + } else if (*vpp == sep->se_root) { + /* + * The snapshot was unmounted behind our backs, + * try to remount it. + */ + goto domount; + } else { + /* + * VROOT was set during the traverse call. We need + * to clear it since we're pretending to be part + * of our parent's vfs. + */ + (*vpp)->v_flag &= ~VROOT; + } + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (err); + } + + /* + * The requested snapshot is not currently mounted, look it up. + */ + err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); + if (err) { + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + /* + * handle "ls *" or "?" in a graceful manner, + * forcing EILSEQ to ENOENT. + * Since shell ultimately passes "*" or "?" as name to lookup + */ + return (err == EILSEQ ? ENOENT : err); + } + if (dmu_objset_open(snapname, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) { + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (ENOENT); + } + + sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strcpy(sep->se_name, nm); + *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); + avl_insert(&sdp->sd_snaps, sep, where); + + dmu_objset_close(snap); +domount: + mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + + strlen("/.zfs/snapshot/") + strlen(nm) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", + refstr_value(dvp->v_vfsp->vfs_mntpt), nm); + + margs.spec = snapname; + margs.dir = mountpoint; + margs.flags = MS_SYSSPACE | MS_NOMNTTAB; + margs.fstype = "zfs"; + margs.dataptr = NULL; + margs.datalen = 0; + margs.optptr = NULL; + margs.optlen = 0; + + err = domount("zfs", &margs, *vpp, kcred, &vfsp); + kmem_free(mountpoint, mountpoint_len); + + if (err == 0) { + /* + * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns + * the ZFS vnode mounted on top of the GFS node. This ZFS + * vnode is the root the newly created vfsp. + */ + VFS_RELE(vfsp); + err = traverse(vpp); + } + + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/<snapname> accessible over NFS + * without requiring manual mounts of <snapname>. + */ + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + (*vpp)->v_vfsp = zfsvfs->z_vfs; + (*vpp)->v_flag &= ~VROOT; + } + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + + /* + * If we had an error, drop our hold on the vnode and + * zfsctl_snapshot_inactive() will clean up. + */ + if (err) { + VN_RELE(*vpp); + *vpp = NULL; + } + return (err); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data, int flags) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + char snapname[MAXNAMELEN]; + uint64_t id, cookie; + boolean_t case_conflict; + int error; + + ZFS_ENTER(zfsvfs); + + cookie = *offp; + error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, + &cookie, &case_conflict); + if (error) { + ZFS_EXIT(zfsvfs); + if (error == ENOENT) { + *eofp = 1; + return (0); + } + return (error); + } + + if (flags & V_RDDIR_ENTFLAGS) { + edirent_t *eodp = dp; + + (void) strcpy(eodp->ed_name, snapname); + eodp->ed_ino = ZFSCTL_INO_SNAP(id); + eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0; + } else { + struct dirent64 *odp = dp; + + (void) strcpy(odp->d_name, snapname); + odp->d_ino = ZFSCTL_INO_SNAP(id); + } + *nextp = cookie; + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * pvp is the '.zfs' directory (zfsctl_node_t). + * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). + * + * This function is the callback to create a GFS vnode for '.zfs/snapshot' + * when a lookup is performed on .zfs for "snapshot". + */ +vnode_t * +zfsctl_mknode_snapdir(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_snapdir_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, + zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, + zfsctl_snapdir_readdir_cb, NULL); + sdp = vp->v_data; + sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; + sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; + mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&sdp->sd_snaps, snapentry_compare, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); + return (vp); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_snapdir_t *sdp = vp->v_data; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_nodeid = gfs_file_inode(vp); + vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* ARGSUSED */ +static void +zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + zfsctl_snapdir_t *sdp = vp->v_data; + void *private; + + private = gfs_dir_inactive(vp); + if (private != NULL) { + ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); + mutex_destroy(&sdp->sd_lock); + avl_destroy(&sdp->sd_snaps); + kmem_free(private, sizeof (zfsctl_snapdir_t)); + } +} + +static const fs_operation_def_t zfsctl_tops_snapdir[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, + { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, + { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, + { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, + { NULL } +}; + +/* + * pvp is the GFS vnode '.zfs/snapshot'. + * + * This creates a GFS node under '.zfs/snapshot' representing each + * snapshot. This newly created GFS node is what we mount snapshot + * vfs_t's ontop of. + */ +static vnode_t * +zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) +{ + vnode_t *vp; + zfsctl_node_t *zcp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, + zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); + zcp = vp->v_data; + zcp->zc_id = objset; + VFS_HOLD(vp->v_vfsp); + + return (vp); +} + +static void +zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + vnode_t *dvp; + + VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + + if (vp->v_count > 1) { + mutex_exit(&sdp->sd_lock); + return; + } + ASSERT(!vn_ismntpt(vp)); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + next = AVL_NEXT(&sdp->sd_snaps, sep); + + if (sep->se_root == vp) { + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + break; + } + sep = next; + } + ASSERT(sep != NULL); + + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + VFS_RELE(vp->v_vfsp); + + /* + * Dispose of the vnode for the snapshot mount point. + * This is safe to do because once this entry has been removed + * from the AVL tree, it can't be found again, so cannot become + * "active". If we lookup the same name again we will end up + * creating a new vnode. + */ + gfs_vop_inactive(vp, cr, ct); +} + + +/* + * These VP's should never see the light of day. They should always + * be covered. + */ +static const fs_operation_def_t zfsctl_tops_snapshot[] = { + VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive }, + NULL, NULL +}; + +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *dvp, *vp; + zfsctl_snapdir_t *sdp; + zfsctl_node_t *zcp; + zfs_snapentry_t *sep; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, kcred, NULL, NULL, NULL); + if (error != 0) + return (error); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + vp = sep->se_root; + zcp = vp->v_data; + if (zcp->zc_id == objsetid) + break; + + sep = AVL_NEXT(&sdp->sd_snaps, sep); + } + + if (sep != NULL) { + VN_HOLD(vp); + /* + * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid> + * and returns the ZFS vnode mounted on top of the GFS node. + * This ZFS vnode is the root of the vfs for objset 'objsetid'. + */ + error = traverse(&vp); + if (error == 0) { + if (vp == sep->se_root) + error = EINVAL; + else + *zfsvfsp = VTOZ(vp)->z_zfsvfs; + } + mutex_exit(&sdp->sd_lock); + VN_RELE(vp); + } else { + error = EINVAL; + mutex_exit(&sdp->sd_lock); + } + + VN_RELE(dvp); + + return (error); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. + */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *dvp; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, cr, NULL, NULL, NULL); + if (error != 0) + return (error); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + next = AVL_NEXT(&sdp->sd_snaps, sep); + + /* + * If this snapshot is not mounted, then it must + * have just been unmounted by somebody else, and + * will be cleaned up by zfsctl_snapdir_inactive(). + */ + if (vn_ismntpt(sep->se_root)) { + avl_remove(&sdp->sd_snaps, sep); + error = zfsctl_unmount_snap(sep, fflags, cr); + if (error) { + avl_add(&sdp->sd_snaps, sep); + break; + } + } + sep = next; + } + + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + + return (error); +} diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c new file mode 100644 index 000000000..1ec493264 --- /dev/null +++ b/module/zfs/zfs_dir.c @@ -0,0 +1,977 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/sunddi.h> +#include <sys/random.h> +#include <sys/policy.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/fs/zfs.h> +#include "fs/fs_subr.h" +#include <sys/zap.h> +#include <sys/dmu.h> +#include <sys/atomic.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/dnlc.h> +#include <sys/extdirent.h> + +/* + * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, + boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + matchtype_t mt = MT_FIRST; + boolean_t conflict = B_FALSE; + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->pn_buf; + bufsz = rpnp->pn_bufsize; + } + if (exact) + mt = MT_EXACT; + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + if (!error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + if (error == ENOENT && update) + dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + + return (error); +} + +/* + * Lock a directory entry. A dirlock on <dzp, name> protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. + * ZRENAMING: we are locking for renaming, force narrow locks + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t *dl; + boolean_t update; + boolean_t exact; + uint64_t zoid; + vnode_t *vp = NULL; + int error = 0; + int cmpflags; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if (name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || + zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) + return (EEXIST); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * Decide if exact matches should be requested when performing + * a zap lookup on file systems supporting case-insensitive + * access. + */ + exact = + ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* + * Wait until there are no locks on this name. + */ + rw_enter(&dzp->z_name_lock, RW_READER); + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked) { + mutex_exit(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + return (ENOENT); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) + break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + return (ENOENT); + } + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. He'll either see the old value, + * which is his, or the new shared copy; either is OK. + */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + zoid = dzp->z_phys->zp_xattr; + error = (zoid == 0 ? ENOENT : 0); + } else { + if (update) + vp = dnlc_lookup(ZTOV(dzp), name); + if (vp == DNLC_NO_VNODE) { + VN_RELE(vp); + error = ENOENT; + } else if (vp) { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + VN_RELE(vp); + return (EEXIST); + } + *dlpp = dl; + *zpp = VTOZ(vp); + return (0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, exact, + update, direntflags, realpnp, &zoid); + } + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (EEXIST); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + if (!(flag & ZXATTR) && update) + dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, + int *deflg, pathname_t *rpnp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + int error = 0; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *vpp = ZTOV(dzp); + VN_HOLD(*vpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. + */ + if (dzp->z_phys->zp_parent == dzp->z_id && + zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", vpp, NULL, 0, NULL, kcred, + NULL, NULL, NULL); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); + if (error == 0) + *vpp = ZTOV(zp); + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + *vpp = zfsctl_root(dzp); + } else { + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + if (error == 0) { + *vpp = ZTOV(zp); + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + rpnp = NULL; + } + + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); + + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_unlinked); + ASSERT3U(zp->z_phys->zp_links, ==, 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + /* + * Interate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + VN_RELE(ZTOV(zp)); + } + zap_cursor_fini(&zc); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + VN_RELE(ZTOV(xzp)); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + VN_RELE(ZTOV(xzp)); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + dmu_tx_t *tx; + uint64_t acl_obj; + int error; + + ASSERT(ZTOV(zp)->v_count == 0); + ASSERT(zp->z_phys->zp_links == 0); + + /* + * If this is a ZIL replay then leave the object in the unlinked set. + * Otherwise we can get a deadlock, because the delete can be + * quite large and span multiple tx's and txgs, but each replay + * creates a tx to atomically run the replay function and mark the + * replay record as complete. We deadlock trying to start a tx in + * a new txg to further the deletion but can't because the replay + * tx hasn't finished. + * + * We actually delete the object if we get a failure to create an + * object in zil_replay_log_record(), or after calling zil_replay(). + */ + if (zfsvfs->z_assign >= TXG_INITIAL) { + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* + * If this is an attribute directory, purge its contents. + */ + if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } + + /* + * Free up all the data in the file. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space. Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT(error == 0); + } + + acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + goto out; + } + + if (xzp) { + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_phys->zp_links = 0; /* no more links to it */ + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + /* Remove this znode from the unlinked set */ + VERIFY3U(0, ==, + zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) + VN_RELE(ZTOV(xzp)); +} + +static uint64_t +zfs_dirent(znode_t *zp) +{ + uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT((zp)->z_phys->zp_mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can only fail if zp has been unlinked. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + uint64_t value; + int zp_is_dir = (vp->v_type == VDIR); + int error; + + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (ENOENT); + } + zp->z_phys->zp_links++; + } + zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ + + if (!(flag & ZNEW)) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + mutex_exit(&zp->z_lock); + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size++; /* one dirent added */ + dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + value = zfs_dirent(zp); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + 8, 1, &value, tx); + ASSERT(error == 0); + + dnlc_update(ZTOV(dzp), dl->dl_name, vp); + + return (0); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + boolean_t unlinked = B_FALSE; + int error; + + dnlc_remove(ZTOV(dzp), dl->dl_name); + + if (!(flag & ZRENAMING)) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (vn_vfswlock(vp)) /* prevent new mounts on zp */ + return (EBUSY); + + if (vn_ismntpt(vp)) { /* don't remove mount point */ + vn_vfsunlock(vp); + return (EBUSY); + } + + mutex_enter(&zp->z_lock); + if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (EEXIST); + } + if (zp->z_phys->zp_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %s is %u, " + "should be at least %u", + zp->z_vnode->v_path ? zp->z_vnode->v_path : + "<unknown>", (int)zp->z_phys->zp_links, + zp_is_dir + 1); + zp->z_phys->zp_links = zp_is_dir + 1; + } + if (--zp->z_phys->zp_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_phys->zp_links = 0; + unlinked = B_TRUE; + } else { + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + } + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + } + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size--; /* one dirent removed */ + dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + if (zp->z_zfsvfs->z_norm) { + if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && + (flag & ZCIEXACT)) || + ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && + !(flag & ZCILOOK))) + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_EXACT, tx); + else + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_FIRST, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, tx); + } + ASSERT(error == 0); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. + */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_fuid_info_t *fuidp = NULL; + + *xvpp = NULL; + + if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) + return (error); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + dmu_tx_wait(tx); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); + dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_phys->zp_xattr = xzp->z_id; + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + + *xvpp = ZTOV(xzp); + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xzpp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + if (error) + return (error); + + if (xzp != NULL) { + *xvpp = ZTOV(xzp); + zfs_dirent_unlock(dl); + return (0); + } + + ASSERT(zp->z_phys->zp_xattr == 0); + + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (ENOENT); + } + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + zfs_dirent_unlock(dl); + return (EROFS); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xvpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + + if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + return (0); + + if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(cr)); +} diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c new file mode 100644 index 000000000..236d69e7e --- /dev/null +++ b/module/zfs/zfs_fm.c @@ -0,0 +1,362 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +#include <sys/fm/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/sysevent.h> + +/* + * This general routine is responsible for generating all the different ZFS + * ereports. The payload is dependent on the class, and which arguments are + * supplied to the function: + * + * EREPORT POOL VDEV IO + * block X X X + * data X X + * device X X + * pool X + * + * If we are in a loading state, all errors are chained together by the same + * SPA-wide ENA (Error Numeric Association). + * + * For isolated I/O requests, we get the ENA from the zio_t. The propagation + * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want + * to chain together all ereports associated with a logical piece of data. For + * read I/Os, there are basically three 'types' of I/O, which form a roughly + * layered diagram: + * + * +---------------+ + * | Aggregate I/O | No associated logical data or device + * +---------------+ + * | + * V + * +---------------+ Reads associated with a piece of logical data. + * | Read I/O | This includes reads on behalf of RAID-Z, + * +---------------+ mirrors, gang blocks, retries, etc. + * | + * V + * +---------------+ Reads associated with a particular device, but + * | Physical I/O | no logical data. Issued as part of vdev caching + * +---------------+ and I/O aggregation. + * + * Note that 'physical I/O' here is not the same terminology as used in the rest + * of ZIO. Typically, 'physical I/O' simply means that there is no attached + * blockpointer. But I/O with no associated block pointer can still be related + * to a logical piece of data (i.e. RAID-Z requests). + * + * Purely physical I/O always have unique ENAs. They are not related to a + * particular piece of logical data, and therefore cannot be chained together. + * We still generate an ereport, but the DE doesn't correlate it with any + * logical piece of data. When such an I/O fails, the delegated I/O requests + * will issue a retry, which will trigger the 'real' ereport with the correct + * ENA. + * + * We keep track of the ENA for a ZIO chain through the 'io_logical' member. + * When a new logical I/O is issued, we set this to point to itself. Child I/Os + * then inherit this pointer, so that when it is first set subsequent failures + * will use the same ENA. For vdev cache fill and queue aggregation I/O, + * this pointer is set to NULL, and no ereport will be generated (since it + * doesn't actually correspond to any particular device or piece of data, + * and the caller will always retry without caching or queueing anyway). + */ +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport, *detector; + uint64_t ena; + char class[64]; + int state; + + /* + * If we are doing a spa_tryimport(), ignore errors. + */ + if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + return; + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa->spa_load_state != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return; + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return; + + /* + * Ignore any errors from speculative I/Os, as failure is an + * expected result. + */ + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + /* + * If the vdev has already been marked as failing due to a + * failed probe, then ignore any subsequent I/O errors, as the + * DE will automatically fault the vdev on the first such + * failure. + */ + if (vd != NULL && + (!vdev_readable(vd) || !vdev_writeable(vd)) && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + return; + } + + if ((ereport = fm_nvlist_create(NULL)) == NULL) + return; + + if ((detector = fm_nvlist_create(NULL)) == NULL) { + fm_nvlist_destroy(ereport, FM_NVA_FREE); + return; + } + + /* + * Serialize ereport generation + */ + mutex_enter(&spa->spa_errlist_lock); + + /* + * Determine the ENA to use for this event. If we are in a loading + * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use + * a root zio-wide ENA. Otherwise, simply use a unique ENA. + */ + if (spa->spa_load_state != SPA_LOAD_NONE) { + if (spa->spa_ena == 0) + spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); + ena = spa->spa_ena; + } else if (zio != NULL && zio->io_logical != NULL) { + if (zio->io_logical->io_ena == 0) + zio->io_logical->io_ena = + fm_ena_generate(0, FM_ENA_FMT1); + ena = zio->io_logical->io_ena; + } else { + ena = fm_ena_generate(0, FM_ENA_FMT1); + } + + /* + * Construct the full class, detector, and other standard FMA fields. + */ + (void) snprintf(class, sizeof (class), "%s.%s", + ZFS_ERROR_CLASS, subclass); + + fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), + vd != NULL ? vd->vdev_guid : 0); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); + + /* + * Construct the per-ereport payload, depending on which parameters are + * passed in. + */ + + /* + * If we are importing a faulted pool, then we treat it like an open, + * not an import. Otherwise, the DE will ignore all faults during + * import, since the default behavior is to mark the devices as + * persistently unavailable, not leave them in the faulted state. + */ + state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; + + /* + * Generic payload members common to all ereports. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, + DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + DATA_TYPE_UINT64, spa_guid(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, + state, NULL); + + if (spa != NULL) { + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + DATA_TYPE_STRING, + spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? + FM_EREPORT_FAILMODE_WAIT : + spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? + FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, + NULL); + } + + if (vd != NULL) { + vdev_t *pvd = vd->vdev_parent; + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + DATA_TYPE_UINT64, vd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); + if (vd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + DATA_TYPE_STRING, vd->vdev_path, NULL); + if (vd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, + DATA_TYPE_STRING, vd->vdev_devid, NULL); + + if (pvd != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, + DATA_TYPE_UINT64, pvd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, + DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, + NULL); + if (pvd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, + DATA_TYPE_STRING, pvd->vdev_path, NULL); + if (pvd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, + DATA_TYPE_STRING, pvd->vdev_devid, NULL); + } + } + + if (zio != NULL) { + /* + * Payload common to all I/Os. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + DATA_TYPE_INT32, zio->io_error, NULL); + + /* + * If the 'size' parameter is non-zero, it indicates this is a + * RAID-Z or other I/O where the physical offset and length are + * provided for us, instead of within the zio_t. + */ + if (vd != NULL) { + if (size) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, stateoroffset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, size, NULL); + else + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, zio->io_offset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, zio->io_size, NULL); + } + + /* + * Payload for I/Os with corresponding logical information. + */ + if (zio->io_logical != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_objset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_object, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + DATA_TYPE_INT64, + zio->io_logical->io_bookmark.zb_level, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_blkid, NULL); + } else if (vd != NULL) { + /* + * If we have a vdev but no zio, this is a device fault, and the + * 'stateoroffset' parameter indicates the previous state of the + * vdev. + */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + DATA_TYPE_UINT64, stateoroffset, NULL); + } + mutex_exit(&spa->spa_errlist_lock); + + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); +#endif +} + +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) +{ +#ifdef _KERNEL + nvlist_t *resource; + char class[64]; + + if ((resource = fm_nvlist_create(NULL)) == NULL) + return; + + (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, + ZFS_ERROR_CLASS, name); + VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); + VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); + if (vd) + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); + + fm_ereport_post(resource, EVCH_SLEEP); + + fm_nvlist_destroy(resource, FM_NVA_FREE); +#endif +} + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. + */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c new file mode 100644 index 000000000..7cb505258 --- /dev/null +++ b/module/zfs/zfs_fuid.c @@ -0,0 +1,704 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/sunddi.h> +#include <sys/dmu.h> +#include <sys/avl.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/nvpair.h> +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/sid.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#endif +#include <sys/zfs_fuid.h> + +/* + * FUID Domain table(s). + * + * The FUID table is stored as a packed nvlist of an array + * of nvlists which contain an index, domain string and offset + * + * During file system initialization the nvlist(s) are read and + * two AVL trees are created. One tree is keyed by the index number + * and the other by the domain string. Nodes are never removed from + * trees, but new entries may be added. If a new entry is added then the + * on-disk packed nvlist will also be updated. + */ + +#define FUID_IDX "fuid_idx" +#define FUID_DOMAIN "fuid_domain" +#define FUID_OFFSET "fuid_offset" +#define FUID_NVP_ARRAY "fuid_nvlist" + +typedef struct fuid_domain { + avl_node_t f_domnode; + avl_node_t f_idxnode; + ksiddomain_t *f_ksid; + uint64_t f_idx; +} fuid_domain_t; + +static char *nulldomain = ""; + +/* + * Compare two indexes. + */ +static int +idx_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + + if (node1->f_idx < node2->f_idx) + return (-1); + else if (node1->f_idx > node2->f_idx) + return (1); + return (0); +} + +/* + * Compare two domain strings. + */ +static int +domain_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + int val; + + val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * load initial fuid domain and idx trees. This function is used by + * both the kernel and zdb. + */ +uint64_t +zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, + avl_tree_t *domain_tree) +{ + dmu_buf_t *db; + uint64_t fuid_size; + + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); + + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + fuid_size = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + if (fuid_size) { + nvlist_t **fuidnvp; + nvlist_t *nvp = NULL; + uint_t count; + char *packed; + int i; + + packed = kmem_alloc(fuid_size, KM_SLEEP); + VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(nvlist_unpack(packed, fuid_size, + &nvp, 0) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count) == 0); + + for (i = 0; i != count; i++) { + fuid_domain_t *domnode; + char *domain; + uint64_t idx; + + VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain) == 0); + VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx) == 0); + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + + domnode->f_idx = idx; + domnode->f_ksid = ksid_lookupdomain(domain); + avl_add(idx_tree, domnode); + avl_add(domain_tree, domnode); + } + nvlist_free(nvp); + kmem_free(packed, fuid_size); + } + return (fuid_size); +} + +void +zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + fuid_domain_t *domnode; + void *cookie; + + cookie = NULL; + while (domnode = avl_destroy_nodes(domain_tree, &cookie)) + ksiddomain_rele(domnode->f_ksid); + + avl_destroy(domain_tree); + cookie = NULL; + while (domnode = avl_destroy_nodes(idx_tree, &cookie)) + kmem_free(domnode, sizeof (fuid_domain_t)); + avl_destroy(idx_tree); +} + +char * +zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + searchnode.f_idx = idx; + + findnode = avl_find(idx_tree, &searchnode, &loc); + + return (findnode ? findnode->f_ksid->kd_name : nulldomain); +} + +#ifdef _KERNEL +/* + * Load the fuid table(s) into memory. + */ +static void +zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + int error = 0; + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + if (zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + + if (zfsvfs->z_fuid_obj == 0) { + + /* first make sure we need to allocate object */ + + error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); + if (error == ENOENT && tx != NULL) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + } + + if (zfsvfs->z_fuid_obj != 0) { + zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, + zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, + &zfsvfs->z_fuid_domain); + zfsvfs->z_fuid_loaded = B_TRUE; + } + + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Query domain table for a given domain. + * + * If domain isn't found it is added to AVL trees and + * the results are pushed out to disk. + */ +int +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, + dmu_tx_t *tx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + krw_t rw = RW_READER; + + /* + * If the dummy "nobody" domain then return an index of 0 + * to cause the created FUID to be a standard POSIX id + * for the user nobody. + */ + if (domain[0] == '\0') { + *retdomain = nulldomain; + return (0); + } + + searchnode.f_ksid = ksid_lookupdomain(domain); + if (retdomain) { + *retdomain = searchnode.f_ksid->kd_name; + } + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, tx); + +retry: + rw_enter(&zfsvfs->z_fuid_lock, rw); + findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); + + if (findnode) { + rw_exit(&zfsvfs->z_fuid_lock); + ksiddomain_rele(searchnode.f_ksid); + return (findnode->f_idx); + } else { + fuid_domain_t *domnode; + nvlist_t *nvp; + nvlist_t **fuids; + uint64_t retidx; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + int i = 0; + + if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { + rw_exit(&zfsvfs->z_fuid_lock); + rw = RW_WRITER; + goto retry; + } + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + domnode->f_ksid = searchnode.f_ksid; + + retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; + + avl_add(&zfsvfs->z_fuid_domain, domnode); + avl_add(&zfsvfs->z_fuid_idx, domnode); + /* + * Now resync the on-disk nvlist. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + domnode = avl_first(&zfsvfs->z_fuid_domain); + fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); + while (domnode) { + VERIFY(nvlist_alloc(&fuids[i], + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], + FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, retidx) == 0); + for (i = 0; i != retidx; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, retidx * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + rw_exit(&zfsvfs->z_fuid_lock); + return (retidx); + } +} + +/* + * Query domain table by index, returning domain string + * + * Returns a pointer from an avl node of the domain string. + * + */ +static char * +zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) +{ + char *domain; + + if (idx == 0 || !zfsvfs->z_use_fuids) + return (NULL); + + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, NULL); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + + if (zfsvfs->z_fuid_obj) + domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); + else + domain = nulldomain; + rw_exit(&zfsvfs->z_fuid_lock); + + ASSERT(domain); + return (domain); +} + +void +zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) +{ + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, + cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, + cr, ZFS_GROUP); +} + +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + char *domain; + uid_t id; + + if (index == 0) + return (fuid); + + domain = zfs_fuid_find_by_idx(zfsvfs, index); + ASSERT(domain != NULL); + + if (type == ZFS_OWNER || type == ZFS_ACE_USER) { + (void) kidmap_getuidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } else { + (void) kidmap_getgidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } + return (id); +} + +/* + * Add a FUID node to the list of fuid's being created for this + * ACL + * + * If ACL has multiple domains, then keep only one copy of each unique + * domain. + */ +static void +zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, + uint64_t idx, uint64_t id, zfs_fuid_type_t type) +{ + zfs_fuid_t *fuid; + zfs_fuid_domain_t *fuid_domain; + zfs_fuid_info_t *fuidp; + uint64_t fuididx; + boolean_t found = B_FALSE; + + if (*fuidpp == NULL) + *fuidpp = zfs_fuid_info_alloc(); + + fuidp = *fuidpp; + /* + * First find fuid domain index in linked list + * + * If one isn't found then create an entry. + */ + + for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); + fuid_domain; fuid_domain = list_next(&fuidp->z_domains, + fuid_domain), fuididx++) { + if (idx == fuid_domain->z_domidx) { + found = B_TRUE; + break; + } + } + + if (!found) { + fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); + fuid_domain->z_domain = domain; + fuid_domain->z_domidx = idx; + list_insert_tail(&fuidp->z_domains, fuid_domain); + fuidp->z_domain_str_sz += strlen(domain) + 1; + fuidp->z_domain_cnt++; + } + + if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* + * Now allocate fuid entry and add it on the end of the list + */ + + fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + fuid->z_id = id; + fuid->z_domidx = idx; + fuid->z_logfuid = FUID_ENCODE(fuididx, rid); + + list_insert_tail(&fuidp->z_fuids, fuid); + fuidp->z_fuid_cnt++; + } else { + if (type == ZFS_OWNER) + fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); + else + fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); + } +} + +/* + * Create a file system FUID, based on information in the users cred + */ +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uint64_t idx; + ksid_t *ksid; + uint32_t rid; + char *kdomain; + const char *domain; + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); + if (ksid) { + id = ksid_getid(ksid); + } else { + if (type == ZFS_OWNER) + id = crgetuid(cr); + else + id = crgetgid(cr); + } + + if (!zfsvfs->z_use_fuids || (!IS_EPHEMERAL(id))) + return ((uint64_t)id); + + rid = ksid_getrid(ksid); + domain = ksid_getdomain(ksid); + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); + + return (FUID_ENCODE(idx, rid)); +} + +/* + * Create a file system FUID for an ACL ace + * or a chown/chgrp of the file. + * This is similar to zfs_fuid_create_cred, except that + * we can't find the domain + rid information in the + * cred. Instead we have to query Winchester for the + * domain and rid. + * + * During replay operations the domain+rid information is + * found in the zfs_fuid_info_t that the replay code has + * attached to the zfsvfs of the file system. + */ +uint64_t +zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, + zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) +{ + const char *domain; + char *kdomain; + uint32_t fuid_idx = FUID_INDEX(id); + uint32_t rid; + idmap_stat status; + uint64_t idx; + boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); + zfs_fuid_t *zfuid = NULL; + zfs_fuid_info_t *fuidp; + + /* + * If POSIX ID, or entry is already a FUID then + * just return the id + * + * We may also be handed an already FUID'ized id via + * chmod. + */ + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) + return (id); + + if (is_replay) { + fuidp = zfsvfs->z_fuid_replay; + + /* + * If we are passed an ephemeral id, but no + * fuid_info was logged then return NOBODY. + * This is most likely a result of idmap service + * not being available. + */ + if (fuidp == NULL) + return (UID_NOBODY); + + switch (type) { + case ZFS_ACE_USER: + case ZFS_ACE_GROUP: + zfuid = list_head(&fuidp->z_fuids); + rid = FUID_RID(zfuid->z_logfuid); + idx = FUID_INDEX(zfuid->z_logfuid); + break; + case ZFS_OWNER: + rid = FUID_RID(fuidp->z_fuid_owner); + idx = FUID_INDEX(fuidp->z_fuid_owner); + break; + case ZFS_GROUP: + rid = FUID_RID(fuidp->z_fuid_group); + idx = FUID_INDEX(fuidp->z_fuid_group); + break; + }; + domain = fuidp->z_domain_table[idx -1]; + } else { + if (type == ZFS_OWNER || type == ZFS_ACE_USER) + status = kidmap_getsidbyuid(crgetzone(cr), id, + &domain, &rid); + else + status = kidmap_getsidbygid(crgetzone(cr), id, + &domain, &rid); + + if (status != 0) { + /* + * When returning nobody we will need to + * make a dummy fuid table entry for logging + * purposes. + */ + rid = UID_NOBODY; + domain = nulldomain; + } + } + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + if (!is_replay) + zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + else if (zfuid != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + return (FUID_ENCODE(idx, rid)); +} + +void +zfs_fuid_destroy(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + if (!zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Allocate zfs_fuid_info for tracking FUIDs created during + * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() + */ +zfs_fuid_info_t * +zfs_fuid_info_alloc(void) +{ + zfs_fuid_info_t *fuidp; + + fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); + list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), + offsetof(zfs_fuid_domain_t, z_next)); + list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), + offsetof(zfs_fuid_t, z_next)); + return (fuidp); +} + +/* + * Release all memory associated with zfs_fuid_info_t + */ +void +zfs_fuid_info_free(zfs_fuid_info_t *fuidp) +{ + zfs_fuid_t *zfuid; + zfs_fuid_domain_t *zdomain; + + while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + + if (fuidp->z_domain_table != NULL) + kmem_free(fuidp->z_domain_table, + (sizeof (char **)) * fuidp->z_domain_cnt); + + while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { + list_remove(&fuidp->z_domains, zdomain); + kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); + } + + kmem_free(fuidp, sizeof (zfs_fuid_info_t)); +} + +/* + * Check to see if id is a groupmember. If cred + * has ksid info then sidlist is checked first + * and if still not found then POSIX groups are checked + * + * Will use a straight FUID compare when possible. + */ +boolean_t +zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) +{ + ksid_t *ksid = crgetsid(cr, KSID_GROUP); + uid_t gid; + + if (ksid) { + int i; + ksid_t *ksid_groups; + ksidlist_t *ksidlist = crgetsidlist(cr); + uint32_t idx = FUID_INDEX(id); + uint32_t rid = FUID_RID(id); + + ASSERT(ksidlist); + ksid_groups = ksidlist->ksl_sids; + + for (i = 0; i != ksidlist->ksl_nsid; i++) { + if (idx == 0) { + if (id != IDMAP_WK_CREATOR_GROUP_GID && + id == ksid_groups[i].ks_id) { + return (B_TRUE); + } + } else { + char *domain; + + domain = zfs_fuid_find_by_idx(zfsvfs, idx); + ASSERT(domain != NULL); + + if (strcmp(domain, + IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) + return (B_FALSE); + + if ((strcmp(domain, + ksid_groups[i].ks_domain->kd_name) == 0) && + rid == ksid_groups[i].ks_rid) + return (B_TRUE); + } + } + } + + /* + * Not found in ksidlist, check posix groups + */ + gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); + return (groupmember(gid, cr)); +} +#endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c new file mode 100644 index 000000000..b6ad57451 --- /dev/null +++ b/module/zfs/zfs_ioctl.c @@ -0,0 +1,3175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/modctl.h> +#include <sys/open.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/nvpair.h> +#include <sys/pathname.h> +#include <sys/mount.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> +#include <sys/zvol.h> +#include <sharefs/share.h> +#include <sys/dmu_objset.h> + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" + +extern struct modlfs zfs_modlfs; + +extern void zfs_init(void); +extern void zfs_fini(void); + +ldi_ident_t zfs_li = NULL; +dev_info_t *zfs_dip; + +typedef int zfs_ioc_func_t(zfs_cmd_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); + +typedef struct zfs_ioc_vec { + zfs_ioc_func_t *zvec_func; + zfs_secpolicy_func_t *zvec_secpolicy; + enum { + NO_NAME, + POOL_NAME, + DATASET_NAME + } zvec_namecheck; + boolean_t zvec_his_log; +} zfs_ioc_vec_t; + +static void clear_props(char *dataset, nvlist_t *props); +static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, + boolean_t *); +int zfs_set_prop_nvlist(const char *, nvlist_t *); + +/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ +void +__dprintf(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char buf[256]; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + /* + * To get this data, use the zfs-dprintf probe as so: + * dtrace -q -n 'zfs-dprintf \ + * /stringof(arg0) == "dbuf.c"/ \ + * {printf("%s: %s", stringof(arg1), stringof(arg3))}' + * arg0 = file name + * arg1 = function name + * arg2 = line number + * arg3 = message + */ + DTRACE_PROBE4(zfs__dprintf, + char *, newfile, char *, func, int, line, char *, buf); +} + +static void +history_str_free(char *buf) +{ + kmem_free(buf, HIS_MAX_RECORD_LEN); +} + +static char * +history_str_get(zfs_cmd_t *zc) +{ + char *buf; + + if (zc->zc_history == NULL) + return (NULL); + + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + if (copyinstr((void *)(uintptr_t)zc->zc_history, + buf, HIS_MAX_RECORD_LEN, NULL) != 0) { + history_str_free(buf); + return (NULL); + } + + buf[HIS_MAX_RECORD_LEN -1] = '\0'; + + return (buf); +} + +/* + * Check to see if the named dataset is currently defined as bootable + */ +static boolean_t +zfs_is_bootfs(const char *name) +{ + spa_t *spa; + boolean_t ret = B_FALSE; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa->spa_bootfs) { + objset_t *os; + + if (dmu_objset_open(name, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + ret = (dmu_objset_id(os) == spa->spa_bootfs); + dmu_objset_close(os); + } + } + spa_close(spa, FTAG); + } + return (ret); +} + +/* + * zfs_earlier_version + * + * Return non-zero if the spa version is less than requested version. + */ +static int +zfs_earlier_version(const char *name, int version) +{ + spa_t *spa; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa_version(spa) < version) { + spa_close(spa, FTAG); + return (1); + } + spa_close(spa, FTAG); + } + return (0); +} + +/* + * zpl_earlier_version + * + * Return TRUE if the ZPL version is less than requested version. + */ +static boolean_t +zpl_earlier_version(const char *name, int version) +{ + objset_t *os; + boolean_t rc = B_TRUE; + + if (dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + uint64_t zplversion; + + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) + rc = zplversion < version; + dmu_objset_close(os); + } + return (rc); +} + +static void +zfs_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *buf; + + if ((buf = history_str_get(zc)) == NULL) + return; + + if (spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) + (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); + spa_close(spa, FTAG); + } + history_str_free(buf); +} + +/* + * Policy for top-level read operations (list pools). Requires no privileges, + * and can be used in the local zone, as there is no associated dataset. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) +{ + return (0); +} + +/* + * Policy for dataset read operations (list children, get statistics). Requires + * no privileges, but must be visible in the local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) +{ + if (INGLOBALZONE(curproc) || + zone_dataset_visible(zc->zc_name, NULL)) + return (0); + + return (ENOENT); +} + +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + int writable = 1; + + /* + * The dataset must be visible by this zone -- check this first + * so they don't see EPERM on something they shouldn't know about. + */ + if (!INGLOBALZONE(curproc) && + !zone_dataset_visible(dataset, &writable)) + return (ENOENT); + + if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) + return (ENOENT); + + if (INGLOBALZONE(curproc)) { + /* + * If the fs is zoned, only root can access it from the + * global zone. + */ + if (secpolicy_zfs(cr) && zoned) + return (EPERM); + } else { + /* + * If we are in a local zone, the 'zoned' property must be set. + */ + if (!zoned) + return (EPERM); + + /* must be writable by this zone */ + if (!writable) + return (EPERM); + } + return (0); +} + +int +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(name, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error) + error = dsl_deleg_access(name, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +{ + /* + * Check permissions for special properties. + */ + switch (prop) { + case ZFS_PROP_ZONED: + /* + * Disallow setting of 'zoned' from within a local zone. + */ + if (!INGLOBALZONE(curproc)) + return (EPERM); + break; + + case ZFS_PROP_QUOTA: + if (!INGLOBALZONE(curproc)) { + uint64_t zoned; + char setpoint[MAXNAMELEN]; + /* + * Unprivileged users are allowed to modify the + * quota on things *under* (ie. contained by) + * the thing they own. + */ + if (dsl_prop_get_integer(name, "zoned", &zoned, + setpoint)) + return (EPERM); + if (!zoned || strlen(name) <= strlen(setpoint)) + return (EPERM); + } + break; + } + + return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); +} + +int +zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(zc->zc_name, cr); + if (error) + return (error); + + /* + * permission to set permissions will be evaluated later in + * dsl_deleg_can_allow() + */ + return (0); +} + +int +zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr); + if (error == 0) + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + return (error); +} + +int +zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr)); +} + +int +zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curproc)) + return (EPERM); + + if (secpolicy_nfs(cr) == 0) { + return (0); + } else { + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); + } +} + +static int +zfs_get_parent(const char *datasetname, char *parent, int parentsize) +{ + char *cp; + + /* + * Remove the @bla or /bla from the end of the name to get the parent. + */ + (void) strncpy(parent, datasetname, parentsize); + cp = strrchr(parent, '@'); + if (cp != NULL) { + cp[0] = '\0'; + } else { + cp = strrchr(parent, '/'); + if (cp == NULL) + return (ENOENT); + cp[0] = '\0'; + } + + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); +} + +static int +zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); +} + +/* + * Must have sys_config privilege to check the iscsi permission + */ +/* ARGSUSED */ +static int +zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) +{ + return (secpolicy_zfs(cr)); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_RENAME, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + if ((error = zfs_get_parent(to, parentname, + sizeof (parentname))) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (error); +} + +static int +zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); +} + +static int +zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + objset_t *clone; + int error; + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_PROMOTE, cr); + if (error) + return (error); + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &clone); + + if (error == 0) { + dsl_dataset_t *pclone = NULL; + dsl_dir_t *dd; + dd = clone->os->os_dsl_dataset->ds_dir; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &pclone); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (error) { + dmu_objset_close(clone); + return (error); + } + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + + dsl_dataset_name(pclone, parentname); + dmu_objset_close(clone); + dsl_dataset_rele(pclone, FTAG); + if (error == 0) + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_PROMOTE, cr); + } + return (error); +} + +static int +zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RECEIVE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_CREATE, cr)); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) +{ + + return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); +} + +static int +zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (zc->zc_value[0] != '\0') { + if ((error = zfs_secpolicy_write_perms(zc->zc_value, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + } + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + error = secpolicy_fs_unmount(cr, NULL); + if (error) { + error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); + } + return (error); +} + +/* + * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires + * SYS_CONFIG privilege, which is not available in a local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/* + * Just like zfs_secpolicy_config, except that we will check for + * mount permission on the dataset for permission to create/remove + * the minor nodes. + */ +static int +zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)); + } + + return (0); +} + +/* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) +{ + return (secpolicy_zinject(cr)); +} + +static int +zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) +{ + zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(zc->zc_value)) + return (EINVAL); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_USERPROP, cr)); + } else { + if (!zfs_prop_inheritable(prop)) + return (EINVAL); + return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + } +} + +/* + * Returns the nvlist as specified by the user in the zfs_cmd_t. + */ +static int +get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) +{ + char *packed; + int error; + nvlist_t *list = NULL; + + /* + * Read in and unpack the user-supplied nvlist. + */ + if (size == 0) + return (EINVAL); + + packed = kmem_alloc(size, KM_SLEEP); + + if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { + kmem_free(packed, size); + return (error); + } + + if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { + kmem_free(packed, size); + return (error); + } + + kmem_free(packed, size); + + *nvp = list; + return (0); +} + +static int +put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + size_t size; + int error; + + VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + + if (size > zc->zc_nvlist_dst_size) { + error = ENOMEM; + } else { + packed = kmem_alloc(size, KM_SLEEP); + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); + error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size); + kmem_free(packed, size); + } + + zc->zc_nvlist_dst_size = size; + return (error); +} + +static int +zfs_ioc_pool_create(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config, *props = NULL; + nvlist_t *rootprops = NULL; + nvlist_t *zplprops = NULL; + char *buf; + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); + return (error); + } + + if (props) { + nvlist_t *nvl = NULL; + uint64_t version = SPA_VERSION; + + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); + if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + error = EINVAL; + goto pool_props_bad; + } + (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); + if (nvl) { + error = nvlist_dup(nvl, &rootprops, KM_SLEEP); + if (error != 0) { + nvlist_free(config); + nvlist_free(props); + return (error); + } + (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); + } + VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops_root(version, rootprops, + zplprops, NULL); + if (error) + goto pool_props_bad; + } + + buf = history_str_get(zc); + + error = spa_create(zc->zc_name, config, props, buf, zplprops); + + /* + * Set the remaining root properties + */ + if (!error && + (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0) + (void) spa_destroy(zc->zc_name); + + if (buf != NULL) + history_str_free(buf); + +pool_props_bad: + nvlist_free(rootprops); + nvlist_free(zplprops); + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_destroy(zfs_cmd_t *zc) +{ + int error; + zfs_log_history(zc); + error = spa_destroy(zc->zc_name); + return (error); +} + +static int +zfs_ioc_pool_import(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config, *props = NULL; + uint64_t guid; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) != 0) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); + return (error); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != zc->zc_guid) + error = EINVAL; + else if (zc->zc_cookie) + error = spa_import_faulted(zc->zc_name, config, + props); + else + error = spa_import(zc->zc_name, config, props); + + nvlist_free(config); + + if (props) + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_export(zfs_cmd_t *zc) +{ + int error; + boolean_t force = (boolean_t)zc->zc_cookie; + + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force); + return (error); +} + +static int +zfs_ioc_pool_configs(zfs_cmd_t *zc) +{ + nvlist_t *configs; + int error; + + if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) + return (EEXIST); + + error = put_nvlist(zc, configs); + + nvlist_free(configs); + + return (error); +} + +static int +zfs_ioc_pool_stats(zfs_cmd_t *zc) +{ + nvlist_t *config; + int error; + int ret = 0; + + error = spa_get_stats(zc->zc_name, &config, zc->zc_value, + sizeof (zc->zc_value)); + + if (config != NULL) { + ret = put_nvlist(zc, config); + nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; + } else { + ret = error; + } + + return (ret); +} + +/* + * Try to import the given pool, returning pool stats as appropriate so that + * user land knows which devices are available and overall pool health. + */ +static int +zfs_ioc_pool_tryimport(zfs_cmd_t *zc) +{ + nvlist_t *tryconfig, *config; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &tryconfig)) != 0) + return (error); + + config = spa_tryimport(tryconfig); + + nvlist_free(tryconfig); + + if (config == NULL) + return (EINVAL); + + error = put_nvlist(zc, config); + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_scrub(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_scrub(spa, zc->zc_cookie); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_freeze(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + spa_freeze(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_pool_upgrade(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + spa_close(spa, FTAG); + return (EINVAL); + } + + spa_upgrade(spa, zc->zc_cookie); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *hist_buf; + uint64_t size; + int error; + + if ((size = zc->zc_history_len) == 0) + return (EINVAL); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (ENOTSUP); + } + + hist_buf = kmem_alloc(size, KM_SLEEP); + if ((error = spa_history_get(spa, &zc->zc_history_offset, + &zc->zc_history_len, hist_buf)) == 0) { + error = xcopyout(hist_buf, + (char *)(uintptr_t)zc->zc_history, + zc->zc_history_len); + } + + spa_close(spa, FTAG); + kmem_free(hist_buf, size); + return (error); +} + +static int +zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) +{ + int error; + + if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)) + return (error); + + return (0); +} + +static int +zfs_ioc_obj_to_path(zfs_cmd_t *zc) +{ + objset_t *osp; + int error; + + if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0) + return (error); + error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_close(osp); + + return (error); +} + +static int +zfs_ioc_vdev_add(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *config, **l2cache, **spares; + uint_t nl2cache = 0, nspares = 0; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config); + (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache); + + (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, + &spares, &nspares); + + /* + * A root pool with concatenated devices is not supported. + * Thus, can not add a device to a root pool. + * + * Intent log device can not be added to a rootpool because + * during mountroot, zil is replayed, a seperated log device + * can not be accessed during the mountroot time. + * + * l2cache and spare devices are ok to be added to a rootpool. + */ + if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { + spa_close(spa, FTAG); + return (EDOM); + } + + if (error == 0) { + error = spa_vdev_add(spa, config); + nvlist_free(config); + } + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_remove(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; + + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; + + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_attach(zfs_cmd_t *zc) +{ + spa_t *spa; + int replacing = zc->zc_cookie; + nvlist_t *config; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) == 0) { + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_detach(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + nvlist_t *nv; + + if (error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + return (error); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) { + dmu_objset_stats(os, nv); + /* + * NB: zvol_get_stats() will read the objset contents, + * which we aren't supposed to do with a + * DS_MODE_USER hold, because it could be + * inconsistent. So this is a bit of a workaround... + */ + if (!zc->zc_objset_stats.dds_inconsistent) { + if (dmu_objset_type(os) == DMU_OST_ZVOL) + VERIFY(zvol_get_stats(os, nv) == 0); + } + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + dmu_objset_close(os); + return (error); +} + +static int +nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) +{ + uint64_t value; + int error; + + /* + * zfs_get_zplprop() will either find a value or give us + * the default value (if there is one). + */ + if ((error = zfs_get_zplprop(os, prop, &value)) != 0) + return (error); + VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for zpl property nvlist + * + * outputs: + * zc_nvlist_dst zpl property nvlist + * zc_nvlist_dst_size size of zpl property nvlist + */ +static int +zfs_ioc_objset_zplprops(zfs_cmd_t *zc) +{ + objset_t *os; + int err; + + if (err = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + return (err); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + /* + * NB: nvl_add_zplprop() will read the objset contents, + * which we aren't supposed to do with a DS_MODE_USER + * hold, because it could be inconsistent. + */ + if (zc->zc_nvlist_dst != NULL && + !zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZFS) { + nvlist_t *nv; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) + err = put_nvlist(zc, nv); + nvlist_free(nv); + } else { + err = ENOENT; + } + dmu_objset_close(os); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next filesystem + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_dataset_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + char *p; + + if (error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) { + if (error == ENOENT) + error = ESRCH; + return (error); + } + + p = strrchr(zc->zc_name, '/'); + if (p == NULL || p[1] != '\0') + (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); + p = zc->zc_name + strlen(zc->zc_name); + + do { + error = dmu_dir_list_next(os, + sizeof (zc->zc_name) - (p - zc->zc_name), p, + NULL, &zc->zc_cookie); + if (error == ENOENT) + error = ESRCH; + } while (error == 0 && !INGLOBALZONE(curproc) && + !zone_dataset_visible(zc->zc_name, NULL)); + dmu_objset_close(os); + + /* + * If it's a hidden dataset (ie. with a '$' in its name), don't + * try to get stats for it. Userland will skip over it. + */ + if (error == 0 && strchr(zc->zc_name, '$') == NULL) + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next snapshot + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); + if (error) + return (error == ENOENT ? ESRCH : error); + + /* + * A dataset name of maximum length cannot have any snapshots, + * so exit immediately. + */ + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { + dmu_objset_close(os); + return (ESRCH); + } + + error = dmu_snapshot_list_next(os, + sizeof (zc->zc_name) - strlen(zc->zc_name), + zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); + dmu_objset_close(os); + if (error == 0) + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + else if (error == ENOENT) + error = ESRCH; + + /* if we failed, undo the @ that we tacked on to zc_name */ + if (error) + *strchr(zc->zc_name, '@') = '\0'; + return (error); +} + +int +zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) +{ + nvpair_t *elem; + int error; + uint64_t intval; + char *strval; + + /* + * First validate permission to set all of the properties + */ + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + + if (prop == ZPROP_INVAL) { + /* + * If this is a user-defined property, it must be a + * string, and there is no further validation to do. + */ + if (!zfs_prop_user(propname) || + nvpair_type(elem) != DATA_TYPE_STRING) + return (EINVAL); + + if (error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + continue; + } + + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) + return (error); + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. + */ + if (nvpair_type(elem) == DATA_TYPE_UINT64 && + nvpair_value_uint64(elem, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(name, + SPA_VERSION_GZIP_COMPRESSION)) + return (ENOTSUP); + + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. + */ + if (zfs_is_bootfs(name) && + !BOOTFS_COMPRESS_VALID(intval)) + return (ERANGE); + } + break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(name, + SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(name, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + + case ZFS_PROP_ACLINHERIT: + if (nvpair_type(elem) == DATA_TYPE_UINT64 && + nvpair_value_uint64(elem, &intval) == 0) + if (intval == ZFS_ACL_PASSTHROUGH_X && + zfs_earlier_version(name, + SPA_VERSION_PASSTHROUGH_X)) + return (ENOTSUP); + } + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + + if (prop == ZPROP_INVAL) { + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + return (error); + } + + switch (prop) { + case ZFS_PROP_QUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dir_set_quota(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_REFQUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_quota(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_RESERVATION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dir_set_reservation(name, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_REFRESERVATION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_reservation(name, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_VOLSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volsize(name, + ddi_driver_major(zfs_dip), intval)) != 0) + return (error); + break; + + case ZFS_PROP_VOLBLOCKSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volblocksize(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_VERSION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zfs_set_version(name, intval)) != 0) + return (error); + break; + + default: + if (nvpair_type(elem) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != + PROP_TYPE_STRING) + return (EINVAL); + VERIFY(nvpair_value_string(elem, &strval) == 0); + if ((error = dsl_prop_set(name, + nvpair_name(elem), 1, strlen(strval) + 1, + strval)) != 0) + return (error); + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + const char *unused; + + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + break; + case PROP_TYPE_STRING: + return (EINVAL); + case PROP_TYPE_INDEX: + if (zfs_prop_index_to_string(prop, + intval, &unused) != 0) + return (EINVAL); + break; + default: + cmn_err(CE_PANIC, + "unknown property type"); + break; + } + + if ((error = dsl_prop_set(name, propname, + 8, 1, &intval)) != 0) + return (error); + } else { + return (EINVAL); + } + break; + } + } + + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_cookie clear existing local props? + * + * outputs: none + */ +static int +zfs_ioc_set_prop(zfs_cmd_t *zc) +{ + nvlist_t *nvl; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvl)) != 0) + return (error); + + if (zc->zc_cookie) { + nvlist_t *origprops; + objset_t *os; + + if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { + clear_props(zc->zc_name, origprops); + nvlist_free(origprops); + } + dmu_objset_close(os); + } + + } + + error = zfs_set_prop_nvlist(zc->zc_name, nvl); + + nvlist_free(nvl); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * + * outputs: none + */ +static int +zfs_ioc_inherit_prop(zfs_cmd_t *zc) +{ + /* the property name has been validated by zfs_secpolicy_inherit() */ + return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); +} + +static int +zfs_ioc_pool_set_props(zfs_cmd_t *zc) +{ + nvlist_t *props; + spa_t *spa; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props))) + return (error); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + nvlist_free(props); + return (error); + } + + error = spa_prop_set(spa, props); + + nvlist_free(props); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_props(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *nvp = NULL; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_prop_get(spa, &nvp); + + if (error == 0 && zc->zc_nvlist_dst != NULL) + error = put_nvlist(zc, nvp); + else + error = EFAULT; + + spa_close(spa, FTAG); + + if (nvp) + nvlist_free(nvp); + return (error); +} + +static int +zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + uint32_t uid; + uint32_t gid; + uint32_t *groups; + uint_t group_cnt; + cred_t *usercred; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvp)) != 0) { + return (error); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_UID, &uid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_GID, &gid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, + &groups, &group_cnt)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + usercred = cralloc(); + if ((crsetugid(usercred, uid, gid) != 0) || + (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) { + nvlist_free(nvp); + crfree(usercred); + return (EPERM); + } + nvlist_free(nvp); + error = dsl_deleg_access(zc->zc_name, + zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); + crfree(usercred); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_src{_size} nvlist of delegated permissions + * zc_perm_action allow/unallow flag + * + * outputs: none + */ +static int +zfs_ioc_set_fsacl(zfs_cmd_t *zc) +{ + int error; + nvlist_t *fsaclnv = NULL; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &fsaclnv)) != 0) + return (error); + + /* + * Verify nvlist is constructed correctly + */ + if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + nvlist_free(fsaclnv); + return (EINVAL); + } + + /* + * If we don't have PRIV_SYS_MOUNT, then validate + * that user is allowed to hand out each permission in + * the nvlist(s) + */ + + error = secpolicy_zfs(CRED()); + if (error) { + if (zc->zc_perm_action == B_FALSE) { + error = dsl_deleg_can_allow(zc->zc_name, + fsaclnv, CRED()); + } else { + error = dsl_deleg_can_unallow(zc->zc_name, + fsaclnv, CRED()); + } + } + + if (error == 0) + error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); + + nvlist_free(fsaclnv); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of delegated permissions + */ +static int +zfs_ioc_get_fsacl(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ +static int +zfs_ioc_create_minor(zfs_cmd_t *zc) +{ + return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); +} + +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ +static int +zfs_ioc_remove_minor(zfs_cmd_t *zc) +{ + return (zvol_remove_minor(zc->zc_name)); +} + +/* + * Search the vfs list for a specified resource. Returns a pointer to it + * or NULL if no suitable entry is found. The caller of this routine + * is responsible for releasing the returned vfs pointer. + */ +static vfs_t * +zfs_get_vfs(const char *resource) +{ + struct vfs *vfsp; + struct vfs *vfs_found = NULL; + + vfs_list_read_lock(); + vfsp = rootvfs; + do { + if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { + VFS_HOLD(vfsp); + vfs_found = vfsp; + break; + } + vfsp = vfsp->vfs_next; + } while (vfsp != rootvfs); + vfs_list_unlock(); + return (vfs_found); +} + +/* ARGSUSED */ +static void +zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + + zfs_create_fs(os, cr, zct->zct_zplprops, tx); +} + +#define ZFS_PROP_UNDEFINED ((uint64_t)-1) + +/* + * inputs: + * createprops list of properties requested by creator + * default_zplver zpl version to use if unspecified in createprops + * fuids_ok fuids allowed in this version of the spa? + * os parent objset pointer (NULL if root fs) + * + * outputs: + * zplprops values for the zplprops we attach to the master node object + * is_ci true if requested file system will be purely case-insensitive + * + * Determine the settings for utf8only, normalization and + * casesensitivity. Specific values may have been requested by the + * creator and/or we can inherit values from the parent dataset. If + * the file system is of too early a vintage, a creator can not + * request settings for these properties, even if the requested + * setting is the default value. We don't actually want to create dsl + * properties for these, so remove them from the source nvlist after + * processing. + */ +static int +zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, + boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, + boolean_t *is_ci) +{ + uint64_t zplver = default_zplver; + uint64_t sense = ZFS_PROP_UNDEFINED; + uint64_t norm = ZFS_PROP_UNDEFINED; + uint64_t u8 = ZFS_PROP_UNDEFINED; + + ASSERT(zplprops != NULL); + + /* + * Pull out creator prop choices, if any. + */ + if (createprops) { + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE)); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_CASE), &sense); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_CASE)); + } + + /* + * If the zpl version requested is whacky or the file system + * or pool is version is too "young" to support normalization + * and the creator tried to set a value for one of the props, + * error out. + */ + if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || + (zplver >= ZPL_VERSION_FUID && !fuids_ok) || + (zplver < ZPL_VERSION_NORMALIZATION && + (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || + sense != ZFS_PROP_UNDEFINED))) + return (ENOTSUP); + + /* + * Put the version in the zplprops + */ + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); + + if (norm == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); + + /* + * If we're normalizing, names must always be valid UTF-8 strings. + */ + if (norm) + u8 = 1; + if (u8 == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + + if (sense == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + + if (is_ci) + *is_ci = (sense == ZFS_CASE_INSENSITIVE); + + return (0); +} + +static int +zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok = B_TRUE; + uint64_t zplver = ZPL_VERSION; + objset_t *os = NULL; + char parentname[MAXNAMELEN]; + char *cp; + int error; + + (void) strlcpy(parentname, dataset, sizeof (parentname)); + cp = strrchr(parentname, '/'); + ASSERT(cp != NULL); + cp[0] = '\0'; + + if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { + zplver = ZPL_VERSION_FUID - 1; + fuids_ok = B_FALSE; + } + + /* + * Open parent object set so we can inherit zplprop values. + */ + if ((error = dmu_objset_open(parentname, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) + return (error); + + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, + zplprops, is_ci); + dmu_objset_close(os); + return (error); +} + +static int +zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok = B_TRUE; + uint64_t zplver = ZPL_VERSION; + int error; + + if (spa_vers < SPA_VERSION_FUID) { + zplver = ZPL_VERSION_FUID - 1; + fuids_ok = B_FALSE; + } + + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops, + zplprops, is_ci); + return (error); +} + +/* + * inputs: + * zc_objset_type type of objset to create (fs vs zvol) + * zc_name name of new objset + * zc_value name of snapshot to clone from (may be empty) + * zc_nvlist_src{_size} nvlist of properties to apply + * + * outputs: none + */ +static int +zfs_ioc_create(zfs_cmd_t *zc) +{ + objset_t *clone; + int error = 0; + zfs_creat_t zct; + nvlist_t *nvprops = NULL; + void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); + dmu_objset_type_t type = zc->zc_objset_type; + + switch (type) { + + case DMU_OST_ZFS: + cbfunc = zfs_create_cb; + break; + + case DMU_OST_ZVOL: + cbfunc = zvol_create_cb; + break; + + default: + cbfunc = NULL; + break; + } + if (strchr(zc->zc_name, '@') || + strchr(zc->zc_name, '%')) + return (EINVAL); + + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) + return (error); + + zct.zct_zplprops = NULL; + zct.zct_props = nvprops; + + if (zc->zc_value[0] != '\0') { + /* + * We're creating a clone of an existing snapshot. + */ + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { + nvlist_free(nvprops); + return (EINVAL); + } + + error = dmu_objset_open(zc->zc_value, type, + DS_MODE_USER | DS_MODE_READONLY, &clone); + if (error) { + nvlist_free(nvprops); + return (error); + } + + error = dmu_objset_create(zc->zc_name, type, clone, 0, + NULL, NULL); + if (error) { + dmu_objset_close(clone); + nvlist_free(nvprops); + return (error); + } + dmu_objset_close(clone); + } else { + boolean_t is_insensitive = B_FALSE; + + if (cbfunc == NULL) { + nvlist_free(nvprops); + return (EINVAL); + } + + if (type == DMU_OST_ZVOL) { + uint64_t volsize, volblocksize; + + if (nvprops == NULL || + nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &volsize) != 0) { + nvlist_free(nvprops); + return (EINVAL); + } + + if ((error = nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize)) != 0 && error != ENOENT) { + nvlist_free(nvprops); + return (EINVAL); + } + + if (error != 0) + volblocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + + if ((error = zvol_check_volblocksize( + volblocksize)) != 0 || + (error = zvol_check_volsize(volsize, + volblocksize)) != 0) { + nvlist_free(nvprops); + return (error); + } + } else if (type == DMU_OST_ZFS) { + int error; + + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(zc->zc_name, nvprops, + zct.zct_zplprops, &is_insensitive); + if (error != 0) { + nvlist_free(nvprops); + nvlist_free(zct.zct_zplprops); + return (error); + } + } + error = dmu_objset_create(zc->zc_name, type, NULL, + is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); + } + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) + (void) dmu_objset_destroy(zc->zc_name); + } + nvlist_free(nvprops); + return (error); +} + +struct snap_prop_arg { + nvlist_t *nvprops; + const char *snapname; +}; + +static int +set_snap_props(char *name, void *arg) +{ + struct snap_prop_arg *snpa = arg; + int len = strlen(name) + strlen(snpa->snapname) + 2; + char *buf = kmem_alloc(len, KM_SLEEP); + int err; + + (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); + err = zfs_set_prop_nvlist(buf, snpa->nvprops); + if (err) + (void) dmu_objset_destroy(buf); + kmem_free(buf, len); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_snapshot(zfs_cmd_t *zc) +{ + nvlist_t *nvprops = NULL; + int error; + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) + return (error); + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + struct snap_prop_arg snpa; + snpa.nvprops = nvprops; + snpa.snapname = zc->zc_value; + if (recursive) { + error = dmu_objset_find(zc->zc_name, + set_snap_props, &snpa, DS_FIND_CHILDREN); + if (error) { + (void) dmu_snapshots_destroy(zc->zc_name, + zc->zc_value); + } + } else { + error = set_snap_props(zc->zc_name, &snpa); + } + } + nvlist_free(nvprops); + return (error); +} + +int +zfs_unmount_snap(char *name, void *arg) +{ + vfs_t *vfsp = NULL; + + if (arg) { + char *snapname = arg; + int len = strlen(name) + strlen(snapname) + 2; + char *buf = kmem_alloc(len, KM_SLEEP); + + (void) strcpy(buf, name); + (void) strcat(buf, "@"); + (void) strcat(buf, snapname); + vfsp = zfs_get_vfs(buf); + kmem_free(buf, len); + } else if (strchr(name, '@')) { + vfsp = zfs_get_vfs(name); + } + + if (vfsp) { + /* + * Always force the unmount for snapshots. + */ + int flag = MS_FORCE; + int err; + + if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { + VFS_RELE(vfsp); + return (err); + } + VFS_RELE(vfsp); + if ((err = dounmount(vfsp, flag, kcred)) != 0) + return (err); + } + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * + * outputs: none + */ +static int +zfs_ioc_destroy_snaps(zfs_cmd_t *zc) +{ + int err; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + err = dmu_objset_find(zc->zc_name, + zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); + if (err) + return (err); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); +} + +/* + * inputs: + * zc_name name of dataset to destroy + * zc_objset_type type of objset + * + * outputs: none + */ +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + + return (dmu_objset_destroy(zc->zc_name)); +} + +/* + * inputs: + * zc_name name of dataset to rollback (to most recent snapshot) + * + * outputs: none + */ +static int +zfs_ioc_rollback(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + zfsvfs_t *zfsvfs = NULL; + + /* + * Get the zfsvfs for the receiving objset. There + * won't be one if we're operating on a zvol, if the + * objset doesn't exist yet, or is not mounted. + */ + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) + return (error); + + if (dmu_objset_type(os) == DMU_OST_ZFS) { + mutex_enter(&os->os->os_user_ptr_lock); + zfsvfs = dmu_objset_get_user(os); + if (zfsvfs != NULL) + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&os->os->os_user_ptr_lock); + } + + if (zfsvfs != NULL) { + char *osname; + int mode; + + osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + ASSERT(strcmp(osname, zc->zc_name) == 0); + error = dmu_objset_rollback(os); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_objset_close(os); + } + kmem_free(osname, MAXNAMELEN); + VFS_RELE(zfsvfs->z_vfs); + } else { + error = dmu_objset_rollback(os); + } + /* Note, the dmu_objset_rollback() releases the objset for us. */ + + return (error); +} + +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie & 1; + + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '%')) + return (EINVAL); + + /* + * Unmount snapshot unless we're doing a recursive rename, + * in which case the dataset code figures out which snapshots + * to unmount. + */ + if (!recursive && strchr(zc->zc_name, '@') != NULL && + zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); +} + +static void +clear_props(char *dataset, nvlist_t *props) +{ + zfs_cmd_t *zc; + nvpair_t *prop; + + if (props == NULL) + return; + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dataset); + for (prop = nvlist_next_nvpair(props, NULL); prop; + prop = nvlist_next_nvpair(props, prop)) { + (void) strcpy(zc->zc_value, nvpair_name(prop)); + if (zfs_secpolicy_inherit(zc, CRED()) == 0) + (void) zfs_ioc_inherit_prop(zc); + } + kmem_free(zc, sizeof (zfs_cmd_t)); +} + +/* + * inputs: + * zc_name name of containing filesystem + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_value name of snapshot to create + * zc_string name of clone origin (if DRR_FLAG_CLONE) + * zc_cookie file descriptor to recv from + * zc_begin_record the BEGIN record of the stream (not byteswapped) + * zc_guid force flag + * + * outputs: + * zc_cookie number of bytes read + */ +static int +zfs_ioc_recv(zfs_cmd_t *zc) +{ + file_t *fp; + objset_t *os; + dmu_recv_cookie_t drc; + zfsvfs_t *zfsvfs = NULL; + boolean_t force = (boolean_t)zc->zc_guid; + int error, fd; + offset_t off; + nvlist_t *props = NULL; + nvlist_t *origprops = NULL; + objset_t *origin = NULL; + char *tosnap; + char tofs[ZFS_MAXNAMELEN]; + + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) + return (EINVAL); + + (void) strcpy(tofs, zc->zc_value); + tosnap = strchr(tofs, '@'); + *tosnap = '\0'; + tosnap++; + + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props)) != 0) + return (error); + + fd = zc->zc_cookie; + fp = getf(fd); + if (fp == NULL) { + nvlist_free(props); + return (EBADF); + } + + if (dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + /* + * Try to get the zfsvfs for the receiving objset. + * There won't be one if we're operating on a zvol, + * if the objset doesn't exist yet, or is not mounted. + */ + mutex_enter(&os->os->os_user_ptr_lock); + if (zfsvfs = dmu_objset_get_user(os)) { + if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { + mutex_exit(&os->os->os_user_ptr_lock); + dmu_objset_close(os); + zfsvfs = NULL; + error = EBUSY; + goto out; + } + VFS_HOLD(zfsvfs->z_vfs); + } + mutex_exit(&os->os->os_user_ptr_lock); + + /* + * If new properties are supplied, they are to completely + * replace the existing ones, so stash away the existing ones. + */ + if (props) + (void) dsl_prop_get_all(os, &origprops, TRUE); + + dmu_objset_close(os); + } + + if (zc->zc_string[0]) { + error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &origin); + if (error) + goto out; + } + + error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, + force, origin, zfsvfs != NULL, &drc); + if (origin) + dmu_objset_close(origin); + if (error) + goto out; + + /* + * Reset properties. We do this before we receive the stream + * so that the properties are applied to the new data. + */ + if (props) { + clear_props(tofs, origprops); + /* + * XXX - Note, this is all-or-nothing; should be best-effort. + */ + (void) zfs_set_prop_nvlist(tofs, props); + } + + off = fp->f_offset; + error = dmu_recv_stream(&drc, fp->f_vnode, &off); + + if (error == 0 && zfsvfs) { + char *osname; + int mode; + + /* online recv */ + osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + error = dmu_recv_end(&drc); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_recv_abort_cleanup(&drc); + } + kmem_free(osname, MAXNAMELEN); + } else if (error == 0) { + error = dmu_recv_end(&drc); + } + + zc->zc_cookie = off - fp->f_offset; + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + + /* + * On error, restore the original props. + */ + if (error && props) { + clear_props(tofs, props); + (void) zfs_set_prop_nvlist(tofs, origprops); + } +out: + if (zfsvfs) { + mutex_exit(&zfsvfs->z_online_recv_lock); + VFS_RELE(zfsvfs->z_vfs); + } + nvlist_free(props); + nvlist_free(origprops); + releasef(fd); + return (error); +} + +/* + * inputs: + * zc_name name of snapshot to send + * zc_value short name of incremental fromsnap (may be empty) + * zc_cookie file descriptor to send stream to + * zc_obj fromorigin flag (mutually exclusive with zc_value) + * + * outputs: none + */ +static int +zfs_ioc_send(zfs_cmd_t *zc) +{ + objset_t *fromsnap = NULL; + objset_t *tosnap; + file_t *fp; + int error; + offset_t off; + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &tosnap); + if (error) + return (error); + + if (zc->zc_value[0] != '\0') { + char *buf; + char *cp; + + buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strncpy(buf, zc->zc_name, MAXPATHLEN); + cp = strchr(buf, '@'); + if (cp) + *(cp+1) = 0; + (void) strncat(buf, zc->zc_value, MAXPATHLEN); + error = dmu_objset_open(buf, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &fromsnap); + kmem_free(buf, MAXPATHLEN); + if (error) { + dmu_objset_close(tosnap); + return (error); + } + } + + fp = getf(zc->zc_cookie); + if (fp == NULL) { + dmu_objset_close(tosnap); + if (fromsnap) + dmu_objset_close(fromsnap); + return (EBADF); + } + + off = fp->f_offset; + error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off); + + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + releasef(zc->zc_cookie); + if (fromsnap) + dmu_objset_close(fromsnap); + dmu_objset_close(tosnap); + return (error); +} + +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_nvlist_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, + &count); + if (error == 0) + zc->zc_nvlist_dst_size = count; + else + zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + int error; + + /* + * On zpool clear we also fix up missing slogs + */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(zc->zc_name); + if (spa == NULL) { + mutex_exit(&spa_namespace_lock); + return (EIO); + } + if (spa->spa_log_state == SPA_LOG_MISSING) { + /* we need to let spa_open/spa_load clear the chains */ + spa->spa_log_state = SPA_LOG_CLEAR; + } + mutex_exit(&spa_namespace_lock); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + spa_vdev_state_enter(spa); + + if (zc->zc_guid == 0) { + vd = NULL; + } else { + vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENODEV); + spa_close(spa, FTAG); + return (ENODEV); + } + } + + vdev_clear(spa, vd); + + (void) spa_vdev_state_exit(spa, NULL, 0); + + /* + * Resume any suspended I/Os. + */ + zio_resume(spa); + + spa_close(spa, FTAG); + + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of origin snapshot + * + * outputs: none + */ +static int +zfs_ioc_promote(zfs_cmd_t *zc) +{ + char *cp; + + /* + * We don't need to unmount *all* the origin fs's snapshots, but + * it's easier. + */ + cp = strchr(zc->zc_value, '@'); + if (cp) + *cp = '\0'; + (void) dmu_objset_find(zc->zc_value, + zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); + return (dsl_dataset_promote(zc->zc_name)); +} + +/* + * We don't want to have a hard dependency + * against some special symbols in sharefs + * nfs, and smbsrv. Determine them if needed when + * the first file system is shared. + * Neither sharefs, nfs or smbsrv are unloadable modules. + */ +int (*znfsexport_fs)(void *arg); +int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); +int (*zsmbexport_fs)(void *arg, boolean_t add_share); + +int zfs_nfsshare_inited; +int zfs_smbshare_inited; + +ddi_modhandle_t nfs_mod; +ddi_modhandle_t sharefs_mod; +ddi_modhandle_t smbsrv_mod; +kmutex_t zfs_share_lock; + +static int +zfs_init_sharefs() +{ + int error; + + ASSERT(MUTEX_HELD(&zfs_share_lock)); + /* Both NFS and SMB shares also require sharetab support. */ + if (sharefs_mod == NULL && ((sharefs_mod = + ddi_modopen("fs/sharefs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + return (ENOSYS); + } + if (zshare_fs == NULL && ((zshare_fs = + (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) + ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { + return (ENOSYS); + } + return (0); +} + +static int +zfs_ioc_share(zfs_cmd_t *zc) +{ + int error; + int opcode; + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (zfs_nfsshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (znfsexport_fs == NULL && + ((znfsexport_fs = (int (*)(void *)) + ddi_modsym(nfs_mod, + "nfs_export", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_nfsshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (zfs_smbshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (smbsrv_mod == NULL && ((smbsrv_mod = + ddi_modopen("drv/smbsrv", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (zsmbexport_fs == NULL && ((zsmbexport_fs = + (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, + "smb_server_share", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_smbshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + default: + return (EINVAL); + } + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (error = + znfsexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata)) + return (error); + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (error = zsmbexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata, + zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? + B_TRUE : B_FALSE)) { + return (error); + } + break; + } + + opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || + zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? + SHAREFS_ADD : SHAREFS_REMOVE; + + /* + * Add or remove share from sharetab + */ + error = zshare_fs(opcode, + (void *)(uintptr_t)zc->zc_share.z_sharedata, + zc->zc_share.z_sharemax); + + return (error); + +} + +/* + * pool create, destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export + * do the logging of those commands. + */ +static zfs_ioc_vec_t zfs_ioc_vec[] = { + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, + DATASET_NAME, B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, +}; + +static int +zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +{ + zfs_cmd_t *zc; + uint_t vec; + int error, rc; + + if (getminor(dev) != 0) + return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); + + vec = cmd - ZFS_IOC; + ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); + + if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (EINVAL); + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); + + if (error == 0) + error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); + + /* + * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. + */ + if (error == 0) { + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + switch (zfs_ioc_vec[vec].zvec_namecheck) { + case POOL_NAME: + if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case DATASET_NAME: + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case NO_NAME: + break; + } + } + + if (error == 0) + error = zfs_ioc_vec[vec].zvec_func(zc); + + rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); + if (error == 0) { + error = rc; + if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + zfs_log_history(zc); + } + + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +static int +zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + zfs_dip = dip; + + ddi_report_dev(dip); + + return (DDI_SUCCESS); +} + +static int +zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (spa_busy() || zfs_busy() || zvol_busy()) + return (DDI_FAILURE); + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + zfs_dip = NULL; + + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = zfs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + return (DDI_SUCCESS); + } + + return (DDI_FAILURE); +} + +/* + * OK, so this is a little weird. + * + * /dev/zfs is the control node, i.e. minor 0. + * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. + * + * /dev/zfs has basically nothing to do except serve up ioctls, + * so most of the standard driver entry points are in zvol.c. + */ +static struct cb_ops zfs_cb_ops = { + zvol_open, /* open */ + zvol_close, /* close */ + zvol_strategy, /* strategy */ + nodev, /* print */ + zvol_dump, /* dump */ + zvol_read, /* read */ + zvol_write, /* write */ + zfsdev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ + CB_REV, /* version */ + nodev, /* async read */ + nodev, /* async write */ +}; + +static struct dev_ops zfs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + zfs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + zfs_attach, /* attach */ + zfs_detach, /* detach */ + nodev, /* reset */ + &zfs_cb_ops, /* driver operations */ + NULL, /* no bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv zfs_modldrv = { + &mod_driverops, + "ZFS storage pool", + &zfs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&zfs_modlfs, + (void *)&zfs_modldrv, + NULL +}; + + +uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; + +int +_init(void) +{ + int error; + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + + if ((error = mod_install(&modlinkage)) != 0) { + zvol_fini(); + zfs_fini(); + spa_fini(); + return (error); + } + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, NULL); + + error = ldi_ident_from_mod(&modlinkage, &zfs_li); + ASSERT(error == 0); + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +int +_fini(void) +{ + int error; + + if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + zvol_fini(); + zfs_fini(); + spa_fini(); + if (zfs_nfsshare_inited) + (void) ddi_modclose(nfs_mod); + if (zfs_smbshare_inited) + (void) ddi_modclose(smbsrv_mod); + if (zfs_nfsshare_inited || zfs_smbshare_inited) + (void) ddi_modclose(sharefs_mod); + + tsd_destroy(&zfs_fsyncer_key); + ldi_ident_release(zfs_li); + zfs_li = NULL; + mutex_destroy(&zfs_share_lock); + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c new file mode 100644 index 000000000..11cd4c264 --- /dev/null +++ b/module/zfs/zfs_log.c @@ -0,0 +1,694 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/byteorder.h> +#include <sys/policy.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/zfs_fuid.h> +#include <sys/ddi.h> + +/* + * All the functions in this file are used to construct the log entries + * to record transactions. They allocate * an intent log transaction + * structure (itx_t) and save within it all the information necessary to + * possibly replay the transaction. The itx is then assigned a sequence + * number and inserted in the in-memory list anchored in the zilog. + */ + +int +zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) +{ + int isxvattr = (vap->va_mask & AT_XVATTR); + switch (type) { + case Z_FILE: + if (vsecp == NULL && !isxvattr) + return (TX_CREATE); + if (vsecp && isxvattr) + return (TX_CREATE_ACL_ATTR); + if (vsecp) + return (TX_CREATE_ACL); + else + return (TX_CREATE_ATTR); + /*NOTREACHED*/ + case Z_DIR: + if (vsecp == NULL && !isxvattr) + return (TX_MKDIR); + if (vsecp && isxvattr) + return (TX_MKDIR_ACL_ATTR); + if (vsecp) + return (TX_MKDIR_ACL); + else + return (TX_MKDIR_ATTR); + case Z_XATTRDIR: + return (TX_MKXATTR); + } + ASSERT(0); + return (TX_MAX_TYPE); +} + +/* + * build up the log data necessary for logging xvattr_t + * First lr_attr_t is initialized. following the lr_attr_t + * is the mapsize and attribute bitmap copied from the xvattr_t. + * Following the bitmap and bitmapsize two 64 bit words are reserved + * for the create time which may be set. Following the create time + * records a single 64 bit integer which has the bits to set on + * replay for the xvattr. + */ +static void +zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + uint32_t *bitmap; + uint64_t *attrs; + uint64_t *crtime; + xoptattr_t *xoap; + void *scanstamp; + int i; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + lrattr->lr_attr_masksize = xvap->xva_mapsize; + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + *bitmap = xvap->xva_reqattrmap[i]; + } + + /* Now pack the attributes up in a single uint64_t */ + attrs = (uint64_t *)bitmap; + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + *attrs = 0; + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + *attrs |= (xoap->xoa_readonly == 0) ? 0 : + XAT0_READONLY; + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + *attrs |= (xoap->xoa_hidden == 0) ? 0 : + XAT0_HIDDEN; + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + *attrs |= (xoap->xoa_system == 0) ? 0 : + XAT0_SYSTEM; + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + *attrs |= (xoap->xoa_archive == 0) ? 0 : + XAT0_ARCHIVE; + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + *attrs |= (xoap->xoa_immutable == 0) ? 0 : + XAT0_IMMUTABLE; + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + XAT0_NOUNLINK; + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + *attrs |= (xoap->xoa_opaque == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + *attrs |= (xoap->xoa_nodump == 0) ? 0 : + XAT0_NODUMP; + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + XAT0_AV_QUARANTINED; + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + *attrs |= (xoap->xoa_av_modified == 0) ? 0 : + XAT0_AV_MODIFIED; + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); +} + +static void * +zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_t *zfuid; + uint64_t *fuidloc = start; + + /* First copy in the ACE FUIDs */ + for (zfuid = list_head(&fuidp->z_fuids); zfuid; + zfuid = list_next(&fuidp->z_fuids, zfuid)) { + *fuidloc++ = zfuid->z_logfuid; + } + return (fuidloc); +} + + +static void * +zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_domain_t *zdomain; + + /* now copy in the domain info, if any */ + if (fuidp->z_domain_str_sz != 0) { + for (zdomain = list_head(&fuidp->z_domains); zdomain; + zdomain = list_next(&fuidp->z_domains, zdomain)) { + bcopy((void *)zdomain->z_domain, start, + strlen(zdomain->z_domain) + 1); + start = (caddr_t)start + + strlen(zdomain->z_domain) + 1; + } + } + return (start); +} + +/* + * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, + * TX_MKDIR_ATTR and TX_MKXATTR + * transactions. + * + * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID + * domain information appended prior to the name. In this case the + * uid/gid in the log record will be a log centric FUID. + * + * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that + * may contain attributes, ACL and optional fuid information. + * + * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify + * and ACL and normal users/groups in the ACEs. + * + * There may be an optional xvattr attribute information similar + * to zfs_log_setattr. + * + * Also, after the file name "domain" strings may be appended. + */ +void +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + lr_acl_create_t *lracl; + size_t aclsize; + size_t xvatsize = 0; + size_t txsize; + xvattr_t *xvap = (xvattr_t *)vap; + void *end; + size_t lrsize; + size_t namesize = strlen(name) + 1; + size_t fuidsz = 0; + + if (zilog == NULL) + return; + + /* + * If we have FUIDs present then add in space for + * domains and ACE fuid's if any. + */ + if (fuidp) { + fuidsz += fuidp->z_domain_str_sz; + fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); + } + + if (vap->va_mask & AT_XVATTR) + xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || + (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || + (int)txtype == TX_MKXATTR) { + txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; + lrsize = sizeof (*lr); + } else { + aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0; + txsize = + sizeof (lr_acl_create_t) + namesize + fuidsz + + ZIL_ACE_LENGTH(aclsize) + xvatsize; + lrsize = sizeof (lr_acl_create_t); + } + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { + lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + } else { + lr->lr_uid = fuidp->z_fuid_owner; + } + if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { + lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + } else { + lr->lr_gid = fuidp->z_fuid_group; + } + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_rdev = zp->z_phys->zp_rdev; + + /* + * Fill in xvattr info if any + */ + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); + end = (caddr_t)lr + lrsize + xvatsize; + } else { + end = (caddr_t)lr + lrsize; + } + + /* Now fill in any ACL info */ + + if (vsecp) { + lracl = (lr_acl_create_t *)&itx->itx_lr; + lracl->lr_aclcnt = vsecp->vsa_aclcnt; + lracl->lr_acl_bytes = aclsize; + lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) + lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lracl->lr_acl_flags = 0; + + bcopy(vsecp->vsa_aclentp, end, aclsize); + end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); + } + + /* drop in FUID info */ + if (fuidp) { + end = zfs_log_fuid_ids(fuidp, end); + end = zfs_log_fuid_domains(fuidp, end); + } + /* + * Now place file name in log record + */ + bcopy(name, end, namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. + */ +void +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_remove_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_remove_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; +} + +/* + * zfs_log_link() handles TX_LINK transactions. + */ +void +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_link_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_link_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_link_obj = zp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_symlink() handles TX_SYMLINK transactions. + */ +void +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + size_t linksize = strlen(link) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + lr->lr_uid = zp->z_phys->zp_uid; + lr->lr_gid = zp->z_phys->zp_gid; + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + bcopy(name, (char *)(lr + 1), namesize); + bcopy(link, (char *)(lr + 1) + namesize, linksize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_rename() handles TX_RENAME transactions. + */ +void +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + itx_t *itx; + uint64_t seq; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + bcopy(sname, (char *)(lr + 1), snamesize); + bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + + seq = zil_itx_assign(zilog, itx, tx); + sdzp->z_last_itx = seq; + tdzp->z_last_itx = seq; + szp->z_last_itx = seq; +} + +/* + * zfs_log_write() handles TX_WRITE transactions. + */ +ssize_t zfs_immediate_write_sz = 32768; + +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ + sizeof (lr_write_t)) + +void +zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t resid, int ioflag) +{ + itx_wr_state_t write_state; + boolean_t slogging; + uintptr_t fsync_cnt; + + if (zilog == NULL || zp->z_unlinked) + return; + + /* + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. + * There are a few requirements for this to occur: + * - write is greater than zfs_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FSYNC or FDSYNC), the we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. + */ + slogging = spa_has_slogs(zilog->zl_spa); + if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) + write_state = WR_INDIRECT; + else if (ioflag & (FSYNC | FDSYNC)) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { + (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); + } + + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + + /* + * If the write would overflow the largest block then split it. + */ + if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) + len = SPA_MAXBLOCKSIZE >> 1; + else + len = resid; + + itx = zil_itx_create(txtype, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + zp->z_id, off, len, lr + 1) != 0) { + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zp->z_zfsvfs; + + if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || + (ioflag & (FSYNC | FDSYNC))) + itx->itx_sync = B_TRUE; + else + itx->itx_sync = B_FALSE; + + zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } +} + +/* + * zfs_log_truncate() handles TX_TRUNCATE transactions. + */ +void +zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len) +{ + itx_t *itx; + uint64_t seq; + lr_truncate_t *lr; + + if (zilog == NULL || zp->z_unlinked) + return; + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_setattr() handles TX_SETATTR transactions. + */ +void +zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + uint64_t seq; + lr_setattr_t *lr; + xvattr_t *xvap = (xvattr_t *)vap; + size_t recsize = sizeof (lr_setattr_t); + void *start; + + + if (zilog == NULL || zp->z_unlinked) + return; + + /* + * If XVATTR set, then log record size needs to allow + * for lr_attr_t + xvattr mask, mapsize and create time + * plus actual attribute values + */ + if (vap->va_mask & AT_XVATTR) + recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if (fuidp) + recsize += fuidp->z_domain_str_sz; + + itx = zil_itx_create(txtype, recsize); + lr = (lr_setattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_mask = (uint64_t)mask_applied; + lr->lr_mode = (uint64_t)vap->va_mode; + if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) + lr->lr_uid = fuidp->z_fuid_owner; + else + lr->lr_uid = (uint64_t)vap->va_uid; + + if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) + lr->lr_gid = fuidp->z_fuid_group; + else + lr->lr_gid = (uint64_t)vap->va_gid; + + lr->lr_size = (uint64_t)vap->va_size; + ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)start, xvap); + start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); + } + + /* + * Now stick on domain information if any on end + */ + + if (fuidp) + (void) zfs_log_fuid_domains(fuidp, start); + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_acl() handles TX_ACL transactions. + */ +void +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + uint64_t seq; + lr_acl_v0_t *lrv0; + lr_acl_t *lr; + int txtype; + int lrsize; + size_t txsize; + size_t aclbytes = vsecp->vsa_aclentsz; + + if (zilog == NULL || zp->z_unlinked) + return; + + txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? + TX_ACL_V0 : TX_ACL; + + if (txtype == TX_ACL) + lrsize = sizeof (*lr); + else + lrsize = sizeof (*lrv0); + + txsize = lrsize + + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + + (fuidp ? fuidp->z_domain_str_sz : 0) + + sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_acl_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + if (txtype == TX_ACL) { + lr->lr_acl_bytes = aclbytes; + lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) + lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lr->lr_acl_flags = 0; + } + lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; + + if (txtype == TX_ACL_V0) { + lrv0 = (lr_acl_v0_t *)lr; + bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + } else { + void *start = (ace_t *)(lr + 1); + + bcopy(vsecp->vsa_aclentp, start, aclbytes); + + start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); + + if (fuidp) { + start = zfs_log_fuid_ids(fuidp, start); + (void) zfs_log_fuid_domains(fuidp, start); + } + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c new file mode 100644 index 000000000..85b79703a --- /dev/null +++ b/module/zfs/zfs_replay.c @@ -0,0 +1,878 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_fuid.h> +#include <sys/spa.h> +#include <sys/zil.h> +#include <sys/byteorder.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/acl.h> +#include <sys/atomic.h> +#include <sys/cred.h> + +/* + * Functions to replay ZFS intent log (ZIL) records + * The functions are called through a function vector (zfs_replay_vector) + * which is indexed by the transaction type. + */ + +static void +zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) +{ + bzero(vap, sizeof (*vap)); + vap->va_mask = (uint_t)mask; + vap->va_type = IFTOVT(mode); + vap->va_mode = mode & MODEMASK; + vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; + vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid; + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_nodeid = nodeid; +} + +/* ARGSUSED */ +static int +zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +static void +zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + uint64_t *attrs; + uint64_t *crtime; + uint32_t *bitmap; + void *scanstamp; + int i; + + xvap->xva_vattr.va_mask |= AT_XVATTR; + if ((xoap = xva_getxoptattr(xvap)) == NULL) { + xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ + return; + } + + ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); + + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) + xvap->xva_reqattrmap[i] = *bitmap; + + attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + xoap->xoa_av_quarantined = + ((*attrs & XAT0_AV_QUARANTINED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); +} + +static int +zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) +{ + uint64_t uid_idx; + uint64_t gid_idx; + int domcnt = 0; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + if (uid_idx) + domcnt++; + if (gid_idx > 0 && gid_idx != uid_idx) + domcnt++; + + return (domcnt); +} + +static void * +zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, + int domcnt) +{ + int i; + + for (i = 0; i != domcnt; i++) { + fuid_infop->z_domain_table[i] = start; + start = (caddr_t)start + strlen(start) + 1; + } + + return (start); +} + +/* + * Set the uid/gid in the fuid_info structure. + */ +static void +zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) +{ + /* + * If owner or group are log specific FUIDs then slurp up + * domain information and build zfs_fuid_info_t + */ + if (IS_EPHEMERAL(uid)) + fuid_infop->z_fuid_owner = uid; + + if (IS_EPHEMERAL(gid)) + fuid_infop->z_fuid_group = gid; +} + +/* + * Load fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) +{ + int domcnt; + + zfs_fuid_info_t *fuid_infop; + + fuid_infop = zfs_fuid_info_alloc(); + + domcnt = zfs_replay_domain_cnt(uid, gid); + + if (domcnt == 0) + return (fuid_infop); + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + fuid_infop->z_domain_cnt = domcnt; + *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); + return (fuid_infop); +} + +/* + * load zfs_fuid_t's and fuid_domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, + uint64_t gid) +{ + uint64_t *log_fuid = (uint64_t *)start; + zfs_fuid_info_t *fuid_infop; + int i; + + fuid_infop = zfs_fuid_info_alloc(); + fuid_infop->z_domain_cnt = domcnt; + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + for (i = 0; i != idcnt; i++) { + zfs_fuid_t *zfuid; + + zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + zfuid->z_logfuid = *log_fuid; + zfuid->z_id = -1; + zfuid->z_domidx = 0; + list_insert_tail(&fuid_infop->z_fuids, zfuid); + log_fuid++; + } + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); + return (fuid_infop); +} + +static void +zfs_replay_swap_attrs(lr_attr_t *lrattr) +{ + /* swap the lr_attr structure */ + byteswap_uint32_array(lrattr, sizeof (*lrattr)); + /* swap the bitmap */ + byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * + sizeof (uint32_t)); + /* swap the attributes, create time + 64 bit word for attributes */ + byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * + (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); +} + +/* + * Replay file create with optional ACL, xvattr information as well + * as option FUID information. + */ +static int +zfs_replay_create_acl(zfsvfs_t *zfsvfs, + lr_acl_create_t *lracl, boolean_t byteswap) +{ + char *name = NULL; /* location determined later */ + lr_create_t *lr = (lr_create_t *)lracl; + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + vsecattr_t vsec = { 0 }; + lr_attr_t *lrattr; + void *aclstart; + void *fuidstart; + size_t xvatlen = 0; + uint64_t txtype; + int error; + + if (byteswap) { + byteswap_uint64_array(lracl, sizeof (*lracl)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ACL_ATTR || + txtype == TX_MKDIR_ACL_ATTR) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + zfs_replay_swap_attrs(lrattr); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + aclstart = (caddr_t)(lracl + 1) + xvatlen; + zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); + /* swap fuids */ + if (lracl->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), + lracl->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto bail; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_CREATE_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + xva.xva_vattr.va_mask |= AT_XVATTR; + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, &vsec); + break; + case TX_MKDIR_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_MKDIR_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, &vsec); + break; + default: + error = ENOTSUP; + } + +bail: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + + return (error); +} + +static int +zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) +{ + char *name = NULL; /* location determined later */ + char *link; /* symlink content follows name */ + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + size_t lrsize = sizeof (lr_create_t); + lr_attr_t *lrattr; + void *start; + size_t xvatlen; + uint64_t txtype; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto out; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + /* + * Symlinks don't have fuid info, and CIFS never creates + * symlinks. + * + * The _ATTR versions will grab the fuid info in their subcases. + */ + if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && + (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && + (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + start = (lr + 1); + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + } + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ + case TX_CREATE: + if (name == NULL) + name = (char *)start; + + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, NULL); + break; + case TX_MKDIR_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ + case TX_MKDIR: + if (name == NULL) + name = (char *)(lr + 1); + + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, NULL); + break; + case TX_MKXATTR: + name = (char *)(lr + 1); + error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); + break; + case TX_SYMLINK: + name = (char *)(lr + 1); + link = name + strlen(name) + 1; + error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr, + link, kcred, NULL, vflg); + break; + default: + error = ENOTSUP; + } + +out: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + return (error); +} + +static int +zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_remove_t */ + znode_t *dzp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_REMOVE: + error = VOP_REMOVE(ZTOV(dzp), name, kcred, NULL, vflg); + break; + case TX_RMDIR: + error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred, NULL, vflg); + break; + default: + error = ENOTSUP; + } + + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_link_t */ + znode_t *dzp, *zp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { + VN_RELE(ZTOV(dzp)); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred, NULL, vflg); + + VN_RELE(ZTOV(zp)); + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) +{ + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + znode_t *sdzp, *tdzp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { + VN_RELE(ZTOV(sdzp)); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred, + NULL, vflg); + + VN_RELE(ZTOV(tdzp)); + VN_RELE(ZTOV(sdzp)); + + return (error); +} + +static int +zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + znode_t *zp; + int error; + ssize_t resid; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, + lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) +{ + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log truncates out of order, it's possible the + * file has been removed. In this case just drop the truncate + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + bzero(&fl, sizeof (fl)); + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = lr->lr_offset; + fl.l_len = lr->lr_length; + + error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, + lr->lr_offset, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) +{ + znode_t *zp; + xvattr_t xva; + vattr_t *vap = &xva.xva_vattr; + int error; + void *start; + + xva_init(&xva); + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((lr->lr_mask & AT_XVATTR) && + zfsvfs->z_version >= ZPL_VERSION_INITIAL) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log setattrs out of order, it's possible the + * file has been removed. In this case just drop the setattr + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, + lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); + + vap->va_size = lr->lr_size; + ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); + + /* + * Fill in xvattr_t portions if necessary. + */ + + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_replay_xvattr((lr_attr_t *)start, &xva); + start = (caddr_t)start + + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); + } else + xva.xva_vattr.va_mask &= ~AT_XVATTR; + + zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + + error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) +{ + ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ + vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_oldace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log acls out of order, it's possible the + * file has been removed. In this case just drop the acl + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; + vsa.vsa_aclflags = 0; + vsa.vsa_aclentp = ace; + + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * Replaying ACLs is complicated by FUID support. + * The log record may contain some optional data + * to be used for replaying FUID's. These pieces + * are the actual FUIDs that were created initially. + * The FUID table index may no longer be valid and + * during zfs_create() a new index may be assigned. + * Because of this the log will contain the original + * doman+rid in order to create a new FUID. + * + * The individual ACEs may contain an ephemeral uid/gid which is no + * longer valid and will need to be replaced with an actual FUID. + * + */ +static int +zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +{ + ace_t *ace = (ace_t *)(lr + 1); + vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); + if (lr->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes), + lr->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log acls out of order, it's possible the + * file has been removed. In this case just drop the acl + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentp = ace; + vsa.vsa_aclentsz = lr->lr_acl_bytes; + vsa.vsa_aclflags = lr->lr_acl_flags; + + if (lr->lr_fuidcnt) { + void *fuidstart = (caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes); + + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, &fuidstart, + lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); + } + + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + + zfsvfs->z_fuid_replay = NULL; + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * Callback vectors for replaying records + */ +zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { + zfs_replay_error, /* 0 no such transaction type */ + zfs_replay_create, /* TX_CREATE */ + zfs_replay_create, /* TX_MKDIR */ + zfs_replay_create, /* TX_MKXATTR */ + zfs_replay_create, /* TX_SYMLINK */ + zfs_replay_remove, /* TX_REMOVE */ + zfs_replay_remove, /* TX_RMDIR */ + zfs_replay_link, /* TX_LINK */ + zfs_replay_rename, /* TX_RENAME */ + zfs_replay_write, /* TX_WRITE */ + zfs_replay_truncate, /* TX_TRUNCATE */ + zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl_v0, /* TX_ACL_V0 */ + zfs_replay_acl, /* TX_ACL */ + zfs_replay_create_acl, /* TX_CREATE_ACL */ + zfs_replay_create, /* TX_CREATE_ATTR */ + zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL */ + zfs_replay_create, /* TX_MKDIR_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ +}; diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c new file mode 100644 index 000000000..f0a75b5fa --- /dev/null +++ b/module/zfs/zfs_rlock.c @@ -0,0 +1,602 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains the code to implement file range locking in + * ZFS, although there isn't much specific to ZFS (all that comes to mind + * support for growing the blocksize). + * + * Interface + * --------- + * Defined in zfs_rlock.h but essentially: + * rl = zfs_range_lock(zp, off, len, lock_type); + * zfs_range_unlock(rl); + * zfs_range_reduce(rl, off, len); + * + * AVL tree + * -------- + * An AVL tree is used to maintain the state of the existing ranges + * that are locked for exclusive (writer) or shared (reader) use. + * The starting range offset is used for searching and sorting the tree. + * + * Common case + * ----------- + * The (hopefully) usual case is of no overlaps or contention for + * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree + * searched that finds no overlap, and *this* rl_t is placed in the tree. + * + * Overlaps/Reference counting/Proxy locks + * --------------------------------------- + * The avl code only allows one node at a particular offset. Also it's very + * inefficient to search through all previous entries looking for overlaps + * (because the very 1st in the ordered list might be at offset 0 but + * cover the whole file). + * So this implementation uses reference counts and proxy range locks. + * Firstly, only reader locks use reference counts and proxy locks, + * because writer locks are exclusive. + * When a reader lock overlaps with another then a proxy lock is created + * for that range and replaces the original lock. If the overlap + * is exact then the reference count of the proxy is simply incremented. + * Otherwise, the proxy lock is split into smaller lock ranges and + * new proxy locks created for non overlapping ranges. + * The reference counts are adjusted accordingly. + * Meanwhile, the orginal lock is kept around (this is the callers handle) + * and its offset and length are used when releasing the lock. + * + * Thread coordination + * ------------------- + * In order to make wakeups efficient and to ensure multiple continuous + * readers on a range don't starve a writer for the same range lock, + * two condition variables are allocated in each rl_t. + * If a writer (or reader) can't get a range it initialises the writer + * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; + * and waits on that cv. When a thread unlocks that range it wakes up all + * writers then all readers before destroying the lock. + * + * Append mode writes + * ------------------ + * Append mode writes need to lock a range at the end of a file. + * The offset of the end of the file is determined under the + * range locking mutex, and the lock type converted from RL_APPEND to + * RL_WRITER and the range locked. + * + * Grow block handling + * ------------------- + * ZFS supports multiple block sizes currently upto 128K. The smallest + * block size is used for the file which is grown as needed. During this + * growth all other writers and readers must be excluded. + * So if the block size needs to be grown then the whole file is + * exclusively locked, then later the caller will reduce the lock + * range to just the range to be written using zfs_reduce_range. + */ + +#include <sys/zfs_rlock.h> + +/* + * Check if a write lock can be grabbed, or wait and recheck until available. + */ +static void +zfs_range_lock_writer(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *rl; + avl_index_t where; + uint64_t end_size; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + for (;;) { + /* + * Range locking is also used by zvol and uses a + * dummied up znode. However, for zvol, we don't need to + * append or grow blocksize, and besides we don't have + * a z_phys or z_zfsvfs - so skip that processing. + * + * Yes, this is ugly, and would be solved by not handling + * grow or append in range lock code. If that was done then + * we could make the range locking code generically available + * to other non-zfs consumers. + */ + if (zp->z_vnode) { /* caller is ZPL */ + /* + * If in append mode pick up the current end of file. + * This is done under z_range_lock to avoid races. + */ + if (new->r_type == RL_APPEND) + new->r_off = zp->z_phys->zp_size; + + /* + * If we need to grow the block size then grab the whole + * file range. This is also done under z_range_lock to + * avoid races. + */ + end_size = MAX(zp->z_phys->zp_size, new->r_off + len); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->r_off = 0; + new->r_len = UINT64_MAX; + } + } + + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(tree) == 0) { + new->r_type = RL_WRITER; /* convert to writer */ + avl_add(tree, new); + return; + } + + /* + * Look for any locks in the range. + */ + rl = avl_find(tree, new, &where); + if (rl) + goto wait; /* already locked at same offset */ + + rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + if (rl && (rl->r_off < new->r_off + new->r_len)) + goto wait; + + rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + if (rl && rl->r_off + rl->r_len > new->r_off) + goto wait; + + new->r_type = RL_WRITER; /* convert possible RL_APPEND */ + avl_insert(tree, new, where); + return; +wait: + if (!rl->r_write_wanted) { + cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); + rl->r_write_wanted = B_TRUE; + } + cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + + /* reset to original */ + new->r_off = off; + new->r_len = len; + } +} + +/* + * If this is an original (non-proxy) lock then replace it by + * a proxy and return the proxy. + */ +static rl_t * +zfs_range_proxify(avl_tree_t *tree, rl_t *rl) +{ + rl_t *proxy; + + if (rl->r_proxy) + return (rl); /* already a proxy */ + + ASSERT3U(rl->r_cnt, ==, 1); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + avl_remove(tree, rl); + rl->r_cnt = 0; + + /* create a proxy range lock */ + proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); + proxy->r_off = rl->r_off; + proxy->r_len = rl->r_len; + proxy->r_cnt = 1; + proxy->r_type = RL_READER; + proxy->r_proxy = B_TRUE; + proxy->r_write_wanted = B_FALSE; + proxy->r_read_wanted = B_FALSE; + avl_add(tree, proxy); + + return (proxy); +} + +/* + * Split the range lock at the supplied offset + * returning the *front* proxy. + */ +static rl_t * +zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) +{ + rl_t *front, *rear; + + ASSERT3U(rl->r_len, >, 1); + ASSERT3U(off, >, rl->r_off); + ASSERT3U(off, <, rl->r_off + rl->r_len); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + + /* create the rear proxy range lock */ + rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rear->r_off = off; + rear->r_len = rl->r_off + rl->r_len - off; + rear->r_cnt = rl->r_cnt; + rear->r_type = RL_READER; + rear->r_proxy = B_TRUE; + rear->r_write_wanted = B_FALSE; + rear->r_read_wanted = B_FALSE; + + front = zfs_range_proxify(tree, rl); + front->r_len = off - rl->r_off; + + avl_insert_here(tree, rear, front, AVL_AFTER); + return (front); +} + +/* + * Create and add a new proxy range lock for the supplied range. + */ +static void +zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +{ + rl_t *rl; + + ASSERT(len); + rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rl->r_off = off; + rl->r_len = len; + rl->r_cnt = 1; + rl->r_type = RL_READER; + rl->r_proxy = B_TRUE; + rl->r_write_wanted = B_FALSE; + rl->r_read_wanted = B_FALSE; + avl_add(tree, rl); +} + +static void +zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) +{ + rl_t *next; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * prev arrives either: + * - pointing to an entry at the same offset + * - pointing to the entry with the closest previous offset whose + * range may overlap with the new range + * - null, if there were no ranges starting before the new one + */ + if (prev) { + if (prev->r_off + prev->r_len <= off) { + prev = NULL; + } else if (prev->r_off != off) { + /* + * convert to proxy if needed then + * split this entry and bump ref count + */ + prev = zfs_range_split(tree, prev, off); + prev = AVL_NEXT(tree, prev); /* move to rear range */ + } + } + ASSERT((prev == NULL) || (prev->r_off == off)); + + if (prev) + next = prev; + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + + if (next == NULL || off + len <= next->r_off) { + /* no overlaps, use the original new rl_t in the tree */ + avl_insert(tree, new, where); + return; + } + + if (off < next->r_off) { + /* Add a proxy for initial range before the overlap */ + zfs_range_new_proxy(tree, off, next->r_off - off); + } + + new->r_cnt = 0; /* will use proxies in tree */ + /* + * We now search forward through the ranges, until we go past the end + * of the new range. For each entry we make it a proxy if it + * isn't already, then bump its reference count. If there's any + * gaps between the ranges then we create a new proxy range. + */ + for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + break; + if (prev && prev->r_off + prev->r_len < next->r_off) { + /* there's a gap */ + ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + next->r_off - (prev->r_off + prev->r_len)); + } + if (off + len == next->r_off + next->r_len) { + /* exact overlap with end */ + next = zfs_range_proxify(tree, next); + next->r_cnt++; + return; + } + if (off + len < next->r_off + next->r_len) { + /* new range ends in the middle of this block */ + next = zfs_range_split(tree, next, off + len); + next->r_cnt++; + return; + } + ASSERT3U(off + len, >, next->r_off + next->r_len); + next = zfs_range_proxify(tree, next); + next->r_cnt++; + } + + /* Add the remaining end range. */ + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + (off + len) - (prev->r_off + prev->r_len)); +} + +/* + * Check if a reader lock can be grabbed, or wait and recheck until available. + */ +static void +zfs_range_lock_reader(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *prev, *next; + avl_index_t where; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * Look for any writer locks in the range. + */ +retry: + prev = avl_find(tree, new, &where); + if (prev == NULL) + prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + + /* + * Check the previous range for a writer lock overlap. + */ + if (prev && (off < prev->r_off + prev->r_len)) { + if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { + if (!prev->r_read_wanted) { + cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); + prev->r_read_wanted = B_TRUE; + } + cv_wait(&prev->r_rd_cv, &zp->z_range_lock); + goto retry; + } + if (off + len < prev->r_off + prev->r_len) + goto got_lock; + } + + /* + * Search through the following ranges to see if there's + * write lock any overlap. + */ + if (prev) + next = AVL_NEXT(tree, prev); + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + for (; next; next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + goto got_lock; + if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { + if (!next->r_read_wanted) { + cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); + next->r_read_wanted = B_TRUE; + } + cv_wait(&next->r_rd_cv, &zp->z_range_lock); + goto retry; + } + if (off + len <= next->r_off + next->r_len) + goto got_lock; + } + +got_lock: + /* + * Add the read lock, which may involve splitting existing + * locks and bumping ref counts (r_cnt). + */ + zfs_range_add_reader(tree, new, prev, where); +} + +/* + * Lock a range (offset, length) as either shared (RL_READER) + * or exclusive (RL_WRITER). Returns the range lock structure + * for later unlocking or reduce range (if entire file + * previously locked as RL_WRITER). + */ +rl_t * +zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +{ + rl_t *new; + + ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); + + new = kmem_alloc(sizeof (rl_t), KM_SLEEP); + new->r_zp = zp; + new->r_off = off; + new->r_len = len; + new->r_cnt = 1; /* assume it's going to be in the tree */ + new->r_type = type; + new->r_proxy = B_FALSE; + new->r_write_wanted = B_FALSE; + new->r_read_wanted = B_FALSE; + + mutex_enter(&zp->z_range_lock); + if (type == RL_READER) { + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(&zp->z_range_avl) == 0) + avl_add(&zp->z_range_avl, new); + else + zfs_range_lock_reader(zp, new); + } else + zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ + mutex_exit(&zp->z_range_lock); + return (new); +} + +/* + * Unlock a reader lock + */ +static void +zfs_range_unlock_reader(znode_t *zp, rl_t *remove) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *rl, *next; + uint64_t len; + + /* + * The common case is when the remove entry is in the tree + * (cnt == 1) meaning there's been no other reader locks overlapping + * with this one. Otherwise the remove entry will have been + * removed from the tree and replaced by proxies (one or + * more ranges mapping to the entire range). + */ + if (remove->r_cnt == 1) { + avl_remove(tree, remove); + if (remove->r_write_wanted) { + cv_broadcast(&remove->r_wr_cv); + cv_destroy(&remove->r_wr_cv); + } + if (remove->r_read_wanted) { + cv_broadcast(&remove->r_rd_cv); + cv_destroy(&remove->r_rd_cv); + } + } else { + ASSERT3U(remove->r_cnt, ==, 0); + ASSERT3U(remove->r_write_wanted, ==, 0); + ASSERT3U(remove->r_read_wanted, ==, 0); + /* + * Find start proxy representing this reader lock, + * then decrement ref count on all proxies + * that make up this range, freeing them as needed. + */ + rl = avl_find(tree, remove, NULL); + ASSERT(rl); + ASSERT(rl->r_cnt); + ASSERT(rl->r_type == RL_READER); + for (len = remove->r_len; len != 0; rl = next) { + len -= rl->r_len; + if (len) { + next = AVL_NEXT(tree, rl); + ASSERT(next); + ASSERT(rl->r_off + rl->r_len == next->r_off); + ASSERT(next->r_cnt); + ASSERT(next->r_type == RL_READER); + } + rl->r_cnt--; + if (rl->r_cnt == 0) { + avl_remove(tree, rl); + if (rl->r_write_wanted) { + cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { + cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } + kmem_free(rl, sizeof (rl_t)); + } + } + } + kmem_free(remove, sizeof (rl_t)); +} + +/* + * Unlock range and destroy range lock structure. + */ +void +zfs_range_unlock(rl_t *rl) +{ + znode_t *zp = rl->r_zp; + + ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); + ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); + ASSERT(!rl->r_proxy); + + mutex_enter(&zp->z_range_lock); + if (rl->r_type == RL_WRITER) { + /* writer locks can't be shared or split */ + avl_remove(&zp->z_range_avl, rl); + mutex_exit(&zp->z_range_lock); + if (rl->r_write_wanted) { + cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { + cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } + kmem_free(rl, sizeof (rl_t)); + } else { + /* + * lock may be shared, let zfs_range_unlock_reader() + * release the lock and free the rl_t + */ + zfs_range_unlock_reader(zp, rl); + mutex_exit(&zp->z_range_lock); + } +} + +/* + * Reduce range locked as RL_WRITER from whole file to specified range. + * Asserts the whole file is exclusivly locked and so there's only one + * entry in the tree. + */ +void +zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) +{ + znode_t *zp = rl->r_zp; + + /* Ensure there are no other locks */ + ASSERT(avl_numnodes(&zp->z_range_avl) == 1); + ASSERT(rl->r_off == 0); + ASSERT(rl->r_type == RL_WRITER); + ASSERT(!rl->r_proxy); + ASSERT3U(rl->r_len, ==, UINT64_MAX); + ASSERT3U(rl->r_cnt, ==, 1); + + mutex_enter(&zp->z_range_lock); + rl->r_off = off; + rl->r_len = len; + mutex_exit(&zp->z_range_lock); + if (rl->r_write_wanted) + cv_broadcast(&rl->r_wr_cv); + if (rl->r_read_wanted) + cv_broadcast(&rl->r_rd_cv); +} + +/* + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. + */ +int +zfs_range_compare(const void *arg1, const void *arg2) +{ + const rl_t *rl1 = arg1; + const rl_t *rl2 = arg2; + + if (rl1->r_off > rl2->r_off) + return (1); + if (rl1->r_off < rl2->r_off) + return (-1); + return (0); +} diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c new file mode 100644 index 000000000..06b4dee46 --- /dev/null +++ b/module/zfs/zfs_vfsops.c @@ -0,0 +1,1652 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/mntent.h> +#include <sys/mount.h> +#include <sys/cmn_err.h> +#include "fs/fs_subr.h" +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/varargs.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/mkdev.h> +#include <sys/modctl.h> +#include <sys/refstr.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/bootconf.h> +#include <sys/sunddi.h> +#include <sys/dnlc.h> +#include <sys/dmu_objset.h> +#include <sys/spa_boot.h> + +int zfsfstype; +vfsops_t *zfs_vfsops = NULL; +static major_t zfs_major; +static minor_t zfs_minor; +static kmutex_t zfs_dev_mtx; + +static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); +static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); +static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); +static int zfs_root(vfs_t *vfsp, vnode_t **vpp); +static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); +static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); +static void zfs_freevfs(vfs_t *vfsp); + +static const fs_operation_def_t zfs_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, + VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, + VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, + VFSNAME_ROOT, { .vfs_root = zfs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, + VFSNAME_SYNC, { .vfs_sync = zfs_sync }, + VFSNAME_VGET, { .vfs_vget = zfs_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, + NULL, NULL +}; + +static const fs_operation_def_t zfs_vfsops_eio_template[] = { + VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, + NULL, NULL +}; + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; +static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; +static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; +static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; + +/* + * MO_DEFAULT is not used since the default value is determined + * by the equivalent property. + */ +static mntopt_t mntopts[] = { + { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, + { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, + { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, + { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } +}; + +static mntopts_t zfs_mntopts = { + sizeof (mntopts) / sizeof (mntopt_t), + mntopts +}; + +/*ARGSUSED*/ +int +zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) +{ + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * SYNC_ATTR is used by fsflush() to force old filesystems like UFS + * to sync metadata, which they would otherwise cache indefinitely. + * Semantically, the only requirement is that the sync be initiated. + * The DMU syncs out txgs frequently, so there's nothing to do. + */ + if (flag & SYNC_ATTR) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, UINT64_MAX, 0); + else + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. + */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval < SPA_MINBLOCKSIZE || + newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) + newval = SPA_MAXBLOCKSIZE; + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->vfs_bsize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +/* + * The nbmand mount option can be changed at mount time. + * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; + int readonly, do_readonly = B_FALSE; + int setuid, do_setuid = B_FALSE; + int exec, do_exec = B_FALSE; + int devices, do_devices = B_FALSE; + int xattr, do_xattr = B_FALSE; + int atime, do_atime = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices = B_FALSE; + setuid = B_FALSE; + do_devices = B_TRUE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { + devices = B_TRUE; + do_devices = B_TRUE; + } + + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + xattr = B_TRUE; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else { + char osname[MAXNAMELEN]; + + dmu_objset_name(os, osname); + if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, + NULL)) { + return (error); + } + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "xattr", xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "recordsize", blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "readonly", readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "devices", devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "setuid", setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "exec", exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "snapdir", snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclmode", acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclinherit", acl_inherit_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "vscan", vscan_changed_cb, zfsvfs); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + /* + * We may attempt to unregister some callbacks that are not + * registered, but this is OK; it will simply return ENOMSG, + * which we will ignore. + */ + (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, + zfsvfs); + (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); + return (error); + +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + + /* + * Parse and replay the intent log. + */ + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector, zfs_unlinked_drain); + + zfs_unlinked_drain(zfsvfs); + zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + } + + if (!zil_disable) + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + return (0); +} + +static void +zfs_freezfsvfs(zfsvfs_t *zfsvfs) +{ + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_online_recv_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrw_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) +{ + dev_t mount_dev; + uint64_t recordsize, readonly; + int error = 0; + int mode; + zfsvfs_t *zfsvfs; + znode_t *zp = NULL; + + ASSERT(vfsp); + ASSERT(osname); + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[]. + */ + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_vfs = vfsp; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_assign = TXG_NOWAIT; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + + /* Initialize the generic filesystem structure. */ + vfsp->vfs_bcount = 0; + vfsp->vfs_data = NULL; + + if (zfs_create_unique_device(&mount_dev) == -1) { + error = ENODEV; + goto out; + } + ASSERT(vfs_devismounted(mount_dev) == 0); + + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; + + vfsp->vfs_dev = mount_dev; + vfsp->vfs_fstype = zfsfstype; + vfsp->vfs_bsize = recordsize; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfsp->vfs_data = zfsvfs; + + if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) + goto out; + + mode = DS_MODE_OWNER; + if (readonly) + mode |= DS_MODE_READONLY; + + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (error == EROFS) { + mode = DS_MODE_OWNER | DS_MODE_READONLY; + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, + &zfsvfs->z_os); + } + + if (error) + goto out; + + if (error = zfs_init_fs(zfsvfs, &zp)) + goto out; + + /* The call to zfs_init_fs leaves the vnode held, release it here. */ + VN_RELE(ZTOV(zp)); + + /* + * Set features for file system. + */ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids) { + vfs_set_feature(vfsp, VFSFT_XVATTR); + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); + vfs_set_feature(vfsp, VFSFT_ACLONCREATE); + } + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + ASSERT(mode & DS_MODE_READONLY); + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + } else { + error = zfsvfs_setup(zfsvfs, B_TRUE); + } + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + if (zfsvfs->z_os) + dmu_objset_close(zfsvfs->z_os); + zfs_freezfsvfs(zfsvfs); + } else { + atomic_add_32(&zfs_active_fs_count, 1); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + struct dsl_dataset *ds; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { + ds = dmu_objset_ds(os); + VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "aclinherit", + acl_inherit_changed_cb, zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "vscan", + vscan_changed_cb, zfsvfs) == 0); + } +} + +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (EINVAL); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". + */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (EINVAL); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + char *zfs_bootfs; + char *zfs_devid; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in the + * boot property "zfs-bootfs" with a format of + * "poolname/root-dataset-objnum". + */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (EBUSY); + /* + * the process of doing a spa_load will require the + * clock to be set before we could (for example) do + * something better by looking at the timestamp on + * an uberblock, so just set it to -1. + */ + clkset(-1); + + if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { + cmn_err(CE_NOTE, "spa_get_bootfs: can not get " + "bootfs name"); + return (EINVAL); + } + zfs_devid = spa_get_bootprop("diskdevid"); + error = spa_import_rootpool(rootfs.bo_name, zfs_devid); + if (zfs_devid) + spa_free_bootprop(zfs_devid); + if (error) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "spa_import_rootpool: error %d", + error); + return (error); + } + if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", + error); + return (error); + } + + spa_free_bootprop(zfs_bootfs); + + if (error = vfs_lock(vfsp)) + return (error); + + if (error = zfs_domount(vfsp, rootfs.bo_name)) { + cmn_err(CE_NOTE, "zfs_domount: error %d", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { + cmn_err(CE_NOTE, "zfs_zget: error %d", error); + goto out; + } + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * Leave rootvp held. The root file system is never unmounted. + */ + + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + return (error); + } else if (why == ROOT_REMOUNT) { + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + return (zfs_register_callbacks(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is equal to anything else other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + char *osname; + pathname_t spn; + int error = 0; + uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE; + int canwrite; + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_REMOUNT) == 0 && + (uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * ZFS does not support passing unparsed data in via MS_DATA. + * Users should use the MS_OPTIONSTR interface; this means + * that all option parsing is already done and the options struct + * can be interrogated. + */ + if ((uap->flags & MS_DATA) && uap->datalen > 0) + return (EINVAL); + + /* + * Get the objset name (the "special" mount argument). + */ + if (error = pn_get(uap->spec, fromspace, &spn)) + return (error); + + osname = spn.pn_path; + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); + if (error == 0) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { + goto out; + } + + if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { + error = EPERM; + goto out; + } + + secpolicy_fs_mount_clearopts(cr, vfsp); + } else { + goto out; + } + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (uap->flags & MS_REMOUNT) { + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + error = zfs_register_callbacks(vfsp); + goto out; + } + + error = zfs_domount(vfsp, osname); + +out: + pn_free(&spn); + return (error); +} + +static int +zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dev32_t d32; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; + statp->f_bsize = zfsvfs->z_max_blksz; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; + statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, statp->f_bfree); + statp->f_favail = statp->f_ffree; /* no "root reservation" */ + statp->f_files = statp->f_ffree + usedobjs; + + (void) cmpldev(&d32, vfsp->vfs_dev); + statp->f_fsid = d32; + + /* + * We're a zfs filesystem. + */ + (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); + + statp->f_flag = vf_to_stf(vfsp->vfs_flag); + + statp->f_namemax = ZFS_MAXNAMELEN; + + /* + * We have all of 32 characters to stuff a string here. + * Is there anything useful we could/should provide? + */ + bzero(statp->f_fstr, sizeof (statp->f_fstr)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + + rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + return (EIO); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relavent for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_dbuf) { + ASSERT(ZTOV(zp)->v_count > 0); + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); + } + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + int ret; + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr); + if (ret) + return (ret); + } + + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL && + (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { + return (ret); + } + + if (!(fflag & MS_FORCE)) { + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the + * number is off by 1 to indicate a hold on the vfs + * structure itself. + * + * The '.zfs' directory maintains a reference of its + * own, and any active references underneath are + * reflected in the vnode count. + */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) + return (EBUSY); + } else { + if (vfsp->vfs_count > 2 || + zfsvfs->z_ctldir->v_count > 1) + return (EBUSY); + } + } + + vfsp->vfs_flag |= VFS_UNMOUNTED; + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_close(os); + } + + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (EINVAL); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* A zero fid_gen means we are in the .zfs control directories */ + if (fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { + *vpp = zfsvfs->z_ctldir; + ASSERT(*vpp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { + VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, + 0, NULL, NULL, NULL, NULL, NULL) == 0); + } else { + VN_HOLD(*vpp); + } + ZFS_EXIT(zfsvfs); + return (0); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if (err = zfs_zget(zfsvfs, object, &zp)) { + ZFS_EXIT(zfsvfs); + return (err); + } + zp_gen = zp->z_phys->zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + VN_RELE(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + *mode = zfsvfs->z_os->os_mode; + dmu_objset_name(zfsvfs->z_os, name); + dmu_objset_close(zfsvfs->z_os); + + return (0); +} + +/* + * Reopen zfsvfs_t::z_os and release VOPs. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +{ + int err; + + ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (err) { + zfsvfs->z_os = NULL; + } else { + znode_t *zp; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + } + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't reopen zfsvfs::z_os, force + * unmount this file system. + */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED()); + } + return (err); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + int i; + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + + zfs_fuid_destroy(zfsvfs); + zfs_freezfsvfs(zfsvfs); + + atomic_add_32(&zfs_active_fs_count, -1); +} + +/* + * VFS_INIT() initialization. Note that there is no VFS_FINI(), + * so we can't safely do any non-idempotent initialization here. + * Leave that to zfs_init() and zfs_fini(), which are called + * from the module's _init() and _fini() entry points. + */ +/*ARGSUSED*/ +static int +zfs_vfsinit(int fstype, char *name) +{ + int error; + + zfsfstype = fstype; + + /* + * Setup vfsops and vnodeops tables. + */ + error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); + if (error != 0) { + cmn_err(CE_WARN, "zfs: bad vfs ops template"); + } + + error = zfs_create_op_tables(); + if (error) { + zfs_remove_op_tables(); + cmn_err(CE_WARN, "zfs: bad vnode ops template"); + (void) vfs_freevfsops_by_type(zfsfstype); + return (error); + } + + mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + + /* + * Unique major number for all zfs mounts. + * If we run out of 32-bit minors, we'll getudev() another major. + */ + zfs_major = ddi_name_to_major(ZFS_DRIVER); + zfs_minor = ZFS_MIN_MINOR; + + return (0); +} + +void +zfs_init(void) +{ + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); +} + +void +zfs_fini(void) +{ + zfsctl_fini(); + zfs_znode_fini(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +int +zfs_set_version(const char *name, uint64_t newvers) +{ + int error; + objset_t *os; + dmu_tx_t *tx; + uint64_t curvers; + + /* + * XXX for now, require that the filesystem be unmounted. Would + * be nice to find the zfsvfs_t and just update that if + * possible. + */ + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (EINVAL); + + error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); + if (error) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &curvers); + if (error) + goto out; + if (newvers < curvers) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, + &newvers, tx); + + spa_history_internal_log(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, CRED(), + "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, + dmu_objset_id(os)); + dmu_tx_commit(tx); + +out: + dmu_objset_close(os); + return (error); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + const char *pname; + int error = ENOENT; + + /* + * Look up the file system's value for the property. For the + * version property, we look up a slightly different string. + */ + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + return (error); +} + +static vfsdef_t vfw = { + VFSDEF_VERSION, + MNTTYPE_ZFS, + zfs_vfsinit, + VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| + VSW_XID, + &zfs_mntopts +}; + +struct modlfs zfs_modlfs = { + &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw +}; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c new file mode 100644 index 000000000..8e0037e37 --- /dev/null +++ b/module/zfs/zfs_vnops.c @@ -0,0 +1,4561 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Portions Copyright 2007 Jeremy Teo */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/uio.h> +#include <sys/vmsystm.h> +#include <sys/atomic.h> +#include <sys/vm.h> +#include <vm/seg_vn.h> +#include <vm/pvn.h> +#include <vm/as.h> +#include <vm/kpm.h> +#include <vm/seg_kpm.h> +#include <sys/mman.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/filio.h> +#include <sys/sid.h> +#include "fs/fs_subr.h" +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/dnlc.h> +#include <sys/zfs_rlock.h> +#include <sys/extdirent.h> +#include <sys/kidmap.h> +#include <sys/cred_impl.h> +#include <sys/attr.h> + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). + * In normal operation, this will be TXG_NOWAIT. During ZIL replay, + * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * This is critical because we don't want to block while holding locks. + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing to + * use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, seq, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, int cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_phys->zp_size; + if (noff >= file_sz) { + return (ENXIO); + } + + if (cmd == _FIO_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + /* end of file? */ + if ((error == ESRCH) || (noff > file_sz)) { + /* + * Handle the virtual hole at the end of file. + */ + if (hole) { + *off = file_sz; + return (0); + } + return (ENXIO); + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, + int *rvalp, caller_context_t *ct) +{ + offset_t off; + int error; + zfsvfs_t *zfsvfs; + znode_t *zp; + + switch (com) { + case _FIOFFS: + return (zfs_sync(vp->v_vfsp, 0, cred)); + + /* + * The following two ioctls are used by bfu. Faking out, + * necessary to avoid bfu errors. + */ + case _FIOGDIO: + case _FIOSDIO: + return (0); + + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: + if (ddi_copyin((void *)data, &off, sizeof (off), flag)) + return (EFAULT); + + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); + if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) + return (EFAULT); + return (0); + } + return (ENOTTY); +} + +/* + * Utility functions to map and unmap a single physical page. These + * are used to manage the mappable copies of ZFS file data, and therefore + * do not update ref/mod bits. + */ +caddr_t +zfs_map_page(page_t *pp, enum seg_rw rw) +{ + if (kpm_enable) + return (hat_kpm_mapin(pp, 0)); + ASSERT(rw == S_READ || rw == S_WRITE); + return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), + (caddr_t)-1)); +} + +void +zfs_unmap_page(page_t *pp, caddr_t addr) +{ + if (kpm_enable) { + hat_kpm_mapout(pp, 0, addr); + } else { + ppmapout(addr); + } +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int64_t start, off; + int len = nbytes; + int error = 0; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + uint64_t woff = uio->uio_loffset; + + /* + * We don't want a new page to "appear" in the middle of + * the file update (because it may not get the write + * update data), so we grab a lock to block + * zfs_getpage(). + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (pp = page_lookup(vp, start, SE_SHARED)) { + caddr_t va; + + rw_exit(&zp->z_map_lock); + va = zfs_map_page(pp, S_WRITE); + error = uiomove(va+off, bytes, UIO_WRITE, uio); + if (error == 0) { + dmu_write(zfsvfs->z_os, zp->z_id, + woff, bytes, va+off, tx); + } + zfs_unmap_page(pp, va); + page_unlock(pp); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + uio, bytes, tx); + rw_exit(&zp->z_map_lock); + } + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + int64_t start, off; + int len = nbytes; + int error = 0; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if (pp = page_lookup(vp, start, SE_SHARED)) { + caddr_t va; + + va = zfs_map_page(pp, S_READ); + error = uiomove(va + off, bytes, UIO_READ, uio); + zfs_unmap_page(pp, va); + page_unlock(pp); + } else { + error = dmu_read_uio(os, zp->z_id, uio, bytes); + } + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 if success + * error code if failure + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + ssize_t n, nbytes; + int error; + rl_t *rl; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + os = zfsvfs->z_os; + + if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Check for mandatory locks + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (error = chklock(vp, FREAD, + uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (ioflag & FRSYNC) + zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + + /* + * Lock the range against changes. + */ + rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_phys->zp_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_phys->zp_size); + n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (vn_has_cached_data(vp)) + error = mappedread(vp, nbytes, uio); + else + error = dmu_read_uio(os, zp->z_id, uio, nbytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; + break; + } + + n -= nbytes; + } + +out: + zfs_range_unlock(rl); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND flag set if in append mode. + * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = uio->uio_llimit; + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + offset_t woff; + ssize_t n, nbytes; + rl_t *rl; + int max_blksz = zfsvfs->z_max_blksz; + uint64_t pflags; + int error; + + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If immutable or not appending then return EPERM + */ + pflags = zp->z_phys->zp_flags; + if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_phys->zp_size))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + zilog = zfsvfs->z_log; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + */ + uio_prefaultpages(n, uio); + + /* + * If in append mode, set the io offset pointer to eof. + */ + if (ioflag & FAPPEND) { + /* + * Range lock for a file append: + * The value for the start of range will be determined by + * zfs_range_lock() (to guarantee append semantics). + * If this write will cause the block size to increase, + * zfs_range_lock() will lock the entire file, so we must + * later reduce the range after we grow the block size. + */ + rl = zfs_range_lock(zp, 0, n, RL_APPEND); + if (rl->r_len == UINT64_MAX) { + /* overlocked, zp_size can't change */ + woff = uio->uio_loffset = zp->z_phys->zp_size; + } else { + woff = uio->uio_loffset = rl->r_off; + } + } else { + woff = uio->uio_loffset; + /* + * Validate file offset + */ + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * If we need to grow the block size then zfs_range_lock() + * will lock a wider range than we request here. + * Later after growing the block size we reduce the range. + */ + rl = zfs_range_lock(zp, woff, n, RL_WRITER); + } + + if (woff >= limit) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (EFBIG); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* + * Check for mandatory locks + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (error); + } + end_size = MAX(zp->z_phys->zp_size, woff + n); + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + /* + * Start a transaction. + */ + woff = uio->uio_loffset; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + continue; + } + dmu_tx_abort(tx); + break; + } + + /* + * If zfs_range_lock() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since zfs_range_reduce() will + * shrink down r_len to the appropriate size. + */ + if (rl->r_len == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_range_reduce(rl, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + rw_enter(&zp->z_map_lock, RW_READER); + + tx_bytes = uio->uio_resid; + if (vn_has_cached_data(vp)) { + rw_exit(&zp->z_map_lock); + error = mappedwrite(vp, nbytes, uio, tx); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + uio, nbytes, tx); + rw_exit(&zp->z_map_lock); + } + tx_bytes -= uio->uio_resid; + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the excute bits is set. + * + * It would be nice to to this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (zp->z_phys->zp_mode & S_ISUID) != 0 && + zp->z_phys->zp_uid == 0) != 0) { + zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + } + mutex_exit(&zp->z_acl_lock); + + /* + * Update time stamp. NOTE: This marks the bonus buffer as + * dirty, so we don't have to do it again for zp_size. + */ + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) + (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + uio->uio_loffset); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + } + + zfs_range_unlock(rl); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (FSYNC | FDSYNC)) + zil_commit(zilog, zp->z_last_itx, zp->z_id); + + ZFS_EXIT(zfsvfs); + return (0); +} + +void +zfs_get_done(dmu_buf_t *db, void *vzgd) +{ + zgd_t *zgd = (zgd_t *)vzgd; + rl_t *rl = zgd->zgd_rl; + vnode_t *vp = ZTOV(rl->r_zp); + + dmu_buf_rele(db, vzgd); + zfs_range_unlock(rl); + VN_RELE(vp); + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + kmem_free(zgd, sizeof (zgd_t)); +} + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t off = lr->lr_offset; + dmu_buf_t *db; + rl_t *rl; + zgd_t *zgd; + int dlen = lr->lr_length; /* length of user data */ + int error = 0; + + ASSERT(zio); + ASSERT(dlen != 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) + return (ENOENT); + if (zp->z_unlinked) { + VN_RELE(ZTOV(zp)); + return (ENOENT); + } + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) { /* immediate write */ + rl = zfs_range_lock(zp, off, dlen, RL_READER); + /* test for truncation needs to be done while range locked */ + if (off >= zp->z_phys->zp_size) { + error = ENOENT; + goto out; + } + VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + } else { /* indirect write */ + uint64_t boff; /* block starting offset */ + + /* + * Have to lock the whole block to ensure when it's + * written out and it's checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + if (ISP2(zp->z_blksz)) { + boff = P2ALIGN_TYPED(off, zp->z_blksz, + uint64_t); + } else { + boff = 0; + } + dlen = zp->z_blksz; + rl = zfs_range_lock(zp, boff, dlen, RL_READER); + if (zp->z_blksz == dlen) + break; + zfs_range_unlock(rl); + } + /* test for truncation needs to be done while range locked */ + if (off >= zp->z_phys->zp_size) { + error = ENOENT; + goto out; + } + zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_rl = rl; + zgd->zgd_zilog = zfsvfs->z_log; + zgd->zgd_bp = &lr->lr_blkptr; + VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); + ASSERT(boff == db->db_offset); + lr->lr_blkoff = off - boff; + error = dmu_sync(zio, db, &lr->lr_blkptr, + lr->lr_common.lrc_txg, zfs_get_done, zgd); + ASSERT((error && error != EINPROGRESS) || + lr->lr_length <= zp->z_blksz); + if (error == 0) + zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); + /* + * If we get EINPROGRESS, then we need to wait for a + * write IO initiated by dmu_sync() to complete before + * we can release this dbuf. We will finish everything + * up in the zfs_get_done() callback. + */ + if (error == EINPROGRESS) + return (0); + dmu_buf_rele(db, zgd); + kmem_free(zgd, sizeof (zgd_t)); + } +out: + zfs_range_unlock(rl); + VN_RELE(ZTOV(zp)); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * ct - caller context + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +static int +zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *vpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, + B_FALSE, cr)) { + VN_RELE(*vpp); + *vpp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + if (dvp->v_type != VDIR) { + ZFS_EXIT(zfsvfs); + return (ENOTDIR); + } + + /* + * Check accessibility of directory. + */ + + if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); + if (error == 0) { + /* + * Convert device special files + */ + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +static int +zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } +top: + *vpp = NULL; + + if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~VSVTX; + + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + VN_HOLD(dvp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible VN_HOLD(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (strcmp(name, "..") == 0) + error = EISDIR; + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + } + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + ZFS_EXIT(zfsvfs); + if (dl) + zfs_dirent_unlock(dl); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + if ((dzp->z_phys->zp_flags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || + IS_EPHEMERAL(gid)) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + } + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + (void) zfs_link_create(dl, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + } else { + int aflags = (flag & FAPPEND) ? V_APPEND : 0; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl == EXCL) { + error = EEXIST; + goto out; + } + /* + * Can't open a directory for writing. + */ + if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { + error = EISDIR; + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. + */ + if ((ZTOV(zp)->v_type == VREG) && + (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + zfs_dirent_unlock(dl); + dl = NULL; + error = zfs_freesp(zp, 0, 0, mode, TRUE); + if (error == 0) { + vnevent_create(ZTOV(zp), ct); + } + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + VN_RELE(ZTOV(zp)); + } else { + *vpp = ZTOV(zp); + /* + * If vnode is for a device return a specfs vnode instead. + */ + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + error = ENOSYS; + } + *vpp = svp; + } + } + if (aclp) + zfs_acl_free(aclp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ +/*ARGSUSED*/ +static int +zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, + int flags) +{ + znode_t *zp, *dzp = VTOZ(dvp); + znode_t *xzp = NULL; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; + int error; + int zflg = ZEXISTS; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } + +top: + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp)) { + if (realnmp) + pn_free(realnmp); + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = EPERM; + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + if (realnmp) + dnlc_remove(dvp, realnmp->pn_buf); + else + dnlc_remove(dvp, name); + + mutex_enter(&vp->v_lock); + may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); + mutex_exit(&vp->v_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_bonus(tx, zp->z_id); + if (may_delete_now) { + toobig = + zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? */ + if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { + /* XXX - do we need this if we are deleting? */ + dmu_tx_hold_bonus(tx, xattr_obj); + } + + /* are there any additional acls */ + if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && + may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + if (realnmp) + pn_free(realnmp); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + mutex_enter(&vp->v_lock); + delete_now = may_delete_now && !toobig && + vp->v_count == 1 && !vn_has_cached_data(vp) && + zp->z_phys->zp_xattr == xattr_obj && + zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; + mutex_exit(&vp->v_lock); + } + + if (delete_now) { + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT3U(error, ==, 0); + ASSERT3U(xzp->z_phys->zp_links, ==, 2); + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = 1; + xzp->z_phys->zp_links = 0; + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + zp->z_phys->zp_xattr = 0; /* probably unnecessary */ + } + mutex_enter(&zp->z_lock); + mutex_enter(&vp->v_lock); + vp->v_count--; + ASSERT3U(vp->v_count, ==, 0); + mutex_exit(&vp->v_lock); + mutex_exit(&zp->z_lock); + zfs_znode_delete(zp, tx); + } else if (unlinked) { + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); + + dmu_tx_commit(tx); +out: + if (realnmp) + pn_free(realnmp); + + zfs_dirent_unlock(dl); + + if (!delete_now) { + VN_RELE(vp); + } else if (xzp) { + /* this rele is delayed to prevent nesting transactions */ + VN_RELE(ZTOV(xzp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +static int +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, + caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + int zf = ZNEW; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + + ASSERT(vap->va_type == VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & AT_XVATTR) + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + */ +top: + *vpp = NULL; + + if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + } + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || + IS_EPHEMERAL(gid)) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + + if (aclp) + zfs_acl_free(aclp); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dl, zp, tx, ZNEW); + + *vpp = ZTOV(zp); + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + if (vp == cwd) { + error = EINVAL; + goto out; + } + + vnevent_rmdir(vp, dvp, name, ct); + + /* + * Grab a lock on the directory to make sure that noone is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + VN_RELE(vp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure. + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +static int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + edirent_t *eodp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + int local_eof; + int outcount; + int error; + uint8_t prefetch; + boolean_t check_sysattrs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + error = 0; + os = zfsvfs->z_os; + offset = uio->uio_loffset; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + odp = (struct dirent64 *)iovp->iov_base; + } + eodp = (struct edirent *)odp; + + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space. + */ + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + off64_t *next; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = zp->z_phys->zp_parent; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + } else { + /* + * Grab next entry. + */ + if (error = zap_cursor_retrieve(&zc, &zap)) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = ENXIO; + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + + if (check_sysattrs && !zap.za_normalization_conflict) { + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); + } + } + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = EINVAL; + goto update; + } + break; + } + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + /* NOTE: d_off is the offset for the *next* entry */ + next = &(odp->d_off); + (void) strncpy(odp->d_name, zap.za_name, + DIRENT64_NAMELEN(reclen)); + odp = (dirent64_t *)((intptr_t)odp + reclen); + } + outcount += reclen; + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0); + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + *next = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { + /* + * Reset the pointer. + */ + offset = uio->uio_loffset; + } + +update: + zap_cursor_fini(&zc); + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +static int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + /* + * Regardless of whether this is required for standards conformance, + * this is the logical behavior when fsync() is called on a file with + * dirty pages. We use B_ASYNC since the ZIL transactions are already + * going to be pushed out as part of the zil_commit(). + */ + if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && + (vp->v_type == VREG) && !(IS_SWAPVP(vp))) + (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + ZFS_EXIT(zfsvfs); + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * ct - caller context + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_phys_t *pzp; + int error = 0; + uint64_t links; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + pzp = zp->z_phys; + + mutex_enter(&zp->z_lock); + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && + (pzp->zp_uid != crgetuid(cr))) { + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + vap->va_type = vp->v_type; + vap->va_mode = pzp->zp_mode & MODEMASK; + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; + vap->va_nodeid = zp->z_id; + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) + links = pzp->zp_links + 1; + else + links = pzp->zp_links; + vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ + vap->va_size = pzp->zp_size; + vap->va_rdev = vp->v_rdev; + vap->va_seq = zp->z_seq; + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. + */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((pzp->zp_flags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((pzp->zp_flags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((pzp->zp_flags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((pzp->zp_flags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((pzp->zp_flags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG && + (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + /* + * Only VREG files have anti-virus scanstamps, so we + * won't conflict with symlinks in the bonus buffer. + */ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len <= doi.doi_bonus_size) { + /* + * pzp points to the start of the + * znode_phys_t. pzp + 1 points to the + * first byte after the znode_phys_t. + */ + (void) memcpy(xoap->xoa_av_scanstamp, + pzp + 1, + sizeof (xoap->xoa_av_scanstamp)); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + + mutex_exit(&zp->z_lock); + + dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: vp - vnode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. + */ +/* ARGSUSED */ +static int +zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + znode_phys_t *pzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + uint_t mask = vap->va_mask; + uint_t saved_mask; + int trim_mask = 0; + uint64_t new_mode; + znode_t *attrzp; + int need_policy = FALSE; + int err; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + pzp = zp->z_phys; + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (EISDIR); + } + + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((pzp->zp_flags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (EOVERFLOW); + } + } + +top: + attrzp = NULL; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (EROFS); + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = pzp->zp_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = pzp->zp_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + if ((need_policy == FALSE) && + (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && + xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && + xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && + xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NODUMP) && + xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && + xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || + ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && + ((vp->v_type != VREG && xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + + if (mask & AT_MODE) { + uint64_t pmode = pzp->zp_mode; + + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + if (pzp->zp_acl.z_acl_extern_obj) { + /* Are we upgrading ACL from old V0 format to new V1 */ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + pzp->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + } + + if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (err); + } + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + + err = dmu_tx_assign(tx, zfsvfs->z_assign); + if (err) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); + + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + + dmu_buf_will_dirty(zp->z_dbuf, tx); + + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + mutex_enter(&zp->z_lock); + + if (mask & AT_MODE) { + mutex_enter(&zp->z_acl_lock); + zp->z_phys->zp_mode = new_mode; + err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + ASSERT3U(err, ==, 0); + mutex_exit(&zp->z_acl_lock); + } + + if (attrzp) + mutex_enter(&attrzp->z_lock); + + if (mask & AT_UID) { + pzp->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); + if (attrzp) { + attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); + } + } + + if (mask & AT_GID) { + pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, + cr, ZFS_GROUP, tx, &fuidp); + if (attrzp) + attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, + vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + } + + if (aclp) + zfs_acl_free(aclp); + + if (attrzp) + mutex_exit(&attrzp->z_lock); + + if (mask & AT_ATIME) + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + + if (mask & AT_MTIME) + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ + if (mask & AT_SIZE) + zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); + else if (mask != 0) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + ASSERT(vp->v_type == VREG); + + /* Grow the bonus buffer if necessary. */ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); + } + zfs_xvattr_set(zp, xvap); + } + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + mutex_exit(&zp->z_lock); + + if (attrzp) + VN_RELE(ZTOV(attrzp)); + + dmu_tx_commit(tx); + + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + VN_RELE(ZTOV(zl->zl_znode)); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = zp->z_zfsvfs->z_root; + uint64_t *oidp = &zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. + */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = &zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (*oidp == szp->z_id) /* We're a descendant of szp */ + return (EINVAL); + + if (*oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + oidp = &zp->z_phys->zp_parent; + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *tdzp, *szp, *tzp; + znode_t *sdzp = VTOZ(sdvp); + zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; + zilog_t *zilog; + vnode_t *realvp; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr; + int error = 0; + int zflg = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; + + /* + * Make sure we have the real vp for the target directory. + */ + if (VOP_REALVP(tdvp, &realvp, ct) == 0) + tdvp = realvp; + + if (tdvp->v_vfsp != sdvp->v_vfsp) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } + + tdzp = VTOZ(tdvp); + ZFS_VERIFY_ZP(tdzp); + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != + (sdzp->z_phys->zp_flags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + /* + * First compare the two name arguments without + * considering any case folding. + */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + VN_RELE(ZTOV(tzp)); + } + if (strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + VN_RELE(ZTOV(szp)); + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) + goto out; + + if (ZTOV(szp)->v_type == VDIR) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) + goto out; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (ZTOV(szp)->v_type == VDIR) { + if (ZTOV(tzp)->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + } else { + if (ZTOV(tzp)->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + if (tzp) + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. + */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ + dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) + dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ + if (tzp) + dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + VN_RELE(ZTOV(szp)); + if (tzp) + VN_RELE(ZTOV(tzp)); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + ASSERT(error == 0); + + zfs_log_rename(zilog, tx, + TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), + sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + + /* Update path information for the target vnode */ + vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); + } + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + VN_RELE(ZTOV(szp)); + if (tzp) + VN_RELE(ZTOV(tzp)); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dvp - Directory to contain new symbolic link. + * link - Name for new symlink entry. + * vap - Attributes of new entry. + * target - Target path of new symlink. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + int len = strlen(link); + int error; + int zflg = ZNEW; + zfs_fuid_info_t *fuidp = NULL; + + ASSERT(vap->va_type == VLNK); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (ENAMETOOLONG); + } + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + + /* + * Create a new object for the symlink. + * Put the link content into bonus buffer if it will fit; + * otherwise, store it just like any other file data. + */ + if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); + if (len != 0) + bcopy(link, zp->z_phys + 1, len); + } else { + dmu_buf_t *dbp; + + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); + /* + * Nothing can access the znode yet so no locking needed + * for growing the znode's blocksize. + */ + zfs_grow_blocksize(zp, len, tx); + + VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp)); + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } + zp->z_phys->zp_size = len; + + /* + * Insert the new object into the directory. + */ + (void) zfs_link_create(dl, zp, tx, ZNEW); +out: + if (error == 0) { + uint64_t txtype = TX_SYMLINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + } + if (fuidp) + zfs_fuid_info_free(fuidp); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + VN_RELE(ZTOV(zp)); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by vp. + * + * IN: vp - vnode of symbolic link. + * uoip - structure to contain the link path. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - structure to contain the link path. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + size_t bufsz; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + bufsz = (size_t)zp->z_phys->zp_size; + if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { + error = uiomove(zp->z_phys + 1, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdvp referencing svp. + * + * IN: tdvp - Directory to contain new entry. + * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +static int +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *dzp = VTOZ(tdvp); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + vnode_t *realvp; + int error; + int zf = ZNEW; + uid_t owner; + + ASSERT(tdvp->v_type == VDIR); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (VOP_REALVP(svp, &realvp, ct) == 0) + svp = realvp; + + if (svp->v_vfsp != tdvp->v_vfsp) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } + szp = VTOZ(svp); + ZFS_VERIFY_ZP(szp); + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + +top: + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_phys->zp_flags & ZFS_XATTR) != + (dzp->z_phys->zp_flags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (svp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && + secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, szp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, dzp, szp, name); + } + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (error == 0) { + vnevent_link(svp, ct); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * zfs_null_putapage() is used when the file system has been force + * unmounted. It just drops the pages. + */ +/* ARGSUSED */ +static int +zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, + size_t *lenp, int flags, cred_t *cr) +{ + pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); + return (0); +} + +/* + * Push a page out to disk, klustering if possible. + * + * IN: vp - file to push page to. + * pp - page to push. + * flags - additional flags. + * cr - credentials of caller. + * + * OUT: offp - start of range pushed. + * lenp - len of range pushed. + * + * RETURN: 0 if success + * error code if failure + * + * NOTE: callers must have locked the page to be pushed. On + * exit, the page (and all other pages in the kluster) must be + * unlocked. + */ +/* ARGSUSED */ +static int +zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, + size_t *lenp, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + dmu_tx_t *tx; + rl_t *rl; + u_offset_t off, koff; + size_t len, klen; + uint64_t filesz; + int err; + + filesz = zp->z_phys->zp_size; + off = pp->p_offset; + len = PAGESIZE; + /* + * If our blocksize is bigger than the page size, try to kluster + * muiltiple pages so that we write a full block (thus avoiding + * a read-modify-write). + */ + if (off < filesz && zp->z_blksz > PAGESIZE) { + if (!ISP2(zp->z_blksz)) { + /* Only one block in the file. */ + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = 0; + } else { + klen = zp->z_blksz; + koff = P2ALIGN(off, (u_offset_t)klen); + } + ASSERT(koff <= filesz); + if (koff + klen > filesz) + klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); + pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); + } + ASSERT3U(btop(len), ==, btopr(len)); +top: + rl = zfs_range_lock(zp, off, len, RL_WRITER); + /* + * Can't push pages past end-of-file. + */ + filesz = zp->z_phys->zp_size; + if (off >= filesz) { + /* ignore all pages */ + err = 0; + goto out; + } else if (off + len > filesz) { + int npages = btopr(filesz - off); + page_t *trunc; + + page_list_break(&pp, &trunc, npages); + /* ignore pages past end of file */ + if (trunc) + pvn_write_done(trunc, flags); + len = filesz - off; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + dmu_tx_hold_bonus(tx, zp->z_id); + err = dmu_tx_assign(tx, zfsvfs->z_assign); + if (err != 0) { + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + zfs_range_unlock(rl); + dmu_tx_wait(tx); + dmu_tx_abort(tx); + err = 0; + goto top; + } + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz <= PAGESIZE) { + caddr_t va = zfs_map_page(pp, S_READ); + ASSERT3U(len, <=, PAGESIZE); + dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); + zfs_unmap_page(pp, va); + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); + } + + if (err == 0) { + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); + dmu_tx_commit(tx); + } + +out: + zfs_range_unlock(rl); + pvn_write_done(pp, (err ? B_ERROR : 0) | flags); + if (offp) + *offp = off; + if (lenp) + *lenp = len; + + return (err); +} + +/* + * Copy the portion of the file indicated from pages into the file. + * The pages are stored in a page list attached to the files vnode. + * + * IN: vp - vnode of file to push page data to. + * off - position in file to put data. + * len - amount of data to write. + * flags - flags to control the operation. + * cr - credentials of caller. + * ct - caller context. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + page_t *pp; + size_t io_len; + u_offset_t io_off; + uint64_t filesz; + int error = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (len == 0) { + /* + * Search the entire vp list for pages >= off. + */ + error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, + flags, cr); + goto out; + } + + filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ + if (off > filesz) { + /* past end of file */ + ZFS_EXIT(zfsvfs); + return (0); + } + + len = MIN(len, filesz - off); + + for (io_off = off; io_off < off + len; io_off += io_len) { + if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { + pp = page_lookup(vp, io_off, + (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); + } else { + pp = page_lookup_nowait(vp, io_off, + (flags & B_FREE) ? SE_EXCL : SE_SHARED); + } + + if (pp != NULL && pvn_getdirty(pp, flags)) { + int err; + + /* + * Found a dirty page to push + */ + err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); + if (err) + error = err; + } else { + io_len = PAGESIZE; + } + } +out: + if ((flags & B_ASYNC) == 0) + zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_dbuf == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + if (vn_has_cached_data(vp)) { + (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, + B_INVAL, cr); + } + + mutex_enter(&zp->z_lock); + vp->v_count = 0; /* count arrives as 1 */ + mutex_exit(&zp->z_lock); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + zfs_znode_free(zp); + return; + } + + /* + * Attempt to push any data in the page cache. If this fails + * we will get kicked out later in zfs_zinactive(). + */ + if (vn_has_cached_data(vp)) { + (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, + cr); + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = 0; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +/* + * Bounds-check the seek operation. + * + * IN: vp - vnode seeking within + * ooff - old file offset + * noffp - pointer to new file offset + * ct - caller context + * + * RETURN: 0 if success + * EINVAL if new offset invalid + */ +/* ARGSUSED */ +static int +zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (0); + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); +} + +/* + * Pre-filter the generic locking function to trap attempts to place + * a mandatory lock on a memory mapped file. + */ +static int +zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, + flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * We are following the UFS semantics with respect to mapcnt + * here: If we see that the file is mapped already, then we will + * return an error, but we don't worry about races between this + * function and zfs_map(). + */ + if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); + } + error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * If we can't find a page in the cache, we will create a new page + * and fill it with file data. For efficiency, we may try to fill + * multiple pages at once (klustering). + */ +static int +zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, + caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) +{ + znode_t *zp = VTOZ(vp); + page_t *pp, *cur_pp; + objset_t *os = zp->z_zfsvfs->z_os; + caddr_t va; + u_offset_t io_off, total; + uint64_t oid = zp->z_id; + size_t io_len; + uint64_t filesz; + int err; + + /* + * If we are only asking for a single page don't bother klustering. + */ + filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ + if (off >= filesz) + return (EFAULT); + if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + io_off = off; + io_len = PAGESIZE; + pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); + } else { + /* + * Try to fill a kluster of pages (a blocks worth). + */ + size_t klen; + u_offset_t koff; + + if (!ISP2(zp->z_blksz)) { + /* Only one block in the file. */ + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = 0; + } else { + /* + * It would be ideal to align our offset to the + * blocksize but doing so has resulted in some + * strange application crashes. For now, we + * leave the offset as is and only adjust the + * length if we are off the end of the file. + */ + koff = off; + klen = plsz; + } + ASSERT(koff <= filesz); + if (koff + klen > filesz) + klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; + ASSERT3U(off, >=, koff); + ASSERT3U(off, <, koff + klen); + pp = pvn_read_kluster(vp, off, seg, addr, &io_off, + &io_len, koff, klen, 0); + } + if (pp == NULL) { + /* + * Some other thread entered the page before us. + * Return to zfs_getpage to retry the lookup. + */ + *pl = NULL; + return (0); + } + + /* + * Fill the pages in the kluster. + */ + cur_pp = pp; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + ASSERT3U(io_off, ==, cur_pp->p_offset); + va = zfs_map_page(cur_pp, S_WRITE); + err = dmu_read(os, oid, io_off, PAGESIZE, va); + zfs_unmap_page(cur_pp, va); + if (err) { + /* On error, toss the entire kluster */ + pvn_read_done(pp, B_ERROR); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; + return (err); + } + cur_pp = cur_pp->p_next; + } +out: + /* + * Fill in the page list array from the kluster. If + * there are too many pages in the kluster, return + * as many pages as possible starting from the desired + * offset `off'. + * NOTE: the page list will always be null terminated. + */ + pvn_plist_init(pp, pl, plsz, off, io_len, rw); + + return (0); +} + +/* + * Return pointers to the pages for the file region [off, off + len] + * in the pl array. If plsz is greater than len, this function may + * also return page pointers from before or after the specified + * region (i.e. some region [off', off' + plsz]). These additional + * pages are only returned if they are already in the cache, or were + * created as part of a klustered read. + * + * IN: vp - vnode of file to get data from. + * off - position in file to get data from. + * len - amount of data to retrieve. + * plsz - length of provided page list. + * seg - segment to obtain pages for. + * addr - virtual address of fault. + * rw - mode of created pages. + * cr - credentials of caller. + * ct - caller context. + * + * OUT: protp - protection mode of created pages. + * pl - list of pages created. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, + enum seg_rw rw, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + page_t *pp, **pl0 = pl; + int need_unlock = 0, err = 0; + offset_t orig_off; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (protp) + *protp = PROT_ALL; + + /* no faultahead (for now) */ + if (pl == NULL) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* can't fault past EOF */ + if (off >= zp->z_phys->zp_size) { + ZFS_EXIT(zfsvfs); + return (EFAULT); + } + orig_off = off; + + /* + * If we already own the lock, then we must be page faulting + * in the middle of a write to this file (i.e., we are writing + * to this file using data from a mapped region of the file). + */ + if (rw_owner(&zp->z_map_lock) != curthread) { + rw_enter(&zp->z_map_lock, RW_WRITER); + need_unlock = TRUE; + } + + /* + * Loop through the requested range [off, off + len] looking + * for pages. If we don't find a page, we will need to create + * a new page and fill it with data from the file. + */ + while (len > 0) { + if (plsz < PAGESIZE) + break; + if (pp = page_lookup(vp, off, SE_SHARED)) { + *pl++ = pp; + off += PAGESIZE; + addr += PAGESIZE; + len -= PAGESIZE; + plsz -= PAGESIZE; + } else { + err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); + if (err) + goto out; + /* + * klustering may have changed our region + * to be block aligned. + */ + if (((pp = *pl) != 0) && (off != pp->p_offset)) { + int delta = off - pp->p_offset; + len += delta; + off -= delta; + addr -= delta; + } + while (*pl) { + pl++; + off += PAGESIZE; + addr += PAGESIZE; + plsz -= PAGESIZE; + if (len > PAGESIZE) + len -= PAGESIZE; + else + len = 0; + } + } + } + + /* + * Fill out the page array with any pages already in the cache. + */ + while (plsz > 0) { + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) + break; + *pl++ = pp; + off += PAGESIZE; + plsz -= PAGESIZE; + } + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); +out: + /* + * We can't grab the range lock for the page as reader which would + * stop truncation as this leads to deadlock. So we need to recheck + * the file size. + */ + if (orig_off >= zp->z_phys->zp_size) + err = EFAULT; + if (err) { + /* + * Release any pages we have previously locked. + */ + while (pl > pl0) + page_unlock(*--pl); + } + + *pl = NULL; + + if (need_unlock) + rw_exit(&zp->z_map_lock); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Request a memory map for a section of a file. This code interacts + * with common code and the VM system as follows: + * + * common code calls mmap(), which ends up in smmap_common() + * + * this calls VOP_MAP(), which takes you into (say) zfs + * + * zfs_map() calls as_map(), passing segvn_create() as the callback + * + * segvn_create() creates the new segment and calls VOP_ADDMAP() + * + * zfs_addmap() updates z_mapcnt + */ +/*ARGSUSED*/ +static int +zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + segvn_crargs_t vn_a; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((prot & PROT_WRITE) && + (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY | + ZFS_APPENDONLY))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((prot & (PROT_READ | PROT_EXEC)) && + (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } + + if (vp->v_flag & VNOMAP) { + ZFS_EXIT(zfsvfs); + return (ENOSYS); + } + + if (off < 0 || len > MAXOFFSET_T - off) { + ZFS_EXIT(zfsvfs); + return (ENXIO); + } + + if (vp->v_type != VREG) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + + /* + * If file is locked, disallow mapping. + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); + } + + as_rangelock(as); + error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (error != 0) { + as_rangeunlock(as); + ZFS_EXIT(zfsvfs); + return (error); + } + + vn_a.vp = vp; + vn_a.offset = (u_offset_t)off; + vn_a.type = flags & MAP_TYPE; + vn_a.prot = prot; + vn_a.maxprot = maxprot; + vn_a.cred = cr; + vn_a.amp = NULL; + vn_a.flags = flags & ~MAP_TYPE; + vn_a.szc = 0; + vn_a.lgrp_mem_policy_flags = 0; + + error = as_map(as, *addrp, len, segvn_create, &vn_a); + + as_rangeunlock(as); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +static int +zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + uint64_t pages = btopr(len); + + atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); + return (0); +} + +/* + * The reason we push dirty pages as part of zfs_delmap() is so that we get a + * more accurate mtime for the associated file. Since we don't have a way of + * detecting when the data was actually modified, we have to resort to + * heuristics. If an explicit msync() is done, then we mark the mtime when the + * last page is pushed. The problem occurs when the msync() call is omitted, + * which by far the most common case: + * + * open() + * mmap() + * <modify memory> + * munmap() + * close() + * <time lapse> + * putpage() via fsflush + * + * If we wait until fsflush to come along, we can have a modification time that + * is some arbitrary point in the future. In order to prevent this in the + * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is + * torn down. + */ +/* ARGSUSED */ +static int +zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + uint64_t pages = btopr(len); + + ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); + atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); + + if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && + vn_has_cached_data(vp)) + (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); + + return (0); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: vp - vnode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller [UNUSED]. + * ct - caller context. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated + */ +/* ARGSUSED */ +static int +zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (error = convoff(vp, bfp, 0, offset)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + gen = (uint32_t)zp->z_gen; + + size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; + if (fidp->fid_len < size) { + fidp->fid_len = size; + ZFS_EXIT(zfsvfs); + return (ENOSPC); + } + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp, *xzp; + zfsvfs_t *zfsvfs; + zfs_dirlock_t *dl; + int error; + + switch (cmd) { + case _PC_LINK_MAX: + *valp = ULONG_MAX; + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); + + case _PC_XATTR_EXISTS: + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + *valp = 0; + error = zfs_dirent_lock(&dl, zp, "", &xzp, + ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); + if (error == 0) { + zfs_dirent_unlock(dl); + if (!zfs_dirempty(xzp)) + *valp = 1; + VN_RELE(ZTOV(xzp)); + } else if (error == ENOENT) { + /* + * If there aren't extended attributes, it's the + * same as having zero of them. + */ + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_type == VREG || vp->v_type == VDIR); + return (0); + + case _PC_ACL_ENABLED: + *valp = _ACL_ACE_ENABLED; + return (0); + + case _PC_MIN_HOLE_SIZE: + *valp = (ulong_t)SPA_MINBLOCKSIZE; + return (0); + + default: + return (fs_pathconf(vp, cmd, valp, cr, ct)); + } +} + +/*ARGSUSED*/ +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +static int +zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Predeclare these here so that the compiler assumes that + * this is an "old style" function declaration that does + * not include arguments => we won't get type mismatch errors + * in the initializations that follow. + */ +static int zfs_inval(); +static int zfs_isdir(); + +static int +zfs_inval() +{ + return (EINVAL); +} + +static int +zfs_isdir() +{ + return (EISDIR); +} +/* + * Directory vnode operations template + */ +vnodeops_t *zfs_dvnodeops; +const fs_operation_def_t zfs_dvnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = zfs_open }, + VOPNAME_CLOSE, { .vop_close = zfs_close }, + VOPNAME_READ, { .error = zfs_isdir }, + VOPNAME_WRITE, { .error = zfs_isdir }, + VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, + VOPNAME_CREATE, { .vop_create = zfs_create }, + VOPNAME_REMOVE, { .vop_remove = zfs_remove }, + VOPNAME_LINK, { .vop_link = zfs_link }, + VOPNAME_RENAME, { .vop_rename = zfs_rename }, + VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, + VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_SEEK, { .vop_seek = zfs_seek }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + +/* + * Regular file vnode operations template + */ +vnodeops_t *zfs_fvnodeops; +const fs_operation_def_t zfs_fvnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = zfs_open }, + VOPNAME_CLOSE, { .vop_close = zfs_close }, + VOPNAME_READ, { .vop_read = zfs_read }, + VOPNAME_WRITE, { .vop_write = zfs_write }, + VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, + VOPNAME_RENAME, { .vop_rename = zfs_rename }, + VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_SEEK, { .vop_seek = zfs_seek }, + VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, + VOPNAME_SPACE, { .vop_space = zfs_space }, + VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, + VOPNAME_MAP, { .vop_map = zfs_map }, + VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + +/* + * Symbolic link vnode operations template + */ +vnodeops_t *zfs_symvnodeops; +const fs_operation_def_t zfs_symvnodeops_template[] = { + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_RENAME, { .vop_rename = zfs_rename }, + VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + +/* + * Extended attribute directory vnode operations template + * This template is identical to the directory vnodes + * operation template except for restricted operations: + * VOP_MKDIR() + * VOP_SYMLINK() + * Note that there are other restrictions embedded in: + * zfs_create() - restrict type to VREG + * zfs_link() - no links into/out of attribute space + * zfs_rename() - no moves into/out of attribute space + */ +vnodeops_t *zfs_xdvnodeops; +const fs_operation_def_t zfs_xdvnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = zfs_open }, + VOPNAME_CLOSE, { .vop_close = zfs_close }, + VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, + VOPNAME_CREATE, { .vop_create = zfs_create }, + VOPNAME_REMOVE, { .vop_remove = zfs_remove }, + VOPNAME_LINK, { .vop_link = zfs_link }, + VOPNAME_RENAME, { .vop_rename = zfs_rename }, + VOPNAME_MKDIR, { .error = zfs_inval }, + VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, + VOPNAME_SYMLINK, { .error = zfs_inval }, + VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_SEEK, { .vop_seek = zfs_seek }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + +/* + * Error vnode operations template + */ +vnodeops_t *zfs_evnodeops; +const fs_operation_def_t zfs_evnodeops_template[] = { + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + NULL, NULL +}; diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c new file mode 100644 index 000000000..25751ae5f --- /dev/null +++ b/module/zfs/zfs_znode.c @@ -0,0 +1,1672 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Portions Copyright 2007 Jeremy Teo */ + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/mntent.h> +#include <sys/mkdev.h> +#include <sys/u8_textprep.h> +#include <sys/dsl_dataset.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/mode.h> +#include <sys/atomic.h> +#include <vm/pvn.h> +#include "fs/fs_subr.h" +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_rlock.h> +#include <sys/zfs_fuid.h> +#include <sys/fs/zfs.h> +#include <sys/kidmap.h> +#endif /* _KERNEL */ + +#include <sys/dmu.h> +#include <sys/refcount.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/zfs_znode.h> + +#include "zfs_prop.h" + +/* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +static kmem_cache_t *znode_cache = NULL; + +/*ARGSUSED*/ +static void +znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) +{ + /* + * We should never drop all dbuf refs without first clearing + * the eviction callback. + */ + panic("evicting znode %p\n", user_ptr); +} + +/*ARGSUSED*/ +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + + zp->z_vnode = vn_alloc(kmflags); + if (zp->z_vnode == NULL) { + return (-1); + } + ZTOV(zp)->v_data = zp; + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + + mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zp->z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + + zp->z_dbuf = NULL; + zp->z_dirlocks = NULL; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT(ZTOV(zp)->v_data == zp); + vn_free(ZTOV(zp)); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_lock); + rw_destroy(&zp->z_map_lock); + rw_destroy(&zp->z_parent_lock); + rw_destroy(&zp->z_name_lock); + mutex_destroy(&zp->z_acl_lock); + avl_destroy(&zp->z_range_avl); + mutex_destroy(&zp->z_range_lock); + + ASSERT(zp->z_dbuf == NULL); + ASSERT(zp->z_dirlocks == NULL); +} + +#ifdef ZNODE_STATS +static struct { + uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_unmounted; + uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_obj_held; + uint64_t zms_vnode_locked; + uint64_t zms_not_only_dnlc; +} znode_move_stats; +#endif /* ZNODE_STATS */ + +static void +zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) +{ + vnode_t *vp; + + /* Copy fields. */ + nzp->z_zfsvfs = ozp->z_zfsvfs; + + /* Swap vnodes. */ + vp = nzp->z_vnode; + nzp->z_vnode = ozp->z_vnode; + ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ + ZTOV(ozp)->v_data = ozp; + ZTOV(nzp)->v_data = nzp; + + nzp->z_id = ozp->z_id; + ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ + ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); + nzp->z_unlinked = ozp->z_unlinked; + nzp->z_atime_dirty = ozp->z_atime_dirty; + nzp->z_zn_prefetch = ozp->z_zn_prefetch; + nzp->z_blksz = ozp->z_blksz; + nzp->z_seq = ozp->z_seq; + nzp->z_mapcnt = ozp->z_mapcnt; + nzp->z_last_itx = ozp->z_last_itx; + nzp->z_gen = ozp->z_gen; + nzp->z_sync_cnt = ozp->z_sync_cnt; + nzp->z_phys = ozp->z_phys; + nzp->z_dbuf = ozp->z_dbuf; + + /* Update back pointers. */ + (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, + znode_evict_error); + + /* + * Invalidate the original znode by clearing fields that provide a + * pointer back to the znode. Set the low bit of the vfs pointer to + * ensure that zfs_znode_move() recognizes the znode as invalid in any + * subsequent callback. + */ + ozp->z_dbuf = NULL; + POINTER_INVALIDATE(&ozp->z_zfsvfs); +} + +/* + * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise + * returns a non-zero error code. + */ +static int +zfs_enter(zfsvfs_t *zfsvfs) +{ + ZFS_ENTER(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static kmem_cbrc_t +zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + znode_t *ozp = buf, *nzp = newbuf; + zfsvfs_t *zfsvfs; + vnode_t *vp; + + /* + * The znode is on the file system's list of known znodes if the vfs + * pointer is valid. We set the low bit of the vfs pointer when freeing + * the znode to invalidate it, and the memory patterns written by kmem + * (baddcafe and deadbeef) set at least one of the two low bits. A newly + * created znode sets the vfs pointer last of all to indicate that the + * znode is known and in a valid state to be moved by this function. + */ + zfsvfs = ozp->z_zfsvfs; + if (!POINTER_IS_VALID(zfsvfs)) { + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the filesystem is not unmounted during the move. + */ + if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); + return (KMEM_CBRC_DONT_KNOW); + } + + mutex_enter(&zfsvfs->z_znodes_lock); + /* + * Recheck the vfs pointer in case the znode was removed just before + * acquiring the lock. + */ + if (zfsvfs != ozp->z_zfsvfs) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold z_znodes_lock, the + * znode cannot be freed and fields within the znode can be safely + * accessed. Now, prevent a race with zfs_zget(). + */ + if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); + return (KMEM_CBRC_LATER); + } + + vp = ZTOV(ozp); + if (mutex_tryenter(&vp->v_lock) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); + return (KMEM_CBRC_LATER); + } + + /* Only move znodes that are referenced _only_ by the DNLC. */ + if (vp->v_count != 1 || !vn_in_dnlc(vp)) { + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); + return (KMEM_CBRC_LATER); + } + + /* + * The znode is known and in a valid state to move. We're holding the + * locks needed to execute the critical section. + */ + zfs_znode_move_impl(ozp, nzp); + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + + list_link_replace(&ozp->z_link_node, &nzp->z_link_node); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + + return (KMEM_CBRC_YES); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache + */ + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + kmem_cache_set_move(znode_cache, zfs_znode_move); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup vfs & vnode ops + */ + zfs_remove_op_tables(); + + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; +} + +struct vnodeops *zfs_dvnodeops; +struct vnodeops *zfs_fvnodeops; +struct vnodeops *zfs_symvnodeops; +struct vnodeops *zfs_xdvnodeops; +struct vnodeops *zfs_evnodeops; + +void +zfs_remove_op_tables() +{ + /* + * Remove vfs ops + */ + ASSERT(zfsfstype); + (void) vfs_freevfsops_by_type(zfsfstype); + zfsfstype = 0; + + /* + * Remove vnode ops + */ + if (zfs_dvnodeops) + vn_freevnodeops(zfs_dvnodeops); + if (zfs_fvnodeops) + vn_freevnodeops(zfs_fvnodeops); + if (zfs_symvnodeops) + vn_freevnodeops(zfs_symvnodeops); + if (zfs_xdvnodeops) + vn_freevnodeops(zfs_xdvnodeops); + if (zfs_evnodeops) + vn_freevnodeops(zfs_evnodeops); + + zfs_dvnodeops = NULL; + zfs_fvnodeops = NULL; + zfs_symvnodeops = NULL; + zfs_xdvnodeops = NULL; + zfs_evnodeops = NULL; +} + +extern const fs_operation_def_t zfs_dvnodeops_template[]; +extern const fs_operation_def_t zfs_fvnodeops_template[]; +extern const fs_operation_def_t zfs_xdvnodeops_template[]; +extern const fs_operation_def_t zfs_symvnodeops_template[]; +extern const fs_operation_def_t zfs_evnodeops_template[]; + +int +zfs_create_op_tables() +{ + int error; + + /* + * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() + * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). + * In this case we just return as the ops vectors are already set up. + */ + if (zfs_dvnodeops) + return (0); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, + &zfs_dvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, + &zfs_fvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, + &zfs_symvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, + &zfs_xdvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, + &zfs_evnodeops); + + return (error); +} + +/* + * zfs_init_fs - Initialize the zfsvfs struct and the file system + * incore "master" object. Verify version compatibility. + */ +int +zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) +{ + extern int zfsfstype; + + objset_t *os = zfsvfs->z_os; + int i, error; + uint64_t fsid_guid; + uint64_t zval; + + *zpp = NULL; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error) { + return (error); + } else if (zfsvfs->z_version > ZPL_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %llu on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + return (ENOTSUP); + } + + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + return (error); + zfsvfs->z_norm = (int)zval; + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + return (error); + zfsvfs->z_utf8 = (zval != 0); + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + return (error); + zfsvfs->z_case = (uint_t)zval; + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; + zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + zfsfstype & 0xFF; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + return (error); + + /* + * Initialize zget mutex's + */ + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); + if (error) { + /* + * On error, we destroy the mutexes here since it's not + * possible for the caller to determine if the mutexes were + * initialized properly. + */ + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + return (error); + } + ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + error = 0; + + return (0); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ +#ifndef _LP64 + major_t major = (major_t)dev >> NBITSMINOR32 & MAXMAJ32; + return (((uint64_t)major << NBITSMINOR64) | + ((minor_t)dev & MAXMIN32)); +#else + return (dev); +#endif +} + +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ +#ifndef _LP64 + minor_t minor = (minor_t)dev & MAXMIN64; + major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; + + if (major > MAXMAJ32 || minor > MAXMIN32) + return (NODEV32); + + return (((dev32_t)major << NBITSMINOR32) | minor); +#else + return (dev); +#endif +} + +static void +zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) +{ + znode_t *nzp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_dbuf == NULL); + zp->z_dbuf = db; + nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); + + /* + * there should be no + * concurrent zgets on this object. + */ + if (nzp != NULL) + panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); + + /* + * Slap on VROOT if we are the root znode + */ + if (zp->z_id == zfsvfs->z_root) + ZTOV(zp)->v_flag |= VROOT; + + mutex_exit(&zp->z_lock); + vn_exists(ZTOV(zp)); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + dmu_buf_t *db = zp->z_dbuf; + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + ASSERT(zp->z_dbuf != NULL); + zp->z_dbuf = NULL; + VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); + dmu_buf_rele(db, NULL); +} + +/* + * Construct a new znode/vnode and intialize. + * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) +{ + znode_t *zp; + vnode_t *vp; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_dbuf == NULL); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. + */ + zp->z_phys = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_last_itx = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + vp = ZTOV(zp); + vn_reinit(vp); + + zfs_znode_dmu_init(zfsvfs, zp, db); + + zp->z_gen = zp->z_phys->zp_gen; + + vp->v_vfsp = zfsvfs->z_parent->z_vfs; + vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); + + switch (vp->v_type) { + case VDIR: + if (zp->z_phys->zp_flags & ZFS_XATTR) { + vn_setops(vp, zfs_xdvnodeops); + vp->v_flag |= V_XATTRDIR; + } else { + vn_setops(vp, zfs_dvnodeops); + } + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VBLK: + case VCHR: + vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev); + /*FALLTHROUGH*/ + case VFIFO: + case VSOCK: + case VDOOR: + vn_setops(vp, zfs_fvnodeops); + break; + case VREG: + vp->v_flag |= VMODSORT; + vn_setops(vp, zfs_fvnodeops); + break; + case VLNK: + vn_setops(vp, zfs_symvnodeops); + break; + default: + vn_setops(vp, zfs_evnodeops); + break; + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + VFS_HOLD(zfsvfs->z_vfs); + return (zp); +} + +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * IS_REPLAY - intent log replay + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. + * + * OUT: zpp - allocated znode + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, + zfs_fuid_info_t **fuidp) +{ + dmu_buf_t *db; + znode_phys_t *pzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + timestruc_t now; + uint64_t gen, obj; + int err; + + ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); + + if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + obj = vap->va_nodeid; + flag |= IS_REPLAY; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + } else { + obj = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + } + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be to needed allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (flag & IS_REPLAY) { + err = zap_create_claim_norm(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + ASSERT3U(err, ==, 0); + } else { + obj = zap_create_norm(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + } + } else { + if (flag & IS_REPLAY) { + err = dmu_object_claim(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + ASSERT3U(err, ==, 0); + } else { + obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + } + } + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); + dmu_buf_will_dirty(db, tx); + + /* + * Initialize the znode physical data to zero. + */ + ASSERT(db->db_size >= sizeof (znode_phys_t)); + bzero(db->db_data, db->db_size); + pzp = db->db_data; + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_dbuf = db; + dzp->z_phys = pzp; + dzp->z_id = obj; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp->z_phys->zp_flags & ZFS_XATTR) + flag |= IS_XATTR; + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + pzp->zp_rdev = zfs_expldev(vap->va_rdev); + } + + if (zfsvfs->z_use_fuids) + pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + + if (vap->va_type == VDIR) { + pzp->zp_size = 2; /* contents ("." and "..") */ + pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } + + pzp->zp_parent = dzp->z_id; + if (flag & IS_XATTR) + pzp->zp_flags |= ZFS_XATTR; + + pzp->zp_gen = gen; + + ZFS_TIME_ENCODE(&now, pzp->zp_crtime); + ZFS_TIME_ENCODE(&now, pzp->zp_ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + } else { + ZFS_TIME_ENCODE(&now, pzp->zp_atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + } else { + ZFS_TIME_ENCODE(&now, pzp->zp_mtime); + } + + pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); + if (!(flag & IS_ROOT_NODE)) { + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + *zpp = zfs_znode_alloc(zfsvfs, db, 0); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + } + zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); +} + +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)); + zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + int err; + + *zpp = NULL; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EINVAL); + } + + zp = dmu_buf_get_user(db); + if (zp != NULL) { + mutex_enter(&zp->z_lock); + + /* + * Since we do immediate eviction of the z_dbuf, we + * should never find a dbuf with a znode that doesn't + * know about the dbuf. + */ + ASSERT3P(zp->z_dbuf, ==, db); + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_unlinked) { + err = ENOENT; + } else { + VN_HOLD(ZTOV(zp)); + *zpp = zp; + err = 0; + } + dmu_buf_rele(db, NULL); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + /* + * Not found create new znode/vnode + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + *zpp = zp; + return (0); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + uint64_t obj_num = zp->z_id; + int err; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EINVAL); + } + + if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EIO); + } + + zfs_znode_dmu_init(zfsvfs, zp, db); + zp->z_unlinked = (zp->z_phys->zp_links == 0); + zp->z_blksz = doi.doi_data_block_size; + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); +} + +void +zfs_zinactive(znode_t *zp) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT(zp->z_dbuf && zp->z_phys); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + mutex_enter(&vp->v_lock); + vp->v_count--; + if (vp->v_count > 0 || vn_has_cached_data(vp)) { + /* + * If the hold count is greater than zero, somebody has + * obtained a new reference on this znode while we were + * processing it here, so we are done. If we still have + * mapped pages then we are also done, since we don't + * want to inactivate the znode until the pages get pushed. + * + * XXX - if vn_has_cached_data(vp) is true, but count == 0, + * this seems like it would leave the znode hanging with + * no chance to go inactive... + */ + mutex_exit(&vp->v_lock); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + return; + } + mutex_exit(&vp->v_lock); + + /* + * If this was the last reference to a file with no links, + * remove the file from the file system. + */ + if (zp->z_unlinked) { + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_rmnode(zp); + return; + } + mutex_exit(&zp->z_lock); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_znode_free(zp); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + vn_invalid(ZTOV(zp)); + + ASSERT(ZTOV(zp)->v_count == 0); + + mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); + mutex_exit(&zfsvfs->z_znodes_lock); + + kmem_cache_free(znode_cache, zp); + + VFS_RELE(zfsvfs->z_vfs); +} + +void +zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) +{ + timestruc_t now; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + + gethrestime(&now); + + if (tx) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); + + if (flag & AT_MTIME) { + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); + } + + if (flag & AT_CTIME) { + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_phys->zp_flags |= ZFS_ARCHIVE; + } +} + +/* + * Update the requested znode timestamps with the current time. + * If we are in a transaction, then go ahead and mark the znode + * dirty in the transaction so the timestamps will go to disk. + * Otherwise, we will get pushed next time the znode is updated + * in a transaction, or when this znode eventually goes inactive. + * + * Why is this OK? + * 1 - Only the ACCESS time is ever updated outside of a transaction. + * 2 - Multiple consecutive updates will be collapsed into a single + * znode update by the transaction grouping semantics of the DMU. + */ +void +zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) +{ + mutex_enter(&zp->z_lock); + zfs_time_stamper_locked(zp, flag, tx); + mutex_exit(&zp->z_lock); +} + +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + if (error == ENOTSUP) + return; + ASSERT3U(error, ==, 0); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); +} + +/* + * This is a dummy interface used when pvn_vplist_dirty() should *not* + * be calling back into the fs for a putpage(). E.g.: when truncating + * a file, the pages being "thrown away* don't need to be written out. + */ +/* ARGSUSED */ +static int +zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, + int flags, cred_t *cr) +{ + ASSERT(0); + return (0); +} + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + rl_t *rl; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } +top: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, SPA_MAXBLOCKSIZE); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_phys->zp_size = end; + + zfs_range_unlock(rl); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + rl_t *rl; + int error; + + /* + * Lock the range being freed. + */ + rl = zfs_range_lock(zp, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } + + if (off + len > zp->z_phys->zp_size) + len = zp->z_phys->zp_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + zfs_range_unlock(rl); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + rl_t *rl; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + if (error) { + zfs_range_unlock(rl); + return (error); + } +top: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + zp->z_phys->zp_size = end; + + dmu_tx_commit(tx); + + zfs_range_unlock(rl); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (vn_has_cached_data(vp)) { + page_t *pp; + uint64_t start = end & PAGEMASK; + int poff = end & PAGEOFFSET; + + if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) { + /* + * We need to zero a partial page. + */ + pagezero(pp, poff, PAGESIZE - poff); + start += PAGESIZE; + page_unlock(pp); + } + error = pvn_vplist_dirty(vp, start, zfs_no_putpage, + B_INVAL | B_TRUNC, NULL); + ASSERT(error == 0); + } + rw_exit(&zp->z_map_lock); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + int error; + + if (off > zp->z_phys->zp_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + /* + * Check for any locks in the region to be freed. + */ + if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) { + uint64_t length = (len ? len : zp->z_phys->zp_size - off); + if (error = chklock(vp, FWRITE, off, length, flag, NULL)) + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_phys->zp_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto log; + } + dmu_tx_abort(tx); + return (error); + } + + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + zfsvfs_t zfsvfs; + uint64_t moid, doid, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int error; + znode_t *rootzp = NULL; + vnode_t *vp; + vattr_t vattr; + znode_t *zp; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION; + else + version = ZPL_VERSION_FUID - 1; + error = zap_update(os, moid, ZPL_VERSION_STR, + 8, 1, &version, tx); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + version = val; + error = zap_update(os, moid, ZPL_VERSION_STR, + 8, 1, &version, tx); + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + + /* + * Create a delete queue. + */ + doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + + vp = ZTOV(rootzp); + vn_reinit(vp); + vp->v_type = VDIR; + + bzero(&zfsvfs, sizeof (zfsvfs_t)); + + zfsvfs.z_os = os; + zfsvfs.z_assign = TXG_NOWAIT; + zfsvfs.z_parent = &zfsvfs; + zfsvfs.z_version = version; + zfsvfs.z_use_fuids = USE_FUIDS(version, os); + zfsvfs.z_norm = norm; + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_zfsvfs = &zfsvfs; + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + ASSERT3P(zp, ==, rootzp); + ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + ZTOV(rootzp)->v_count = 0; + dmu_buf_rele(rootzp->z_dbuf, NULL); + rootzp->z_dbuf = NULL; + kmem_cache_free(znode_cache, rootzp); +} + +#endif /* _KERNEL */ +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. + */ +static int +zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) +{ + dmu_buf_t *db; + dmu_object_info_t doi; + znode_phys_t *zp; + int error; + + if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) + return (error); + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, FTAG); + return (EINVAL); + } + + zp = db->db_data; + *pobjp = zp->zp_parent; + *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && + S_ISDIR(zp->zp_mode); + dmu_buf_rele(db, FTAG); + + return (0); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + char *path = buf + len - 1; + int error; + + *path = '\0'; + + for (;;) { + uint64_t pobj; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir; + + if ((error = zfs_obj_to_pobj(osp, obj, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, "<xattrdir>"); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + return (error); +} diff --git a/module/zfs/zil.c b/module/zfs/zil.c new file mode 100644 index 000000000..95101882b --- /dev/null +++ b/module/zfs/zil.c @@ -0,0 +1,1735 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/arc.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/dsl_dataset.h> +#include <sys/vdev.h> +#include <sys/dmu_tx.h> + +/* + * The zfs intent log (ZIL) saves transaction records of system calls + * that change the file system in memory with enough information + * to be able to replay them. These are stored in memory until + * either the DMU transaction group (txg) commits them to the stable pool + * and they can be discarded, or they are flushed to the stable log + * (also in the pool) due to a fsync, O_DSYNC or other synchronous + * requirement. In the event of a panic or power fail then those log + * records (transactions) are replayed. + * + * There is one ZIL per file system. Its on-disk (pool) format consists + * of 3 parts: + * + * - ZIL header + * - ZIL blocks + * - ZIL records + * + * A log record holds a system call transaction. Log blocks can + * hold many log records and the blocks are chained together. + * Each ZIL block contains a block pointer (blkptr_t) to the next + * ZIL block in the chain. The ZIL header points to the first + * block in the chain. Note there is not a fixed place in the pool + * to hold blocks. They are dynamically allocated and freed as + * needed from the blocks available. Figure X shows the ZIL structure: + */ + +/* + * This global ZIL switch affects all pools + */ +int zil_disable = 0; /* disable intent logging */ + +/* + * Tunable parameter for debugging or performance analysis. Setting + * zfs_nocacheflush will cause corruption on power loss if a volatile + * out-of-order write cache is enabled. + */ +boolean_t zfs_nocacheflush = B_FALSE; + +static kmem_cache_t *zil_lwb_cache; + +static int +zil_dva_compare(const void *x1, const void *x2) +{ + const dva_t *dva1 = x1; + const dva_t *dva2 = x2; + + if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) + return (-1); + if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) + return (1); + + if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) + return (-1); + if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) + return (1); + + return (0); +} + +static void +zil_dva_tree_init(avl_tree_t *t) +{ + avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t), + offsetof(zil_dva_node_t, zn_node)); +} + +static void +zil_dva_tree_fini(avl_tree_t *t) +{ + zil_dva_node_t *zn; + void *cookie = NULL; + + while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zn, sizeof (zil_dva_node_t)); + + avl_destroy(t); +} + +static int +zil_dva_tree_add(avl_tree_t *t, dva_t *dva) +{ + zil_dva_node_t *zn; + avl_index_t where; + + if (avl_find(t, dva, &where) != NULL) + return (EEXIST); + + zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP); + zn->zn_dva = *dva; + avl_insert(t, zn, where); + + return (0); +} + +static zil_header_t * +zil_header_in_syncing_context(zilog_t *zilog) +{ + return ((zil_header_t *)zilog->zl_header); +} + +static void +zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) +{ + zio_cksum_t *zc = &bp->blk_cksum; + + zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); + zc->zc_word[ZIL_ZC_SEQ] = 1ULL; +} + +/* + * Read a log block, make sure it's valid, and byteswap it if necessary. + */ +static int +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) +{ + blkptr_t blk = *bp; + zbookmark_t zb; + uint32_t aflags = ARC_WAIT; + int error; + + zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + + *abufpp = NULL; + + /* + * We shouldn't be doing any scrubbing while we're doing log + * replay, it's OK to not lock. + */ + error = arc_read_nolock(NULL, zilog->zl_spa, &blk, + arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); + + if (error == 0) { + char *data = (*abufpp)->b_data; + uint64_t blksz = BP_GET_LSIZE(bp); + zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; + zio_cksum_t cksum = bp->blk_cksum; + + /* + * Validate the checksummed log block. + * + * Sequence numbers should be... sequential. The checksum + * verifier for the next block should be bp's checksum plus 1. + * + * Also check the log chain linkage and size used. + */ + cksum.zc_word[ZIL_ZC_SEQ]++; + + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) || + (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) { + error = ECKSUM; + } + + if (error) { + VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); + *abufpp = NULL; + } + } + + dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); + + return (error); +} + +/* + * Parse the intent log, and call parse_func for each valid record within. + * Return the highest sequence number. + */ +uint64_t +zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) +{ + const zil_header_t *zh = zilog->zl_header; + uint64_t claim_seq = zh->zh_claim_seq; + uint64_t seq = 0; + uint64_t max_seq = 0; + blkptr_t blk = zh->zh_log; + arc_buf_t *abuf; + char *lrbuf, *lrp; + zil_trailer_t *ztp; + int reclen, error; + + if (BP_IS_HOLE(&blk)) + return (max_seq); + + /* + * Starting at the block pointed to by zh_log we read the log chain. + * For each block in the chain we strongly check that block to + * ensure its validity. We stop when an invalid block is found. + * For each block pointer in the chain we call parse_blk_func(). + * For each record in each valid block we call parse_lr_func(). + * If the log has been claimed, stop if we encounter a sequence + * number greater than the highest claimed sequence number. + */ + zil_dva_tree_init(&zilog->zl_dva_tree); + for (;;) { + seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + + if (claim_seq != 0 && seq > claim_seq) + break; + + ASSERT(max_seq < seq); + max_seq = seq; + + error = zil_read_log_block(zilog, &blk, &abuf); + + if (parse_blk_func != NULL) + parse_blk_func(zilog, &blk, arg, txg); + + if (error) + break; + + lrbuf = abuf->b_data; + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + + if (parse_lr_func == NULL) { + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + continue; + } + + for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { + lr_t *lr = (lr_t *)lrp; + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + parse_lr_func(zilog, lr, arg, txg); + } + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + zil_dva_tree_fini(&zilog->zl_dva_tree); + + return (max_seq); +} + +/* ARGSUSED */ +static void +zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + spa_t *spa = zilog->zl_spa; + int err; + + /* + * Claim log block if not already committed and not already claimed. + */ + if (bp->blk_birth >= first_txg && + zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { + err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); + ASSERT(err == 0); + } +} + +static void +zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg); + } +} + +/* ARGSUSED */ +static void +zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) +{ + zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx)); +} + +static void +zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) +{ + /* + * If we previously claimed it, we need to free it. + */ + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + if (bp->blk_birth >= claim_txg && + !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) { + (void) arc_free(NULL, zilog->zl_spa, + dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT); + } + } +} + +/* + * Create an on-disk intent log. + */ +static void +zil_create(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + blkptr_t blk; + int error = 0; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + ASSERT(zh->zh_claim_txg == 0); + ASSERT(zh->zh_replay_seq == 0); + + blk = zh->zh_log; + + /* + * If we don't already have an initial log block, allocate one now. + */ + if (BP_IS_HOLE(&blk)) { + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, + NULL, txg); + + if (error == 0) + zil_init_log_chain(zilog, &blk); + } + + /* + * Allocate a log write buffer (lwb) for the first log block. + */ + if (error == 0) { + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = blk; + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + lwb->lwb_max_txg = txg; + lwb->lwb_zio = NULL; + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + } + + /* + * If we just allocated the first log block, commit our transaction + * and wait for zil_sync() to stuff the block poiner into zh_log. + * (zh is part of the MOS, so we cannot modify it in open context.) + */ + if (tx != NULL) { + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); +} + +/* + * In one tx, free all log blocks and clear the log header. + * If keep_first is set, then we're replaying a log with no content. + * We want to keep the first block, however, so that the first + * synchronous transaction doesn't require a txg_wait_synced() + * in zil_create(). We don't need to txg_wait_synced() here either + * when keep_first is set, because both zil_create() and zil_destroy() + * will wait for any in-progress destroys to complete. + */ +void +zil_destroy(zilog_t *zilog, boolean_t keep_first) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; + dmu_tx_t *tx; + uint64_t txg; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + if (BP_IS_HOLE(&zh->zh_log)) + return; + + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&zilog->zl_lock); + + /* + * It is possible for the ZIL to get the previously mounted zilog + * structure of the same dataset if quickly remounted and the dbuf + * eviction has not completed. In this case we can see a non + * empty lwb list and keep_first will be set. We fix this by + * clearing the keep_first. This will be slower but it's very rare. + */ + if (!list_is_empty(&zilog->zl_lwb_list) && keep_first) + keep_first = B_FALSE; + + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = keep_first; + + if (!list_is_empty(&zilog->zl_lwb_list)) { + ASSERT(zh->zh_claim_txg == 0); + ASSERT(!keep_first); + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg); + kmem_cache_free(zil_lwb_cache, lwb); + } + } else { + if (!keep_first) { + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zh->zh_claim_txg); + } + } + mutex_exit(&zilog->zl_lock); + + dmu_tx_commit(tx); +} + +/* + * zil_rollback_destroy() is only called by the rollback code. + * We already have a syncing tx. Rollback has exclusive access to the + * dataset, so we don't have to worry about concurrent zil access. + * The actual freeing of any log blocks occurs in zil_sync() later in + * this txg syncing phase. + */ +void +zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) +{ + const zil_header_t *zh = zilog->zl_header; + uint64_t txg; + + if (BP_IS_HOLE(&zh->zh_log)) + return; + + txg = dmu_tx_get_txg(tx); + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = B_FALSE; + + /* + * Ensure there's no outstanding ZIL IO. No lwbs or just the + * unused one that allocated in advance is ok. + */ + ASSERT(zilog->zl_lwb_list.list_head.list_next == + zilog->zl_lwb_list.list_head.list_prev); + (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, + tx, zh->zh_claim_txg); +} + +int +zil_claim(char *osname, void *txarg) +{ + dmu_tx_t *tx = txarg; + uint64_t first_txg = dmu_tx_get_txg(tx); + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } + + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + + /* + * Claim all log blocks if we haven't already done so, and remember + * the highest claimed sequence number. This ensures that if we can + * read only part of the log now (e.g. due to a missing device), + * but we can read the entire log later, we will not try to replay + * or destroy beyond the last block we successfully claimed. + */ + ASSERT3U(zh->zh_claim_txg, <=, first_txg); + if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { + zh->zh_claim_txg = first_txg; + zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, + zil_claim_log_record, tx, first_txg); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + + ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); + dmu_objset_close(os); + return (0); +} + +/* + * Check the log by walking the log chain. + * Checksum errors are ok as they indicate the end of the chain. + * Any other error (no device or read failure) returns an error. + */ +/* ARGSUSED */ +int +zil_check_log_chain(char *osname, void *txarg) +{ + zilog_t *zilog; + zil_header_t *zh; + blkptr_t blk; + arc_buf_t *abuf; + objset_t *os; + char *lrbuf; + zil_trailer_t *ztp; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } + + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + blk = zh->zh_log; + if (BP_IS_HOLE(&blk)) { + dmu_objset_close(os); + return (0); /* no chain */ + } + + for (;;) { + error = zil_read_log_block(zilog, &blk, &abuf); + if (error) + break; + lrbuf = abuf->b_data; + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + dmu_objset_close(os); + if (error == ECKSUM) + return (0); /* normal end of chain */ + return (error); +} + +/* + * Clear a log chain + */ +/* ARGSUSED */ +int +zil_clear_log_chain(char *osname, void *txarg) +{ + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + dmu_tx_t *tx; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } + + zilog = dmu_objset_zil(os); + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + zh = zil_header_in_syncing_context(zilog); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_tx_commit(tx); + dmu_objset_close(os); + return (0); +} + +static int +zil_vdev_compare(const void *x1, const void *x2) +{ + uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; + uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; + + if (v1 < v2) + return (-1); + if (v1 > v2) + return (1); + + return (0); +} + +void +zil_add_block(zilog_t *zilog, blkptr_t *bp) +{ + avl_tree_t *t = &zilog->zl_vdev_tree; + avl_index_t where; + zil_vdev_node_t *zv, zvsearch; + int ndvas = BP_GET_NDVAS(bp); + int i; + + if (zfs_nocacheflush) + return; + + ASSERT(zilog->zl_writer); + + /* + * Even though we're zl_writer, we still need a lock because the + * zl_get_data() callbacks may have dmu_sync() done callbacks + * that will run concurrently. + */ + mutex_enter(&zilog->zl_vdev_lock); + for (i = 0; i < ndvas; i++) { + zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + if (avl_find(t, &zvsearch, &where) == NULL) { + zv = kmem_alloc(sizeof (*zv), KM_SLEEP); + zv->zv_vdev = zvsearch.zv_vdev; + avl_insert(t, zv, where); + } + } + mutex_exit(&zilog->zl_vdev_lock); +} + +void +zil_flush_vdevs(zilog_t *zilog) +{ + spa_t *spa = zilog->zl_spa; + avl_tree_t *t = &zilog->zl_vdev_tree; + void *cookie = NULL; + zil_vdev_node_t *zv; + zio_t *zio; + + ASSERT(zilog->zl_writer); + + /* + * We don't need zl_vdev_lock here because we're the zl_writer, + * and all zl_get_data() callbacks are done. + */ + if (avl_numnodes(t) == 0) + return; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { + vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); + if (vd != NULL) + zio_flush(zio, vd); + kmem_free(zv, sizeof (*zv)); + } + + /* + * Wait for all the flushes to complete. Not all devices actually + * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. + */ + (void) zio_wait(zio); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + +/* + * Function called when a log block write completes + */ +static void +zil_lwb_write_done(zio_t *zio) +{ + lwb_t *lwb = zio->io_private; + zilog_t *zilog = lwb->lwb_zilog; + + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG); + ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); + ASSERT(!BP_IS_GANG(zio->io_bp)); + ASSERT(!BP_IS_HOLE(zio->io_bp)); + ASSERT(zio->io_bp->blk_fill == 0); + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + txg_rele_to_sync(&lwb->lwb_txgh); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + mutex_enter(&zilog->zl_lock); + lwb->lwb_buf = NULL; + if (zio->io_error) + zilog->zl_log_error = B_TRUE; + mutex_exit(&zilog->zl_lock); +} + +/* + * Initialize the io for a log block. + */ +static void +zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) +{ + zbookmark_t zb; + + zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + + if (zilog->zl_root_zio == NULL) { + zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } + if (lwb->lwb_zio == NULL) { + lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, + 0, &lwb->lwb_blk, lwb->lwb_buf, + lwb->lwb_sz, zil_lwb_write_done, lwb, + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); + } +} + +/* + * Start a log block write and advance to the next log block. + * Calls are serialized. + */ +static lwb_t * +zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) +{ + lwb_t *nlwb; + zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; + spa_t *spa = zilog->zl_spa; + blkptr_t *bp = &ztp->zit_next_blk; + uint64_t txg; + uint64_t zil_blksz; + int error; + + ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); + + /* + * Allocate the next block and save its address in this block + * before writing it in order to establish the log chain. + * Note that if the allocation of nlwb synced before we wrote + * the block that points at it (lwb), we'd leak it if we crashed. + * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done(). + */ + txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh); + txg_rele_to_quiesce(&lwb->lwb_txgh); + + /* + * Pick a ZIL blocksize. We request a size that is the + * maximum of the previous used size, the current used size and + * the amount waiting in the queue. + */ + zil_blksz = MAX(zilog->zl_prev_used, + zilog->zl_cur_used + sizeof (*ztp)); + zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp)); + zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t); + if (zil_blksz > ZIL_MAX_BLKSZ) + zil_blksz = ZIL_MAX_BLKSZ; + + BP_ZERO(bp); + /* pass the old blkptr in order to spread log blocks across devs */ + error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg); + if (error) { + dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg); + + /* + * We dirty the dataset to ensure that zil_sync() will + * be called to remove this lwb from our zl_lwb_list. + * Failing to do so, may leave an lwb with a NULL lwb_buf + * hanging around on the zl_lwb_list. + */ + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + dmu_tx_commit(tx); + + /* + * Since we've just experienced an allocation failure so we + * terminate the current lwb and send it on its way. + */ + ztp->zit_pad = 0; + ztp->zit_nused = lwb->lwb_nused; + ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; + zio_nowait(lwb->lwb_zio); + + /* + * By returning NULL the caller will call tx_wait_synced() + */ + return (NULL); + } + + ASSERT3U(bp->blk_birth, ==, txg); + ztp->zit_pad = 0; + ztp->zit_nused = lwb->lwb_nused; + ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + + /* + * Allocate a new log write buffer (lwb). + */ + nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + + nlwb->lwb_zilog = zilog; + nlwb->lwb_blk = *bp; + nlwb->lwb_nused = 0; + nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); + nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); + nlwb->lwb_max_txg = txg; + nlwb->lwb_zio = NULL; + + /* + * Put new lwb at the end of the log chain + */ + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, nlwb); + mutex_exit(&zilog->zl_lock); + + /* Record the block for later vdev flushing */ + zil_add_block(zilog, &lwb->lwb_blk); + + /* + * kick off the write for the old log block + */ + dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); + ASSERT(lwb->lwb_zio); + zio_nowait(lwb->lwb_zio); + + return (nlwb); +} + +static lwb_t * +zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +{ + lr_t *lrc = &itx->itx_lr; /* common log record */ + lr_write_t *lr = (lr_write_t *)lrc; + uint64_t txg = lrc->lrc_txg; + uint64_t reclen = lrc->lrc_reclen; + uint64_t dlen; + + if (lwb == NULL) + return (NULL); + ASSERT(lwb->lwb_buf != NULL); + + if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) + dlen = P2ROUNDUP_TYPED( + lr->lr_length, sizeof (uint64_t), uint64_t); + else + dlen = 0; + + zilog->zl_cur_used += (reclen + dlen); + + zil_lwb_write_init(zilog, lwb); + + /* + * If this record won't fit in the current log block, start a new one. + */ + if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + lwb = zil_lwb_write_start(zilog, lwb); + if (lwb == NULL) + return (NULL); + zil_lwb_write_init(zilog, lwb); + ASSERT(lwb->lwb_nused == 0); + if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } + } + + /* + * Update the lrc_seq, to be log record sequence number. See zil.h + * Then copy the record to the log buffer. + */ + lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ + bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen); + + /* + * If it's a write, fetch the data or get its blkptr as appropriate. + */ + if (lrc->lrc_txtype == TX_WRITE) { + if (txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, txg); + if (itx->itx_wr_state != WR_COPIED) { + char *dbuf; + int error; + + /* alignment is guaranteed */ + lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused); + if (dlen) { + ASSERT(itx->itx_wr_state == WR_NEED_COPY); + dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen; + lr->lr_common.lrc_reclen += dlen; + } else { + ASSERT(itx->itx_wr_state == WR_INDIRECT); + dbuf = NULL; + } + error = zilog->zl_get_data( + itx->itx_private, lr, dbuf, lwb->lwb_zio); + if (error) { + ASSERT(error == ENOENT || error == EEXIST || + error == EALREADY); + return (lwb); + } + } + } + + lwb->lwb_nused += reclen + dlen; + lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); + ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb)); + ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); + + return (lwb); +} + +itx_t * +zil_itx_create(uint64_t txtype, size_t lrsize) +{ + itx_t *itx; + + lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); + + itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); + itx->itx_lr.lrc_txtype = txtype; + itx->itx_lr.lrc_reclen = lrsize; + itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ + itx->itx_lr.lrc_seq = 0; /* defensive */ + + return (itx); +} + +uint64_t +zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +{ + uint64_t seq; + + ASSERT(itx->itx_lr.lrc_seq == 0); + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz += itx->itx_sod; + itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; + mutex_exit(&zilog->zl_lock); + + return (seq); +} + +/* + * Free up all in-memory intent log transactions that have now been synced. + */ +static void +zil_itx_clean(zilog_t *zilog) +{ + uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); + uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); + list_t clean_list; + itx_t *itx; + + list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); + + mutex_enter(&zilog->zl_lock); + /* wait for a log writer to finish walking list */ + while (zilog->zl_writer) { + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + } + + /* + * Move the sync'd log transactions to a separate list so we can call + * kmem_free without holding the zl_lock. + * + * There is no need to set zl_writer as we don't drop zl_lock here + */ + while ((itx = list_head(&zilog->zl_itx_list)) != NULL && + itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { + list_remove(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz -= itx->itx_sod; + list_insert_tail(&clean_list, itx); + } + cv_broadcast(&zilog->zl_cv_writer); + mutex_exit(&zilog->zl_lock); + + /* destroy sync'd log transactions */ + while ((itx = list_head(&clean_list)) != NULL) { + list_remove(&clean_list, itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + } + list_destroy(&clean_list); +} + +/* + * If there are any in-memory intent log transactions which have now been + * synced then start up a taskq to free them. + */ +void +zil_clean(zilog_t *zilog) +{ + itx_t *itx; + + mutex_enter(&zilog->zl_lock); + itx = list_head(&zilog->zl_itx_list); + if ((itx != NULL) && + (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { + (void) taskq_dispatch(zilog->zl_clean_taskq, + (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + } + mutex_exit(&zilog->zl_lock); +} + +static void +zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) +{ + uint64_t txg; + uint64_t commit_seq = 0; + itx_t *itx, *itx_next = (itx_t *)-1; + lwb_t *lwb; + spa_t *spa; + + zilog->zl_writer = B_TRUE; + ASSERT(zilog->zl_root_zio == NULL); + spa = zilog->zl_spa; + + if (zilog->zl_suspend) { + lwb = NULL; + } else { + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb == NULL) { + /* + * Return if there's nothing to flush before we + * dirty the fs by calling zil_create() + */ + if (list_is_empty(&zilog->zl_itx_list)) { + zilog->zl_writer = B_FALSE; + return; + } + mutex_exit(&zilog->zl_lock); + zil_create(zilog); + mutex_enter(&zilog->zl_lock); + lwb = list_tail(&zilog->zl_lwb_list); + } + } + + /* Loop through in-memory log transactions filling log blocks. */ + DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); + for (;;) { + /* + * Find the next itx to push: + * Push all transactions related to specified foid and all + * other transactions except TX_WRITE, TX_TRUNCATE, + * TX_SETATTR and TX_ACL for all other files. + */ + if (itx_next != (itx_t *)-1) + itx = itx_next; + else + itx = list_head(&zilog->zl_itx_list); + for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) { + if (foid == 0) /* push all foids? */ + break; + if (itx->itx_sync) /* push all O_[D]SYNC */ + break; + switch (itx->itx_lr.lrc_txtype) { + case TX_SETATTR: + case TX_WRITE: + case TX_TRUNCATE: + case TX_ACL: + /* lr_foid is same offset for these records */ + if (((lr_write_t *)&itx->itx_lr)->lr_foid + != foid) { + continue; /* skip this record */ + } + } + break; + } + if (itx == NULL) + break; + + if ((itx->itx_lr.lrc_seq > seq) && + ((lwb == NULL) || (lwb->lwb_nused == 0) || + (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) { + break; + } + + /* + * Save the next pointer. Even though we soon drop + * zl_lock all threads that may change the list + * (another writer or zil_itx_clean) can't do so until + * they have zl_writer. + */ + itx_next = list_next(&zilog->zl_itx_list, itx); + list_remove(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz -= itx->itx_sod; + mutex_exit(&zilog->zl_lock); + txg = itx->itx_lr.lrc_txg; + ASSERT(txg); + + if (txg > spa_last_synced_txg(spa) || + txg > spa_freeze_txg(spa)) + lwb = zil_lwb_commit(zilog, itx, lwb); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + mutex_enter(&zilog->zl_lock); + } + DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); + /* determine commit sequence number */ + itx = list_head(&zilog->zl_itx_list); + if (itx) + commit_seq = itx->itx_lr.lrc_seq; + else + commit_seq = zilog->zl_itx_seq; + mutex_exit(&zilog->zl_lock); + + /* write the last block out */ + if (lwb != NULL && lwb->lwb_zio != NULL) + lwb = zil_lwb_write_start(zilog, lwb); + + zilog->zl_prev_used = zilog->zl_cur_used; + zilog->zl_cur_used = 0; + + /* + * Wait if necessary for the log blocks to be on stable storage. + */ + if (zilog->zl_root_zio) { + DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); + (void) zio_wait(zilog->zl_root_zio); + zilog->zl_root_zio = NULL; + DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); + zil_flush_vdevs(zilog); + } + + if (zilog->zl_log_error || lwb == NULL) { + zilog->zl_log_error = 0; + txg_wait_synced(zilog->zl_dmu_pool, 0); + } + + mutex_enter(&zilog->zl_lock); + zilog->zl_writer = B_FALSE; + + ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); + zilog->zl_commit_seq = commit_seq; +} + +/* + * Push zfs transactions to stable storage up to the supplied sequence number. + * If foid is 0 push out all transactions, otherwise push only those + * for that file or might have been used to create that file. + */ +void +zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) +{ + if (zilog == NULL || seq == 0) + return; + + mutex_enter(&zilog->zl_lock); + + seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */ + + while (zilog->zl_writer) { + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + if (seq < zilog->zl_commit_seq) { + mutex_exit(&zilog->zl_lock); + return; + } + } + zil_commit_writer(zilog, seq, foid); /* drops zl_lock */ + /* wake up others waiting on the commit */ + cv_broadcast(&zilog->zl_cv_writer); + mutex_exit(&zilog->zl_lock); +} + +/* + * Called in syncing context to free committed log blocks and update log header. + */ +void +zil_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = zilog->zl_spa; + lwb_t *lwb; + + mutex_enter(&zilog->zl_lock); + + ASSERT(zilog->zl_stop_sync == 0); + + zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + + if (zilog->zl_destroy_txg == txg) { + blkptr_t blk = zh->zh_log; + + ASSERT(list_head(&zilog->zl_lwb_list) == NULL); + ASSERT(spa_sync_pass(spa) == 1); + + bzero(zh, sizeof (zil_header_t)); + bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + + if (zilog->zl_keep_first) { + /* + * If this block was part of log chain that couldn't + * be claimed because a device was missing during + * zil_claim(), but that device later returns, + * then this block could erroneously appear valid. + * To guard against this, assign a new GUID to the new + * log chain so it doesn't matter what blk points to. + */ + zil_init_log_chain(zilog, &blk); + zh->zh_log = blk; + } + } + + for (;;) { + lwb = list_head(&zilog->zl_lwb_list); + if (lwb == NULL) { + mutex_exit(&zilog->zl_lock); + return; + } + zh->zh_log = lwb->lwb_blk; + if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + break; + list_remove(&zilog->zl_lwb_list, lwb); + zio_free_blk(spa, &lwb->lwb_blk, txg); + kmem_cache_free(zil_lwb_cache, lwb); + + /* + * If we don't have anything left in the lwb list then + * we've had an allocation failure and we need to zero + * out the zil_header blkptr so that we don't end + * up freeing the same block twice. + */ + if (list_head(&zilog->zl_lwb_list) == NULL) + BP_ZERO(&zh->zh_log); + } + mutex_exit(&zilog->zl_lock); +} + +void +zil_init(void) +{ + zil_lwb_cache = kmem_cache_create("zil_lwb_cache", + sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +zil_fini(void) +{ + kmem_cache_destroy(zil_lwb_cache); +} + +zilog_t * +zil_alloc(objset_t *os, zil_header_t *zh_phys) +{ + zilog_t *zilog; + + zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); + + zilog->zl_header = zh_phys; + zilog->zl_os = os; + zilog->zl_spa = dmu_objset_spa(os); + zilog->zl_dmu_pool = dmu_objset_pool(os); + zilog->zl_destroy_txg = TXG_INITIAL - 1; + + mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&zilog->zl_itx_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + + list_create(&zilog->zl_lwb_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); + + mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, + sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + + cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + + return (zilog); +} + +void +zil_free(zilog_t *zilog) +{ + lwb_t *lwb; + + zilog->zl_stop_sync = 1; + + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + kmem_cache_free(zil_lwb_cache, lwb); + } + list_destroy(&zilog->zl_lwb_list); + + avl_destroy(&zilog->zl_vdev_tree); + mutex_destroy(&zilog->zl_vdev_lock); + + ASSERT(list_head(&zilog->zl_itx_list) == NULL); + list_destroy(&zilog->zl_itx_list); + mutex_destroy(&zilog->zl_lock); + + cv_destroy(&zilog->zl_cv_writer); + cv_destroy(&zilog->zl_cv_suspend); + + kmem_free(zilog, sizeof (zilog_t)); +} + +/* + * return true if the initial log block is not valid + */ +static boolean_t +zil_empty(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + arc_buf_t *abuf = NULL; + + if (BP_IS_HOLE(&zh->zh_log)) + return (B_TRUE); + + if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) + return (B_TRUE); + + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + return (B_FALSE); +} + +/* + * Open an intent log. + */ +zilog_t * +zil_open(objset_t *os, zil_get_data_t *get_data) +{ + zilog_t *zilog = dmu_objset_zil(os); + + zilog->zl_get_data = get_data; + zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, + 2, 2, TASKQ_PREPOPULATE); + + return (zilog); +} + +/* + * Close an intent log. + */ +void +zil_close(zilog_t *zilog) +{ + /* + * If the log isn't already committed, mark the objset dirty + * (so zil_sync() will be called) and wait for that txg to sync. + */ + if (!zil_is_committed(zilog)) { + uint64_t txg; + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + taskq_destroy(zilog->zl_clean_taskq); + zilog->zl_clean_taskq = NULL; + zilog->zl_get_data = NULL; + + zil_itx_clean(zilog); + ASSERT(list_head(&zilog->zl_itx_list) == NULL); +} + +/* + * Suspend an intent log. While in suspended mode, we still honor + * synchronous semantics, but we rely on txg_wait_synced() to do it. + * We suspend the log briefly when taking a snapshot so that the snapshot + * contains all the data it's supposed to, and has an empty intent log. + */ +int +zil_suspend(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + + mutex_enter(&zilog->zl_lock); + if (zh->zh_claim_txg != 0) { /* unplayed log */ + mutex_exit(&zilog->zl_lock); + return (EBUSY); + } + if (zilog->zl_suspend++ != 0) { + /* + * Someone else already began a suspend. + * Just wait for them to finish. + */ + while (zilog->zl_suspending) + cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); + mutex_exit(&zilog->zl_lock); + return (0); + } + zilog->zl_suspending = B_TRUE; + mutex_exit(&zilog->zl_lock); + + zil_commit(zilog, UINT64_MAX, 0); + + /* + * Wait for any in-flight log writes to complete. + */ + mutex_enter(&zilog->zl_lock); + while (zilog->zl_writer) + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + mutex_exit(&zilog->zl_lock); + + zil_destroy(zilog, B_FALSE); + + mutex_enter(&zilog->zl_lock); + zilog->zl_suspending = B_FALSE; + cv_broadcast(&zilog->zl_cv_suspend); + mutex_exit(&zilog->zl_lock); + + return (0); +} + +void +zil_resume(zilog_t *zilog) +{ + mutex_enter(&zilog->zl_lock); + ASSERT(zilog->zl_suspend != 0); + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); +} + +typedef struct zil_replay_arg { + objset_t *zr_os; + zil_replay_func_t **zr_replay; + zil_replay_cleaner_t *zr_replay_cleaner; + void *zr_arg; + uint64_t *zr_txgp; + boolean_t zr_byteswap; + char *zr_lrbuf; +} zil_replay_arg_t; + +static void +zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) +{ + zil_replay_arg_t *zr = zra; + const zil_header_t *zh = zilog->zl_header; + uint64_t reclen = lr->lrc_reclen; + uint64_t txtype = lr->lrc_txtype; + char *name; + int pass, error, sunk; + + if (zilog->zl_stop_replay) + return; + + if (lr->lrc_txg < claim_txg) /* already committed */ + return; + + if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ + return; + + /* Strip case-insensitive bit, still present in log record */ + txtype &= ~TX_CI; + + /* + * Make a copy of the data so we can revise and extend it. + */ + bcopy(lr, zr->zr_lrbuf, reclen); + + /* + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different data types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. + */ + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lrbuf, reclen); + + /* + * If this is a TX_WRITE with a blkptr, suck in the data. + */ + if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { + lr_write_t *lrw = (lr_write_t *)lr; + blkptr_t *wbp = &lrw->lr_blkptr; + uint64_t wlen = lrw->lr_length; + char *wbuf = zr->zr_lrbuf + reclen; + + if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ + bzero(wbuf, wlen); + } else { + /* + * A subsequent write may have overwritten this block, + * in which case wbp may have been been freed and + * reallocated, and our read of wbp may fail with a + * checksum error. We can safely ignore this because + * the later write will provide the correct data. + */ + zbookmark_t zb; + + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lrw->lr_foid; + zb.zb_level = -1; + zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); + + (void) zio_wait(zio_read(NULL, zilog->zl_spa, + wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); + (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); + } + } + + /* + * Replay of large truncates can end up needing additional txs + * and a different txg. If they are nested within the replay tx + * as below then a hang is possible. So we do the truncate here + * and redo the truncate later (a no-op) and update the sequence + * number whilst in the replay tx. Fortunately, it's safe to repeat + * a truncate if we crash and the truncate commits. A create over + * an existing file will also come in as a TX_TRUNCATE record. + * + * Note, remove of large files and renames over large files is + * handled by putting the deleted object on a stable list + * and if necessary force deleting the object outside of the replay + * transaction using the zr_replay_cleaner. + */ + if (txtype == TX_TRUNCATE) { + *zr->zr_txgp = TXG_NOWAIT; + error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap); + if (error) + goto bad; + zr->zr_byteswap = 0; /* only byteswap once */ + } + + /* + * We must now do two things atomically: replay this log record, + * and update the log header to reflect the fact that we did so. + * We use the DMU's ability to assign into a specific txg to do this. + */ + for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { + uint64_t replay_txg; + dmu_tx_t *replay_tx; + + replay_tx = dmu_tx_create(zr->zr_os); + error = dmu_tx_assign(replay_tx, TXG_WAIT); + if (error) { + dmu_tx_abort(replay_tx); + break; + } + + replay_txg = dmu_tx_get_txg(replay_tx); + + if (txtype == 0 || txtype >= TX_MAX_TYPE) { + error = EINVAL; + } else { + /* + * On the first pass, arrange for the replay vector + * to fail its dmu_tx_assign(). That's the only way + * to ensure that those code paths remain well tested. + * + * Only byteswap (if needed) on the 1st pass. + */ + *zr->zr_txgp = replay_txg - (pass == 1); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap && pass == 1); + *zr->zr_txgp = TXG_NOWAIT; + } + + if (error == 0) { + dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); + zilog->zl_replay_seq[replay_txg & TXG_MASK] = + lr->lrc_seq; + } + + dmu_tx_commit(replay_tx); + + if (!error) + return; + + /* + * The DMU's dnode layer doesn't see removes until the txg + * commits, so a subsequent claim can spuriously fail with + * EEXIST. So if we receive any error other than ERESTART + * we try syncing out any removes then retrying the + * transaction. + */ + if (error != ERESTART && !sunk) { + if (zr->zr_replay_cleaner) + zr->zr_replay_cleaner(zr->zr_arg); + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + sunk = B_TRUE; + continue; /* retry */ + } + + if (error != ERESTART) + break; + + if (pass != 1) + txg_wait_open(spa_get_dsl(zilog->zl_spa), + replay_txg + 1); + + dprintf("pass %d, retrying\n", pass); + } + +bad: + ASSERT(error && error != ERESTART); + name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + dmu_objset_name(zr->zr_os, name); + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu %s\n", + error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, + (lr->lrc_txtype & TX_CI) ? "CI" : ""); + zilog->zl_stop_replay = 1; + kmem_free(name, MAXNAMELEN); +} + +/* ARGSUSED */ +static void +zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zilog->zl_replay_blks++; +} + +/* + * If this dataset has a non-empty intent log, replay it and destroy it. + */ +void +zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner) +{ + zilog_t *zilog = dmu_objset_zil(os); + const zil_header_t *zh = zilog->zl_header; + zil_replay_arg_t zr; + + if (zil_empty(zilog)) { + zil_destroy(zilog, B_TRUE); + return; + } + + zr.zr_os = os; + zr.zr_replay = replay_func; + zr.zr_replay_cleaner = replay_cleaner; + zr.zr_arg = arg; + zr.zr_txgp = txgp; + zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); + zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + + /* + * Wait for in-progress removes to sync before starting replay. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zilog->zl_stop_replay = 0; + zilog->zl_replay_time = lbolt; + ASSERT(zilog->zl_replay_blks == 0); + (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, + zh->zh_claim_txg); + kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); + + zil_destroy(zilog, B_FALSE); + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); +} + +/* + * Report whether all transactions are committed + */ +int +zil_is_committed(zilog_t *zilog) +{ + lwb_t *lwb; + int ret; + + mutex_enter(&zilog->zl_lock); + while (zilog->zl_writer) + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + + /* recent unpushed intent log transactions? */ + if (!list_is_empty(&zilog->zl_itx_list)) { + ret = B_FALSE; + goto out; + } + + /* intent log never used? */ + lwb = list_head(&zilog->zl_lwb_list); + if (lwb == NULL) { + ret = B_TRUE; + goto out; + } + + /* + * more than 1 log buffer means zil_sync() hasn't yet freed + * entries after a txg has committed + */ + if (list_next(&zilog->zl_lwb_list, lwb)) { + ret = B_FALSE; + goto out; + } + + ASSERT(zil_empty(zilog)); + ret = B_TRUE; +out: + cv_broadcast(&zilog->zl_cv_writer); + mutex_exit(&zilog->zl_lock); + return (ret); +} diff --git a/module/zfs/zio.c b/module/zfs/zio.c new file mode 100644 index 000000000..d347920ea --- /dev/null +++ b/module/zfs/zio.c @@ -0,0 +1,2273 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zio_impl.h> +#include <sys/zio_compress.h> +#include <sys/zio_checksum.h> + +/* + * ========================================================================== + * I/O priority table + * ========================================================================== + */ +uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { + 0, /* ZIO_PRIORITY_NOW */ + 0, /* ZIO_PRIORITY_SYNC_READ */ + 0, /* ZIO_PRIORITY_SYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 4, /* ZIO_PRIORITY_FREE */ + 0, /* ZIO_PRIORITY_CACHE_FILL */ + 0, /* ZIO_PRIORITY_LOG_WRITE */ + 10, /* ZIO_PRIORITY_RESILVER */ + 20, /* ZIO_PRIORITY_SCRUB */ +}; + +/* + * ========================================================================== + * I/O type descriptions + * ========================================================================== + */ +char *zio_type_name[ZIO_TYPES] = { + "null", "read", "write", "free", "claim", "ioctl" }; + +#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + +/* + * ========================================================================== + * I/O kmem caches + * ========================================================================== + */ +kmem_cache_t *zio_cache; +kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; + +#ifdef _KERNEL +extern vmem_t *zio_alloc_arena; +#endif + +/* + * An allocating zio is one that either currently has the DVA allocate + * stage set or will have it later in its lifetime. + */ +#define IO_IS_ALLOCATING(zio) \ + ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) + +void +zio_init(void) +{ + size_t c; + vmem_t *data_alloc_arena = NULL; + +#ifdef _KERNEL + data_alloc_arena = zio_alloc_arena; +#endif + zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache + * for each quarter-power of 2. For large buffers, we want + * a cache for each multiple of PAGESIZE. + */ + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c + 1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + + while (p2 & (p2 - 1)) + p2 &= p2 - 1; + + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (P2PHASE(size, PAGESIZE) == 0) { + align = PAGESIZE; + } else if (P2PHASE(size, p2 >> 2) == 0) { + align = p2 >> 2; + } + + if (align != 0) { + char name[36]; + (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); + + (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, data_alloc_arena, + KMC_NODEBUG); + } + } + + while (--c != 0) { + ASSERT(zio_buf_cache[c] != NULL); + if (zio_buf_cache[c - 1] == NULL) + zio_buf_cache[c - 1] = zio_buf_cache[c]; + + ASSERT(zio_data_buf_cache[c] != NULL); + if (zio_data_buf_cache[c - 1] == NULL) + zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; + } + + zio_inject_init(); +} + +void +zio_fini(void) +{ + size_t c; + kmem_cache_t *last_cache = NULL; + kmem_cache_t *last_data_cache = NULL; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + if (zio_buf_cache[c] != last_cache) { + last_cache = zio_buf_cache[c]; + kmem_cache_destroy(zio_buf_cache[c]); + } + zio_buf_cache[c] = NULL; + + if (zio_data_buf_cache[c] != last_data_cache) { + last_data_cache = zio_data_buf_cache[c]; + kmem_cache_destroy(zio_data_buf_cache[c]); + } + zio_data_buf_cache[c] = NULL; + } + + kmem_cache_destroy(zio_cache); + + zio_inject_fini(); +} + +/* + * ========================================================================== + * Allocate and free I/O buffers + * ========================================================================== + */ + +/* + * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a + * crashdump if the kernel panics, so use it judiciously. Obviously, it's + * useful to inspect ZFS metadata, but if possible, we should avoid keeping + * excess / transient data in-core during a crashdump. + */ +void * +zio_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); +} + +/* + * Use zio_data_buf_alloc to allocate data. The data will not appear in a + * crashdump if the kernel panics. This exists so that we will limit the amount + * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount + * of kernel heap dumped to disk when the kernel panics) + */ +void * +zio_data_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); +} + +void +zio_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_buf_cache[c], buf); +} + +void +zio_data_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_data_buf_cache[c], buf); +} + +/* + * ========================================================================== + * Push and pop I/O transform buffers + * ========================================================================== + */ +static void +zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, + zio_transform_func_t *transform) +{ + zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + + zt->zt_orig_data = zio->io_data; + zt->zt_orig_size = zio->io_size; + zt->zt_bufsize = bufsize; + zt->zt_transform = transform; + + zt->zt_next = zio->io_transform_stack; + zio->io_transform_stack = zt; + + zio->io_data = data; + zio->io_size = size; +} + +static void +zio_pop_transforms(zio_t *zio) +{ + zio_transform_t *zt; + + while ((zt = zio->io_transform_stack) != NULL) { + if (zt->zt_transform != NULL) + zt->zt_transform(zio, + zt->zt_orig_data, zt->zt_orig_size); + + zio_buf_free(zio->io_data, zt->zt_bufsize); + + zio->io_data = zt->zt_orig_data; + zio->io_size = zt->zt_orig_size; + zio->io_transform_stack = zt->zt_next; + + kmem_free(zt, sizeof (zio_transform_t)); + } +} + +/* + * ========================================================================== + * I/O transform callbacks for subblocks and decompression + * ========================================================================== + */ +static void +zio_subblock(zio_t *zio, void *data, uint64_t size) +{ + ASSERT(zio->io_size > size); + + if (zio->io_type == ZIO_TYPE_READ) + bcopy(zio->io_data, data, size); +} + +static void +zio_decompress(zio_t *zio, void *data, uint64_t size) +{ + if (zio->io_error == 0 && + zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_data, zio->io_size, data, size) != 0) + zio->io_error = EIO; +} + +/* + * ========================================================================== + * I/O parent/child relationships and pipeline interlocks + * ========================================================================== + */ + +static void +zio_add_child(zio_t *pio, zio_t *zio) +{ + mutex_enter(&pio->io_lock); + if (zio->io_stage < ZIO_STAGE_READY) + pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; + if (zio->io_stage < ZIO_STAGE_DONE) + pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + zio->io_sibling_prev = NULL; + zio->io_sibling_next = pio->io_child; + if (pio->io_child != NULL) + pio->io_child->io_sibling_prev = zio; + pio->io_child = zio; + zio->io_parent = pio; + mutex_exit(&pio->io_lock); +} + +static void +zio_remove_child(zio_t *pio, zio_t *zio) +{ + zio_t *next, *prev; + + ASSERT(zio->io_parent == pio); + + mutex_enter(&pio->io_lock); + next = zio->io_sibling_next; + prev = zio->io_sibling_prev; + if (next != NULL) + next->io_sibling_prev = prev; + if (prev != NULL) + prev->io_sibling_next = next; + if (pio->io_child == zio) + pio->io_child = next; + mutex_exit(&pio->io_lock); +} + +static boolean_t +zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) +{ + uint64_t *countp = &zio->io_children[child][wait]; + boolean_t waiting = B_FALSE; + + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stall == NULL); + if (*countp != 0) { + zio->io_stage--; + zio->io_stall = countp; + waiting = B_TRUE; + } + mutex_exit(&zio->io_lock); + + return (waiting); +} + +static void +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) +{ + uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; + int *errorp = &pio->io_child_error[zio->io_child_type]; + + mutex_enter(&pio->io_lock); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + *errorp = zio_worst_error(*errorp, zio->io_error); + pio->io_reexecute |= zio->io_reexecute; + ASSERT3U(*countp, >, 0); + if (--*countp == 0 && pio->io_stall == countp) { + pio->io_stall = NULL; + mutex_exit(&pio->io_lock); + zio_execute(pio); + } else { + mutex_exit(&pio->io_lock); + } +} + +static void +zio_inherit_child_errors(zio_t *zio, enum zio_child c) +{ + if (zio->io_child_error[c] != 0 && zio->io_error == 0) + zio->io_error = zio->io_child_error[c]; +} + +/* + * ========================================================================== + * Create the various types of I/O (read, write, free, etc) + * ========================================================================== + */ +static zio_t * +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, + const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) +{ + zio_t *zio; + + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); + ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); + ASSERT(vd || stage == ZIO_STAGE_OPEN); + + zio = kmem_cache_alloc(zio_cache, KM_SLEEP); + bzero(zio, sizeof (zio_t)); + + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + if (vd != NULL) + zio->io_child_type = ZIO_CHILD_VDEV; + else if (flags & ZIO_FLAG_GANG_CHILD) + zio->io_child_type = ZIO_CHILD_GANG; + else + zio->io_child_type = ZIO_CHILD_LOGICAL; + + if (bp != NULL) { + zio->io_bp = bp; + zio->io_bp_copy = *bp; + zio->io_bp_orig = *bp; + if (type != ZIO_TYPE_WRITE) + zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; + zio->io_logical = zio; + } + } + + zio->io_spa = spa; + zio->io_txg = txg; + zio->io_data = data; + zio->io_size = size; + zio->io_done = done; + zio->io_private = private; + zio->io_type = type; + zio->io_priority = priority; + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_orig_flags = zio->io_flags = flags; + zio->io_orig_stage = zio->io_stage = stage; + zio->io_orig_pipeline = zio->io_pipeline = pipeline; + + if (zb != NULL) + zio->io_bookmark = *zb; + + if (pio != NULL) { + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT(zio->io_child_type <= pio->io_child_type); + if (zio->io_logical == NULL) + zio->io_logical = pio->io_logical; + zio_add_child(pio, zio); + } + + return (zio); +} + +static void +zio_destroy(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + uint8_t async_root = zio->io_async_root; + + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + kmem_cache_free(zio_cache, zio); + + if (async_root) { + mutex_enter(&spa->spa_async_root_lock); + if (--spa->spa_async_root_count == 0) + cv_broadcast(&spa->spa_async_root_cv); + mutex_exit(&spa->spa_async_root_lock); + } +} + +zio_t * +zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, + int flags) +{ + zio_t *zio; + + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); + + return (zio); +} + +zio_t * +zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) +{ + return (zio_null(NULL, spa, done, private, flags)); +} + +zio_t * +zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb) +{ + zio_t *zio; + + zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, + data, size, done, private, + ZIO_TYPE_READ, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + + return (zio); +} + +void +zio_skip_write(zio_t *zio) +{ + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_stage == ZIO_STAGE_READY); + ASSERT(!BP_IS_GANG(zio->io_bp)); + + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; +} + +zio_t * +zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb) +{ + zio_t *zio; + + ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && + zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && + zp->zp_compress >= ZIO_COMPRESS_OFF && + zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && + zp->zp_type < DMU_OT_NUMTYPES && + zp->zp_level < 32 && + zp->zp_ndvas > 0 && + zp->zp_ndvas <= spa_max_replication(spa)); + ASSERT(ready != NULL); + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + + zio->io_ready = ready; + zio->io_prop = *zp; + + return (zio); +} + +zio_t * +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) +{ + zio_t *zio; + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + + return (zio); +} + +zio_t * +zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, int flags) +{ + zio_t *zio; + + ASSERT(!BP_IS_HOLE(bp)); + + if (bp->blk_fill == BLK_FILL_ALREADY_FREED) + return (zio_null(pio, spa, NULL, NULL, flags)); + + if (txg == spa->spa_syncing_txg && + spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { + bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); + return (zio_null(pio, spa, NULL, NULL, flags)); + } + + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, + NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); + + return (zio); +} + +zio_t * +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, int flags) +{ + zio_t *zio; + + /* + * A claim is an allocation of a specific block. Claims are needed + * to support immediate writes in the intent log. The issue is that + * immediate writes contain committed data, but in a txg that was + * *not* committed. Upon opening the pool after an unclean shutdown, + * the intent log claims all blocks that contain immediate write data + * so that the SPA knows they're in use. + * + * All claims *must* be resolved in the first txg -- before the SPA + * starts allocating blocks -- so that nothing is allocated twice. + */ + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); + ASSERT3U(spa_first_txg(spa), <=, txg); + + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, + NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + + return (zio); +} + +zio_t * +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + int c; + + if (vd->vdev_children == 0) { + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + + zio->io_cmd = cmd; + } else { + zio = zio_null(pio, spa, NULL, NULL, flags); + + for (c = 0; c < vd->vdev_children; c++) + zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, + done, private, priority, flags)); + } + + return (zio); +} + +zio_t * +zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + void *data, int checksum, zio_done_func_t *done, void *private, + int priority, int flags, boolean_t labels) +{ + zio_t *zio; + + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, + ZIO_TYPE_READ, priority, flags, vd, offset, NULL, + ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + + zio->io_prop.zp_checksum = checksum; + + return (zio); +} + +zio_t * +zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + void *data, int checksum, zio_done_func_t *done, void *private, + int priority, int flags, boolean_t labels) +{ + zio_t *zio; + + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, + ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + + zio->io_prop.zp_checksum = checksum; + + if (zio_checksum_table[checksum].ci_zbt) { + /* + * zbt checksums are necessarily destructive -- they modify + * the end of the write buffer to hold the verifier/checksum. + * Therefore, we must make a local copy in case the data is + * being written to multiple places in parallel. + */ + void *wbuf = zio_buf_alloc(size); + bcopy(data, wbuf, size); + zio_push_transform(zio, wbuf, size, size, NULL); + } + + return (zio); +} + +/* + * Create a child I/O to do some work for us. + */ +zio_t * +zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, int flags, + zio_done_func_t *done, void *private) +{ + uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; + zio_t *zio; + + ASSERT(vd->vdev_parent == + (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); + + if (type == ZIO_TYPE_READ && bp != NULL) { + /* + * If we have the bp, then the child should perform the + * checksum and the parent need not. This pushes error + * detection as close to the leaves as possible and + * eliminates redundant checksums in the interior nodes. + */ + pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; + pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + } + + if (vd->vdev_children == 0) + offset += VDEV_LABEL_START_SIZE; + + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, + done, private, type, priority, + (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags, + vd, offset, &pio->io_bookmark, + ZIO_STAGE_VDEV_IO_START - 1, pipeline); + + return (zio); +} + +zio_t * +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, + int type, int priority, int flags, zio_done_func_t *done, void *private) +{ + zio_t *zio; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + zio = zio_create(NULL, vd->vdev_spa, 0, NULL, + data, size, done, private, type, priority, + flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, + vd, offset, NULL, + ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE); + + return (zio); +} + +void +zio_flush(zio_t *zio, vdev_t *vd) +{ + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); +} + +/* + * ========================================================================== + * Prepare to read and write logical blocks + * ========================================================================== + */ + +static int +zio_read_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { + uint64_t csize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize, zio_decompress); + } + + if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_write_bp_init(zio_t *zio) +{ + zio_prop_t *zp = &zio->io_prop; + int compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + void *cbuf; + uint64_t lsize = zio->io_size; + uint64_t csize = lsize; + uint64_t cbufsize = 0; + int pass = 1; + + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + + if (!IO_IS_ALLOCATING(zio)) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT(compress != ZIO_COMPRESS_INHERIT); + + if (bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. But compression changes + * the blocksize, which forces a reallocate, and makes + * convergence take longer. Therefore, after the first + * few passes, stop compressing to ensure convergence. + */ + pass = spa_sync_pass(zio->io_spa); + ASSERT(pass > 1); + + if (pass > SYNC_PASS_DONT_COMPRESS) + compress = ZIO_COMPRESS_OFF; + + /* + * Only MOS (objset 0) data should need to be rewritten. + */ + ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); + + /* Make sure someone doesn't change their mind on overwrites */ + ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), + spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); + } + + if (compress != ZIO_COMPRESS_OFF) { + if (!zio_compress_data(compress, zio->io_data, zio->io_size, + &cbuf, &csize, &cbufsize)) { + compress = ZIO_COMPRESS_OFF; + } else if (csize != 0) { + zio_push_transform(zio, cbuf, csize, cbufsize, NULL); + } + } + + /* + * The final pass of spa_sync() must be all rewrites, but the first + * few passes offer a trade-off: allocating blocks defers convergence, + * but newly allocated blocks are sequential, so they can be written + * to disk faster. Therefore, we allow the first few passes of + * spa_sync() to allocate new blocks, but force rewrites after that. + * There should only be a handful of blocks after pass 1 in any case. + */ + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + pass > SYNC_PASS_REWRITE) { + ASSERT(csize != 0); + uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; + zio->io_flags |= ZIO_FLAG_IO_REWRITE; + } else { + BP_ZERO(bp); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + } + + if (csize == 0) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, csize); + BP_SET_COMPRESS(bp, compress); + BP_SET_CHECKSUM(bp, zp->zp_checksum); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + } + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * ========================================================================== + * Execute the I/O pipeline + * ========================================================================== + */ + +static void +zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) +{ + zio_type_t t = zio->io_type; + + /* + * If we're a config writer, the normal issue and interrupt threads + * may all be blocked waiting for the config lock. In this case, + * select the otherwise-unused taskq for ZIO_TYPE_NULL. + */ + if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) + t = ZIO_TYPE_NULL; + + /* + * A similar issue exists for the L2ARC write thread until L2ARC 2.0. + */ + if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) + t = ZIO_TYPE_NULL; + + (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, TQ_SLEEP); +} + +static boolean_t +zio_taskq_member(zio_t *zio, enum zio_taskq_type q) +{ + kthread_t *executor = zio->io_executor; + spa_t *spa = zio->io_spa; + + for (zio_type_t t = 0; t < ZIO_TYPES; t++) + if (taskq_member(spa->spa_zio_taskq[t][q], executor)) + return (B_TRUE); + + return (B_FALSE); +} + +static int +zio_issue_async(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + + return (ZIO_PIPELINE_STOP); +} + +void +zio_interrupt(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); +} + +/* + * Execute the I/O pipeline until one of the following occurs: + * (1) the I/O completes; (2) the pipeline stalls waiting for + * dependent child I/Os; (3) the I/O issues, so we're waiting + * for an I/O completion interrupt; (4) the I/O is delegated by + * vdev-level caching or aggregation; (5) the I/O is deferred + * due to vdev-level queueing; (6) the I/O is handed off to + * another thread. In all cases, the pipeline stops whenever + * there's no CPU work; it never burns a thread in cv_wait(). + * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. + */ +static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES]; + +void +zio_execute(zio_t *zio) +{ + zio->io_executor = curthread; + + while (zio->io_stage < ZIO_STAGE_DONE) { + uint32_t pipeline = zio->io_pipeline; + zio_stage_t stage = zio->io_stage; + int rv; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + + while (((1U << ++stage) & pipeline) == 0) + continue; + + ASSERT(stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stall == NULL); + + /* + * If we are in interrupt context and this pipeline stage + * will grab a config lock that is held across I/O, + * issue async to avoid deadlock. + */ + if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) && + zio->io_vd == NULL && + zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + return; + } + + zio->io_stage = stage; + rv = zio_pipeline[stage](zio); + + if (rv == ZIO_PIPELINE_STOP) + return; + + ASSERT(rv == ZIO_PIPELINE_CONTINUE); + } +} + +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + int error; + + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_executor == NULL); + + zio->io_waiter = curthread; + + zio_execute(zio); + + mutex_enter(&zio->io_lock); + while (zio->io_executor != NULL) + cv_wait(&zio->io_cv, &zio->io_lock); + mutex_exit(&zio->io_lock); + + error = zio->io_error; + zio_destroy(zio); + + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + ASSERT(zio->io_executor == NULL); + + if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + /* + * This is a logical async I/O with no parent to wait for it. + * Attach it to the pool's global async root zio so that + * spa_unload() has a way of waiting for async I/O to finish. + */ + spa_t *spa = zio->io_spa; + zio->io_async_root = B_TRUE; + mutex_enter(&spa->spa_async_root_lock); + spa->spa_async_root_count++; + mutex_exit(&spa->spa_async_root_lock); + } + + zio_execute(zio); +} + +/* + * ========================================================================== + * Reexecute or suspend/resume failed I/O + * ========================================================================== + */ + +static void +zio_reexecute(zio_t *pio) +{ + zio_t *zio, *zio_next; + + pio->io_flags = pio->io_orig_flags; + pio->io_stage = pio->io_orig_stage; + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_reexecute = 0; + pio->io_error = 0; + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + pio->io_child_error[c] = 0; + + if (IO_IS_ALLOCATING(pio)) { + /* + * Remember the failed bp so that the io_ready() callback + * can update its accounting upon reexecution. The block + * was already freed in zio_done(); we indicate this with + * a fill count of -1 so that zio_free() knows to skip it. + */ + blkptr_t *bp = pio->io_bp; + ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); + bp->blk_fill = BLK_FILL_ALREADY_FREED; + pio->io_bp_orig = *bp; + BP_ZERO(bp); + } + + /* + * As we reexecute pio's children, new children could be created. + * New children go to the head of the io_child list, however, + * so we will (correctly) not reexecute them. The key is that + * the remainder of the io_child list, from 'zio_next' onward, + * cannot be affected by any side effects of reexecuting 'zio'. + */ + for (zio = pio->io_child; zio != NULL; zio = zio_next) { + zio_next = zio->io_sibling_next; + mutex_enter(&pio->io_lock); + pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; + pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + mutex_exit(&pio->io_lock); + zio_reexecute(zio); + } + + /* + * Now that all children have been reexecuted, execute the parent. + */ + zio_execute(pio); +} + +void +zio_suspend(spa_t *spa, zio_t *zio) +{ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) + fm_panic("Pool '%s' has encountered an uncorrectable I/O " + "failure and the failure mode property for this pool " + "is set to panic.", spa_name(spa)); + + zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); + + mutex_enter(&spa->spa_suspend_lock); + + if (spa->spa_suspend_zio_root == NULL) + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + + spa->spa_suspended = B_TRUE; + + if (zio != NULL) { + ASSERT(zio != spa->spa_suspend_zio_root); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_parent == NULL); + ASSERT(zio->io_stage == ZIO_STAGE_DONE); + zio_add_child(spa->spa_suspend_zio_root, zio); + } + + mutex_exit(&spa->spa_suspend_lock); +} + +void +zio_resume(spa_t *spa) +{ + zio_t *pio, *zio; + + /* + * Reexecute all previously suspended i/o. + */ + mutex_enter(&spa->spa_suspend_lock); + spa->spa_suspended = B_FALSE; + cv_broadcast(&spa->spa_suspend_cv); + pio = spa->spa_suspend_zio_root; + spa->spa_suspend_zio_root = NULL; + mutex_exit(&spa->spa_suspend_lock); + + if (pio == NULL) + return; + + while ((zio = pio->io_child) != NULL) { + zio_remove_child(pio, zio); + zio->io_parent = NULL; + zio_reexecute(zio); + } + + ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); + + (void) zio_wait(pio); +} + +void +zio_resume_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_suspend_lock); + while (spa_suspended(spa)) + cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + mutex_exit(&spa->spa_suspend_lock); +} + +/* + * ========================================================================== + * Gang blocks. + * + * A gang block is a collection of small blocks that looks to the DMU + * like one large block. When zio_dva_allocate() cannot find a block + * of the requested size, due to either severe fragmentation or the pool + * being nearly full, it calls zio_write_gang_block() to construct the + * block from smaller fragments. + * + * A gang block consists of a gang header (zio_gbh_phys_t) and up to + * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like + * an indirect block: it's an array of block pointers. It consumes + * only one sector and hence is allocatable regardless of fragmentation. + * The gang header's bps point to its gang members, which hold the data. + * + * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> + * as the verifier to ensure uniqueness of the SHA256 checksum. + * Critically, the gang block bp's blk_cksum is the checksum of the data, + * not the gang header. This ensures that data block signatures (needed for + * deduplication) are independent of how the block is physically stored. + * + * Gang blocks can be nested: a gang member may itself be a gang block. + * Thus every gang block is a tree in which root and all interior nodes are + * gang headers, and the leaves are normal blocks that contain user data. + * The root of the gang tree is called the gang leader. + * + * To perform any operation (read, rewrite, free, claim) on a gang block, + * zio_gang_assemble() first assembles the gang tree (minus data leaves) + * in the io_gang_tree field of the original logical i/o by recursively + * reading the gang leader and all gang headers below it. This yields + * an in-core tree containing the contents of every gang header and the + * bps for every constituent of the gang block. + * + * With the gang tree now assembled, zio_gang_issue() just walks the gang tree + * and invokes a callback on each bp. To free a gang block, zio_gang_issue() + * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. + * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). + * zio_read_gang() is a wrapper around zio_read() that omits reading gang + * headers, since we already have those in io_gang_tree. zio_rewrite_gang() + * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() + * of the gang header plus zio_checksum_compute() of the data to update the + * gang header's blk_cksum as described above. + * + * The two-phase assemble/issue model solves the problem of partial failure -- + * what if you'd freed part of a gang block but then couldn't read the + * gang header for another part? Assembling the entire gang tree first + * ensures that all the necessary gang header I/O has succeeded before + * starting the actual work of free, claim, or write. Once the gang tree + * is assembled, free and claim are in-memory operations that cannot fail. + * + * In the event that a gang write fails, zio_dva_unallocate() walks the + * gang tree to immediately free (i.e. insert back into the space map) + * everything we've allocated. This ensures that we don't get ENOSPC + * errors during repeated suspend/resume cycles due to a flaky device. + * + * Gang rewrites only happen during sync-to-convergence. If we can't assemble + * the gang tree, we won't modify the block, so we can safely defer the free + * (knowing that the block is still intact). If we *can* assemble the gang + * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free + * each constituent bp and we can allocate a new block on the next sync pass. + * + * In all cases, the gang tree allows complete recovery from partial failure. + * ========================================================================== + */ + +static zio_t * +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +{ + if (gn != NULL) + return (pio); + + return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), + NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); +} + +zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +{ + zio_t *zio; + + if (gn != NULL) { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + /* + * As we rewrite each gang header, the pipeline will compute + * a new gang block header checksum for it; but no one will + * compute a new data checksum, so we do that here. The one + * exception is the gang leader: the pipeline already computed + * its data checksum because that stage precedes gang assembly. + * (Presently, nothing actually uses interior data checksums; + * this is just good hygiene.) + */ + if (gn != pio->io_logical->io_gang_tree) { + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), + data, BP_GET_PSIZE(bp)); + } + } else { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + } + + return (zio); +} + +/* ARGSUSED */ +zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +{ + return (zio_free(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} + +/* ARGSUSED */ +zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +{ + return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} + +static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { + NULL, + zio_read_gang, + zio_rewrite_gang, + zio_free_gang, + zio_claim_gang, + NULL +}; + +static void zio_gang_tree_assemble_done(zio_t *zio); + +static zio_gang_node_t * +zio_gang_node_alloc(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn; + + ASSERT(*gnpp == NULL); + + gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); + gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + *gnpp = gn; + + return (gn); +} + +static void +zio_gang_node_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + ASSERT(gn->gn_child[g] == NULL); + + zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); + kmem_free(gn, sizeof (*gn)); + *gnpp = NULL; +} + +static void +zio_gang_tree_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; + + if (gn == NULL) + return; + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + zio_gang_tree_free(&gn->gn_child[g]); + + zio_gang_node_free(gnpp); +} + +static void +zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + + ASSERT(lio->io_logical == lio); + ASSERT(BP_IS_GANG(bp)); + + zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, + lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); +} + +static void +zio_gang_tree_assemble_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_gang_node_t *gn = zio->io_private; + blkptr_t *bp = zio->io_bp; + + ASSERT(zio->io_parent == lio); + ASSERT(zio->io_child == NULL); + + if (zio->io_error) + return; + + if (BP_SHOULD_BYTESWAP(bp)) + byteswap_uint64_array(zio->io_data, zio->io_size); + + ASSERT(zio->io_data == gn->gn_gbh); + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (!BP_IS_GANG(gbp)) + continue; + zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + } +} + +static void +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +{ + zio_t *lio = pio->io_logical; + zio_t *zio; + + ASSERT(BP_IS_GANG(bp) == !!gn); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); + + /* + * If you're a gang header, your data is in gn->gn_gbh. + * If you're a gang member, your data is in 'data' and gn == NULL. + */ + zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); + + if (gn != NULL) { + ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (BP_IS_HOLE(gbp)) + continue; + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); + data = (char *)data + BP_GET_PSIZE(gbp); + } + } + + if (gn == lio->io_gang_tree) + ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + + if (zio != pio) + zio_nowait(zio); +} + +static int +zio_gang_assemble(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); + + zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_gang_issue(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(BP_IS_GANG(bp) && zio == lio); + + if (zio->io_child_error[ZIO_CHILD_GANG] == 0) + zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + else + zio_gang_tree_free(&lio->io_gang_tree); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + return (ZIO_PIPELINE_CONTINUE); +} + +static void +zio_write_gang_member_ready(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + zio_t *lio = zio->io_logical; + dva_t *cdva = zio->io_bp->blk_dva; + dva_t *pdva = pio->io_bp->blk_dva; + uint64_t asize; + + if (BP_IS_HOLE(zio->io_bp)) + return; + + ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + + ASSERT(zio->io_child_type == ZIO_CHILD_GANG); + ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); + ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + + mutex_enter(&pio->io_lock); + for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { + ASSERT(DVA_GET_GANG(&pdva[d])); + asize = DVA_GET_ASIZE(&pdva[d]); + asize += DVA_GET_ASIZE(&cdva[d]); + DVA_SET_ASIZE(&pdva[d], asize); + } + mutex_exit(&pio->io_lock); +} + +static int +zio_write_gang_block(zio_t *pio) +{ + spa_t *spa = pio->io_spa; + blkptr_t *bp = pio->io_bp; + zio_t *lio = pio->io_logical; + zio_t *zio; + zio_gang_node_t *gn, **gnpp; + zio_gbh_phys_t *gbh; + uint64_t txg = pio->io_txg; + uint64_t resid = pio->io_size; + uint64_t lsize; + int ndvas = lio->io_prop.zp_ndvas; + int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + zio_prop_t zp; + int error; + + error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, + bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); + if (error) { + pio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } + + if (pio == lio) { + gnpp = &lio->io_gang_tree; + } else { + gnpp = pio->io_private; + ASSERT(pio->io_ready == zio_write_gang_member_ready); + } + + gn = zio_gang_node_alloc(gnpp); + gbh = gn->gn_gbh; + bzero(gbh, SPA_GANGBLOCKSIZE); + + /* + * Create the gang header. + */ + zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + /* + * Create and nowait the gang children. + */ + for (int g = 0; resid != 0; resid -= lsize, g++) { + lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), + SPA_MINBLOCKSIZE); + ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); + + zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_type = DMU_OT_NONE; + zp.zp_level = 0; + zp.zp_ndvas = lio->io_prop.zp_ndvas; + + zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, + zio_write_gang_member_ready, NULL, &gn->gn_child[g], + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); + } + + /* + * Set pio's pipeline to just wait for zio to finish. + */ + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + zio_nowait(zio); + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ + +static int +zio_dva_allocate(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + metaslab_class_t *mc = spa->spa_normal_class; + blkptr_t *bp = zio->io_bp; + int error; + + ASSERT(BP_IS_HOLE(bp)); + ASSERT3U(BP_GET_NDVAS(bp), ==, 0); + ASSERT3U(zio->io_prop.zp_ndvas, >, 0); + ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); + + if (error) { + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) + return (zio_write_gang_block(zio)); + zio->io_error = error; + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_dva_free(zio_t *zio) +{ + metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_dva_claim(zio_t *zio) +{ + int error; + + error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + if (error) + zio->io_error = error; + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * Undo an allocation. This is used by zio_done() when an I/O fails + * and we want to give back the block we just allocated. + * This handles both normal blocks and gang blocks. + */ +static void +zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) +{ + spa_t *spa = zio->io_spa; + boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); + + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + + if (zio->io_bp == bp && !now) { + /* + * This is a rewrite for sync-to-convergence. + * We can't do a metaslab_free(NOW) because bp wasn't allocated + * during this sync pass, which means that metaslab_sync() + * already committed the allocation. + */ + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), + BP_IDENTITY(&zio->io_bp_orig))); + ASSERT(spa_sync_pass(spa) > 1); + + if (BP_IS_GANG(bp) && gn == NULL) { + /* + * This is a gang leader whose gang header(s) we + * couldn't read now, so defer the free until later. + * The block should still be intact because without + * the headers, we'd never even start the rewrite. + */ + bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); + return; + } + } + + if (!BP_IS_HOLE(bp)) + metaslab_free(spa, bp, bp->blk_birth, now); + + if (gn != NULL) { + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + zio_dva_unallocate(zio, gn->gn_child[g], + &gn->gn_gbh->zg_blkptr[g]); + } + } +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, + uint64_t txg) +{ + int error; + + error = metaslab_alloc(spa, spa->spa_log_class, size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + + if (error) + error = metaslab_alloc(spa, spa->spa_normal_class, size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + + if (error == 0) { + BP_SET_LSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, size); + BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(new_bp, 0); + BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + } + + return (error); +} + +/* + * Free an intent log block. We know it can't be a gang block, so there's + * nothing to do except metaslab_free() it. + */ +void +zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +{ + ASSERT(!BP_IS_GANG(bp)); + + metaslab_free(spa, bp, txg, B_FALSE); +} + +/* + * ========================================================================== + * Read and write to physical devices + * ========================================================================== + */ + +static void +zio_vdev_io_probe_done(zio_t *zio) +{ + zio_t *dio; + vdev_t *vd = zio->io_private; + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((dio = zio->io_delegate_list) != NULL) { + zio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + if (!vdev_accessible(vd, dio)) + dio->io_error = ENXIO; + zio_execute(dio); + } +} + +/* + * Probe the device to determine whether I/O failure is specific to this + * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). + */ +static int +zio_vdev_io_probe(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + zio_t *pio = NULL; + boolean_t created_pio = B_FALSE; + + /* + * Don't probe the probe. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) + return (ZIO_PIPELINE_CONTINUE); + + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will join the probe zio's io_delegate_list. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vd->vdev_probe_zio = pio = zio_root(zio->io_spa, + zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); + created_pio = B_TRUE; + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); + } + + zio->io_delegate_next = pio->io_delegate_list; + pio->io_delegate_list = zio; + + mutex_exit(&vd->vdev_probe_lock); + + if (created_pio) { + zio_nowait(vdev_probe(vd, pio)); + zio_nowait(pio); + } + + return (ZIO_PIPELINE_STOP); +} + +static int +zio_vdev_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t align; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_error == 0); + ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + + if (vd == NULL) { + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + + /* + * The mirror_ops handle multiple DVAs in a single BP. + */ + return (vdev_mirror_ops.vdev_op_io_start(zio)); + } + + align = 1ULL << vd->vdev_top->vdev_ashift; + + if (P2PHASE(zio->io_size, align) != 0) { + uint64_t asize = P2ROUNDUP(zio->io_size, align); + char *abuf = zio_buf_alloc(asize); + ASSERT(vd == vd->vdev_top); + if (zio->io_type == ZIO_TYPE_WRITE) { + bcopy(zio->io_data, abuf, zio->io_size); + bzero(abuf + zio->io_size, asize - zio->io_size); + } + zio_push_transform(zio, abuf, asize, asize, zio_subblock); + } + + ASSERT(P2PHASE(zio->io_offset, align) == 0); + ASSERT(P2PHASE(zio->io_size, align) == 0); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + + if (vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return (ZIO_PIPELINE_STOP); + + if ((zio = vdev_queue_io(zio)) == NULL) + return (ZIO_PIPELINE_STOP); + + if (!vdev_accessible(vd, zio)) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + } + + } + + return (vd->vdev_ops->vdev_op_io_start(zio)); +} + +static int +zio_vdev_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; + boolean_t unexpected_error = B_FALSE; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(vd, EIO); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_label_injection(zio, EIO); + + if (zio->io_error) { + if (!vdev_accessible(vd, zio)) { + zio->io_error = ENXIO; + } else { + unexpected_error = B_TRUE; + } + } + } + + ops->vdev_op_io_done(zio); + + if (unexpected_error) + return (zio_vdev_io_probe(zio)); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_vdev_io_assess(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_exit(zio->io_spa, SCL_ZIO, zio); + + if (zio->io_vsd != NULL) { + zio->io_vsd_free(zio); + zio->io_vsd = NULL; + } + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_fault_injection(zio, EIO); + + /* + * If the I/O failed, determine whether we should attempt to retry it. + */ + if (zio->io_error && vd == NULL && + !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { + ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ + zio->io_error = 0; + zio->io_flags |= ZIO_FLAG_IO_RETRY | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + return (ZIO_PIPELINE_STOP); + } + + /* + * If we got an error on a leaf device, convert it to ENXIO + * if the device is not accessible at all. + */ + if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && + !vdev_accessible(vd, zio)) + zio->io_error = ENXIO; + + /* + * If we can't write to an interior vdev (mirror or RAID-Z), + * set vdev_cant_write so that we stop trying to allocate from it. + */ + if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && + vd != NULL && !vd->vdev_ops->vdev_op_leaf) + vd->vdev_cant_write = B_TRUE; + + if (zio->io_error) + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + return (ZIO_PIPELINE_CONTINUE); +} + +void +zio_vdev_io_reissue(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_stage--; +} + +void +zio_vdev_io_redone(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); + + zio->io_stage--; +} + +void +zio_vdev_io_bypass(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_flags |= ZIO_FLAG_IO_BYPASS; + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; +} + +/* + * ========================================================================== + * Generate and verify checksums + * ========================================================================== + */ +static int +zio_checksum_generate(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum; + + if (bp == NULL) { + /* + * This is zio_write_phys(). + * We're either generating a label checksum, or none at all. + */ + checksum = zio->io_prop.zp_checksum; + + if (checksum == ZIO_CHECKSUM_OFF) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT(checksum == ZIO_CHECKSUM_LABEL); + } else { + if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { + ASSERT(!IO_IS_ALLOCATING(zio)); + checksum = ZIO_CHECKSUM_GANG_HEADER; + } else { + checksum = BP_GET_CHECKSUM(bp); + } + } + + zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_checksum_verify(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + int error; + + if (bp == NULL) { + /* + * This is zio_read_phys(). + * We're either verifying a label checksum, or nothing at all. + */ + if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + } + + if ((error = zio_checksum_error(zio)) != 0) { + zio->io_error = error; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, zio->io_vd, zio, 0, 0); + } + } + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * Called by RAID-Z to ensure we don't compute the checksum twice. + */ +void +zio_checksum_verified(zio_t *zio) +{ + zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); +} + +/* + * ========================================================================== + * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. + * An error of 0 indictes success. ENXIO indicates whole-device failure, + * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO + * indicate errors that are specific to one I/O, and most likely permanent. + * Any other error is presumed to be worse because we weren't expecting it. + * ========================================================================== + */ +int +zio_worst_error(int e1, int e2) +{ + static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; + int r1, r2; + + for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) + if (e1 == zio_error_rank[r1]) + break; + + for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) + if (e2 == zio_error_rank[r2]) + break; + + return (r1 > r2 ? e1 : e2); +} + +/* + * ========================================================================== + * I/O completion + * ========================================================================== + */ +static int +zio_ready(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + zio_t *pio = zio->io_parent; + + if (zio->io_ready) { + if (BP_IS_GANG(bp) && + zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); + + zio->io_ready(zio); + } + + if (bp != NULL && bp != &zio->io_bp_copy) + zio->io_bp_copy = *bp; + + if (zio->io_error) + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (pio != NULL) + zio_notify_parent(pio, zio, ZIO_WAIT_READY); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *pio = zio->io_parent; + zio_t *lio = zio->io_logical; + blkptr_t *bp = zio->io_bp; + vdev_t *vd = zio->io_vd; + uint64_t psize = zio->io_size; + + /* + * If our of children haven't all completed, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || + zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || + zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + ASSERT(zio->io_children[c][w] == 0); + + if (bp != NULL) { + ASSERT(bp->blk_pad[0] == 0); + ASSERT(bp->blk_pad[1] == 0); + ASSERT(bp->blk_pad[2] == 0); + ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || + (pio != NULL && bp == pio->io_bp)); + if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp)); + ASSERT(BP_COUNT_GANG(bp) == 0 || + (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); + } + } + + /* + * If there were child vdev or gang errors, they apply to us now. + */ + zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); + zio_inherit_child_errors(zio, ZIO_CHILD_GANG); + + zio_pop_transforms(zio); /* note: may set zio->io_error */ + + vdev_stat_update(zio, psize); + + if (zio->io_error) { + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); + + if ((zio->io_error == EIO || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { + /* + * For logical I/O requests, tell the SPA to log the + * error and generate a logical data ereport. + */ + spa_log_error(spa, zio); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, + 0, 0); + } + } + + if (zio->io_error && zio == lio) { + /* + * Determine whether zio should be reexecuted. This will + * propagate all the way to the root via zio_notify_parent(). + */ + ASSERT(vd == NULL && bp != NULL); + + if (IO_IS_ALLOCATING(zio)) + if (zio->io_error != ENOSPC) + zio->io_reexecute |= ZIO_REEXECUTE_NOW; + else + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if ((zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_FREE) && + zio->io_error == ENXIO && + spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + } + + /* + * If there were logical child errors, they apply to us now. + * We defer this until now to avoid conflating logical child + * errors with errors that happened to the zio itself when + * updating vdev stats and reporting FMA events above. + */ + zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + + if (zio->io_reexecute) { + /* + * This is a logical I/O that wants to reexecute. + * + * Reexecute is top-down. When an i/o fails, if it's not + * the root, it simply notifies its parent and sticks around. + * The parent, seeing that it still has children in zio_done(), + * does the same. This percolates all the way up to the root. + * The root i/o will reexecute or suspend the entire tree. + * + * This approach ensures that zio_reexecute() honors + * all the original i/o dependency relationships, e.g. + * parents not executing until children are ready. + */ + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (IO_IS_ALLOCATING(zio)) + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + if (pio != NULL) { + /* + * We're not a root i/o, so there's nothing to do + * but notify our parent. Don't propagate errors + * upward since we haven't permanently failed yet. + */ + zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + /* + * We'd fail again if we reexecuted now, so suspend + * until conditions improve (e.g. device comes online). + */ + zio_suspend(spa, zio); + } else { + /* + * Reexecution is potentially a huge amount of work. + * Hand it off to the otherwise-unused claim taskq. + */ + (void) taskq_dispatch( + spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], + (task_func_t *)zio_reexecute, zio, TQ_SLEEP); + } + return (ZIO_PIPELINE_STOP); + } + + ASSERT(zio->io_child == NULL); + ASSERT(zio->io_reexecute == 0); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + + if (zio->io_done) + zio->io_done(zio); + + zio_gang_tree_free(&zio->io_gang_tree); + + ASSERT(zio->io_delegate_list == NULL); + ASSERT(zio->io_delegate_next == NULL); + + if (pio != NULL) { + zio_remove_child(pio, zio); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + zio->io_executor = NULL; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + zio_destroy(zio); + } + + return (ZIO_PIPELINE_STOP); +} + +/* + * ========================================================================== + * I/O pipeline definition + * ========================================================================== + */ +static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { + NULL, + zio_issue_async, + zio_read_bp_init, + zio_write_bp_init, + zio_checksum_generate, + zio_gang_assemble, + zio_gang_issue, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_ready, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_checksum_verify, + zio_done +}; diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c new file mode 100644 index 000000000..bf7fe733f --- /dev/null +++ b/module/zfs/zio_checksum.c @@ -0,0 +1,206 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +/* + * Checksum vectors. + * + * In the SPA, everything is checksummed. We support checksum vectors + * for three distinct reasons: + * + * 1. Different kinds of data need different levels of protection. + * For SPA metadata, we always want a very strong checksum. + * For user data, we let users make the trade-off between speed + * and checksum strength. + * + * 2. Cryptographic hash and MAC algorithms are an area of active research. + * It is likely that in future hash functions will be at least as strong + * as current best-of-breed, and may be substantially faster as well. + * We want the ability to take advantage of these new hashes as soon as + * they become available. + * + * 3. If someone develops hardware that can compute a strong hash quickly, + * we want the ability to take advantage of that hardware. + * + * Of course, we don't want a checksum upgrade to invalidate existing + * data, so we store the checksum *function* in five bits of the DVA. + * This gives us room for up to 32 different checksum functions. + * + * When writing a block, we always checksum it with the latest-and-greatest + * checksum function of the appropriate strength. When reading a block, + * we compare the expected checksum against the actual checksum, which we + * compute via the checksum function specified in the DVA encoding. + */ + +/*ARGSUSED*/ +static void +zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + {{NULL, NULL}, 0, 0, "inherit"}, + {{NULL, NULL}, 0, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"}, + {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"}, +}; + +uint8_t +zio_checksum_select(uint8_t child, uint8_t parent) +{ + ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (ZIO_CHECKSUM_ON_VALUE); + + return (child); +} + +/* + * Set the external verifier for a gang block based on <vdev, offset, txg>, + * a tuple which is guaranteed to be unique for the life of the pool. + */ +static void +zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) +{ + dva_t *dva = BP_IDENTITY(bp); + uint64_t txg = bp->blk_birth; + + ASSERT(BP_IS_GANG(bp)); + + ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); +} + +/* + * Set the external verifier for a label block based on its offset. + * The vdev is implicit, and the txg is unknowable at pool open time -- + * hence the logic in vdev_uberblock_load() to find the most recent copy. + */ +static void +zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) +{ + ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); +} + +/* + * Generate the checksum. + */ +void +zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + void *data, uint64_t size) +{ + blkptr_t *bp = zio->io_bp; + uint64_t offset = zio->io_offset; + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t zbt_cksum; + + ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(ci->ci_func[0] != NULL); + + if (ci->ci_zbt) { + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&zbt->zbt_cksum, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&zbt->zbt_cksum, offset); + else + bp->blk_cksum = zbt->zbt_cksum; + zbt->zbt_magic = ZBT_MAGIC; + ci->ci_func[0](data, size, &zbt_cksum); + zbt->zbt_cksum = zbt_cksum; + } else { + ci->ci_func[0](data, size, &bp->blk_cksum); + } +} + +int +zio_checksum_error(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int byteswap; + void *data = zio->io_data; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t actual_cksum, expected_cksum, verifier; + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) + return (EINVAL); + + if (ci->ci_zbt) { + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&verifier, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&verifier, offset); + else + verifier = bp->blk_cksum; + + byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)); + + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + + expected_cksum = zbt->zbt_cksum; + zbt->zbt_cksum = verifier; + ci->ci_func[byteswap](data, size, &actual_cksum); + zbt->zbt_cksum = expected_cksum; + + if (byteswap) + byteswap_uint64_array(&expected_cksum, + sizeof (zio_cksum_t)); + } else { + ASSERT(!BP_IS_GANG(bp)); + byteswap = BP_SHOULD_BYTESWAP(bp); + expected_cksum = bp->blk_cksum; + ci->ci_func[byteswap](data, size, &actual_cksum); + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (ECKSUM); + + if (zio_injection_enabled && !zio->io_error) + return (zio_handle_fault_injection(zio, ECKSUM)); + + return (0); +} diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c new file mode 100644 index 000000000..c563be4eb --- /dev/null +++ b/module/zfs/zio_compress.c @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/compress.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> + +/* + * Compression vectors. + */ + +zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + {NULL, NULL, 0, "inherit"}, + {NULL, NULL, 0, "on"}, + {NULL, NULL, 0, "uncompressed"}, + {lzjb_compress, lzjb_decompress, 0, "lzjb"}, + {NULL, NULL, 0, "empty"}, + {gzip_compress, gzip_decompress, 1, "gzip-1"}, + {gzip_compress, gzip_decompress, 2, "gzip-2"}, + {gzip_compress, gzip_decompress, 3, "gzip-3"}, + {gzip_compress, gzip_decompress, 4, "gzip-4"}, + {gzip_compress, gzip_decompress, 5, "gzip-5"}, + {gzip_compress, gzip_decompress, 6, "gzip-6"}, + {gzip_compress, gzip_decompress, 7, "gzip-7"}, + {gzip_compress, gzip_decompress, 8, "gzip-8"}, + {gzip_compress, gzip_decompress, 9, "gzip-9"}, +}; + +uint8_t +zio_compress_select(uint8_t child, uint8_t parent) +{ + ASSERT(child < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON); + + if (child == ZIO_COMPRESS_INHERIT) + return (parent); + + if (child == ZIO_COMPRESS_ON) + return (ZIO_COMPRESS_ON_VALUE); + + return (child); +} + +int +zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, + uint64_t *destsizep, uint64_t *destbufsizep) +{ + uint64_t *word, *word_end; + uint64_t ciosize, gapsize, destbufsize; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + char *dest; + uint_t allzero; + + ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + /* + * If the data is all zeroes, we don't even need to allocate + * a block for it. We indicate this by setting *destsizep = 0. + */ + allzero = 1; + word = src; + word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); + while (word < word_end) { + if (*word++ != 0) { + allzero = 0; + break; + } + } + if (allzero) { + *destp = NULL; + *destsizep = 0; + *destbufsizep = 0; + return (1); + } + + if (cpfunc == ZIO_COMPRESS_EMPTY) + return (0); + + /* Compress at least 12.5% */ + destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); + if (destbufsize == 0) + return (0); + dest = zio_buf_alloc(destbufsize); + + ciosize = ci->ci_compress(src, dest, (size_t)srcsize, + (size_t)destbufsize, ci->ci_level); + if (ciosize > destbufsize) { + zio_buf_free(dest, destbufsize); + return (0); + } + + /* Cool. We compressed at least as much as we were hoping to. */ + + /* For security, make sure we don't write random heap crap to disk */ + gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; + if (gapsize != 0) { + bzero(dest + ciosize, gapsize); + ciosize += gapsize; + } + + ASSERT3U(ciosize, <=, destbufsize); + ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); + *destp = dest; + *destsizep = ciosize; + *destbufsizep = destbufsize; + + return (1); +} + +int +zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize) +{ + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + + return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); +} diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c new file mode 100644 index 000000000..b3469fdd5 --- /dev/null +++ b/module/zfs/zio_inject.c @@ -0,0 +1,370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * ZFS fault injection + * + * To handle fault injection, we keep track of a series of zinject_record_t + * structures which describe which logical block(s) should be injected with a + * fault. These are kept in a global list. Each record corresponds to a given + * spa_t and maintains a special hold on the spa_t so that it cannot be deleted + * or exported while the injection record exists. + * + * Device level injection is done using the 'zi_guid' field. If this is set, it + * means that the error is destined for a particular device, not a piece of + * data. + * + * This is a rather poor data structure and algorithm, but we don't expect more + * than a few faults at any one time, so it should be sufficient for our needs. + */ + +#include <sys/arc.h> +#include <sys/zio_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> + +uint32_t zio_injection_enabled; + +typedef struct inject_handler { + int zi_id; + spa_t *zi_spa; + zinject_record_t zi_record; + list_node_t zi_link; +} inject_handler_t; + +static list_t inject_handlers; +static krwlock_t inject_lock; +static int inject_next_id = 1; + +/* + * Returns true if the given record matches the I/O in progress. + */ +static boolean_t +zio_match_handler(zbookmark_t *zb, uint64_t type, + zinject_record_t *record, int error) +{ + /* + * Check for a match against the MOS, which is based on type + */ + if (zb->zb_objset == 0 && record->zi_objset == 0 && + record->zi_object == 0) { + if (record->zi_type == DMU_OT_NONE || + type == record->zi_type) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + else + return (B_FALSE); + } + + /* + * Check for an exact match. + */ + if (zb->zb_objset == record->zi_objset && + zb->zb_object == record->zi_object && + zb->zb_level == record->zi_level && + zb->zb_blkid >= record->zi_start && + zb->zb_blkid <= record->zi_end && + error == record->zi_error) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + + return (B_FALSE); +} + +/* + * Determine if the I/O in question should return failure. Returns the errno + * to be returned to the caller. + */ +int +zio_handle_fault_injection(zio_t *zio, int error) +{ + int ret = 0; + inject_handler_t *handler; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + /* + * Currently, we only support fault injection on reads. + */ + if (zio->io_type != ZIO_TYPE_READ) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa) + continue; + + /* Ignore device errors */ + if (handler->zi_record.zi_guid != 0) + continue; + + /* If this handler matches, return EIO */ + if (zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + &handler->zi_record, error)) { + ret = error; + break; + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +/* + * Determine if the zio is part of a label update and has an injection + * handler associated with that portion of the label. Currently, we + * allow error injection in either the nvlist or the uberblock region of + * of the vdev label. + */ +int +zio_handle_label_injection(zio_t *zio, int error) +{ + inject_handler_t *handler; + vdev_t *vd = zio->io_vd; + uint64_t offset = zio->io_offset; + int label; + int ret = 0; + + if (offset + zio->io_size > VDEV_LABEL_START_SIZE && + offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + uint64_t start = handler->zi_record.zi_start; + uint64_t end = handler->zi_record.zi_end; + + /* Ignore device only faults */ + if (handler->zi_record.zi_start == 0) + continue; + + /* + * The injection region is the relative offsets within a + * vdev label. We must determine the label which is being + * updated and adjust our region accordingly. + */ + label = vdev_label_number(vd->vdev_psize, offset); + start = vdev_label_offset(vd->vdev_psize, label, start); + end = vdev_label_offset(vd->vdev_psize, label, end); + + if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && + (offset >= start && offset <= end)) { + ret = error; + break; + } + } + rw_exit(&inject_lock); + return (ret); +} + + +int +zio_handle_device_injection(vdev_t *vd, int error) +{ + inject_handler_t *handler; + int ret = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore label specific faults */ + if (handler->zi_record.zi_start != 0) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_error == error) { + /* + * For a failed open, pretend like the device + * has gone away. + */ + if (error == ENXIO) + vd->vdev_stat.vs_aux = + VDEV_AUX_OPEN_FAILED; + ret = error; + break; + } + if (handler->zi_record.zi_error == ENXIO) { + ret = EIO; + break; + } + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +/* + * Create a new handler for the given record. We add it to the list, adding + * a reference to the spa_t in the process. We increment zio_injection_enabled, + * which is the switch to trigger all fault injection. + */ +int +zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) +{ + inject_handler_t *handler; + int error; + spa_t *spa; + + /* + * If this is pool-wide metadata, make sure we unload the corresponding + * spa_t, so that the next attempt to load it will trigger the fault. + * We call spa_reset() to unload the pool appropriately. + */ + if (flags & ZINJECT_UNLOAD_SPA) + if ((error = spa_reset(name)) != 0) + return (error); + + if (!(flags & ZINJECT_NULL)) { + /* + * spa_inject_ref() will add an injection reference, which will + * prevent the pool from being removed from the namespace while + * still allowing it to be unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (ENOENT); + + handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + + rw_enter(&inject_lock, RW_WRITER); + + *id = handler->zi_id = inject_next_id++; + handler->zi_spa = spa; + handler->zi_record = *record; + list_insert_tail(&inject_handlers, handler); + atomic_add_32(&zio_injection_enabled, 1); + + rw_exit(&inject_lock); + } + + /* + * Flush the ARC, so that any attempts to read this data will end up + * going to the ZIO layer. Note that this is a little overkill, but + * we don't have the necessary ARC interfaces to do anything else, and + * fault injection isn't a performance critical path. + */ + if (flags & ZINJECT_FLUSH_ARC) + arc_flush(NULL); + + return (0); +} + +/* + * Returns the next record with an ID greater than that supplied to the + * function. Used to iterate over all handlers in the system. + */ +int +zio_inject_list_next(int *id, char *name, size_t buflen, + zinject_record_t *record) +{ + inject_handler_t *handler; + int ret; + + mutex_enter(&spa_namespace_lock); + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id > *id) + break; + + if (handler) { + *record = handler->zi_record; + *id = handler->zi_id; + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ret = 0; + } else { + ret = ENOENT; + } + + rw_exit(&inject_lock); + mutex_exit(&spa_namespace_lock); + + return (ret); +} + +/* + * Clear the fault handler with the given identifier, or return ENOENT if none + * exists. + */ +int +zio_clear_fault(int id) +{ + inject_handler_t *handler; + int ret; + + rw_enter(&inject_lock, RW_WRITER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id == id) + break; + + if (handler == NULL) { + ret = ENOENT; + } else { + list_remove(&inject_handlers, handler); + spa_inject_delref(handler->zi_spa); + kmem_free(handler, sizeof (inject_handler_t)); + atomic_add_32(&zio_injection_enabled, -1); + ret = 0; + } + + rw_exit(&inject_lock); + + return (ret); +} + +void +zio_inject_init(void) +{ + rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); + list_create(&inject_handlers, sizeof (inject_handler_t), + offsetof(inject_handler_t, zi_link)); +} + +void +zio_inject_fini(void) +{ + list_destroy(&inject_handlers); + rw_destroy(&inject_lock); +} diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c new file mode 100644 index 000000000..4e993060c --- /dev/null +++ b/module/zfs/zvol.c @@ -0,0 +1,1722 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol/dsk/<pool_name>/<dataset_name> + * /dev/zvol/rdsk/<pool_name>/<dataset_name> + * + * These links are created by the ZFS-specific devfsadm link generator. + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/modctl.h> +#include <sys/open.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_traverse.h> +#include <sys/dnode.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dkio.h> +#include <sys/efi_partition.h> +#include <sys/byteorder.h> +#include <sys/pathname.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/crc32.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ioctl.h> +#include <sys/mkdev.h> +#include <sys/zil.h> +#include <sys/refcount.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_rlock.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_impl.h> +#include <sys/zvol.h> +#include <sys/dumphdr.h> + +#include "zfs_namecheck.h" + +static void *zvol_state; + +#define ZVOL_DUMPSIZE "dumpsize" + +/* + * This lock protects the zvol_state structure from being modified + * while it's being used, e.g. an open that comes in before a create + * finishes. It also protects temporary opens of the dataset so that, + * e.g., an open doesn't get a spurious EBUSY. + */ +static kmutex_t zvol_state_lock; +static uint32_t zvol_minors; + +typedef struct zvol_extent { + list_node_t ze_node; + dva_t ze_dva; /* dva associated with this extent */ + uint64_t ze_nblks; /* number of blocks in extent */ +} zvol_extent_t; + +/* + * The in-core state of each volume. + */ +typedef struct zvol_state { + char zv_name[MAXPATHLEN]; /* pool/dd name */ + uint64_t zv_volsize; /* amount of space we advertise */ + uint64_t zv_volblocksize; /* volume block size */ + minor_t zv_minor; /* minor number */ + uint8_t zv_min_bs; /* minimum addressable block shift */ + uint8_t zv_flags; /* readonly; dumpified */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_mode; /* DS_MODE_* flags at open time */ + uint32_t zv_open_count[OTYPCNT]; /* open counts */ + uint32_t zv_total_opens; /* total open count */ + zilog_t *zv_zilog; /* ZIL handle */ + list_t zv_extents; /* List of extents for dump */ + uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ + znode_t zv_znode; /* for range locking */ +} zvol_state_t; + +/* + * zvol specific flags + */ +#define ZVOL_RDONLY 0x1 +#define ZVOL_DUMPIFIED 0x2 +#define ZVOL_EXCL 0x4 + +/* + * zvol maximum transfer in one DMU tx. + */ +int zvol_maxphys = DMU_MAX_ACCESS/2; + +extern int zfs_set_prop_nvlist(const char *, nvlist_t *); +static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); +static int zvol_dumpify(zvol_state_t *zv); +static int zvol_dump_fini(zvol_state_t *zv); +static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); + +static void +zvol_size_changed(zvol_state_t *zv, major_t maj) +{ + dev_t dev = makedevice(maj, zv->zv_minor); + + VERIFY(ddi_prop_update_int64(dev, zfs_dip, + "Size", zv->zv_volsize) == DDI_SUCCESS); + VERIFY(ddi_prop_update_int64(dev, zfs_dip, + "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); + + /* Notify specfs to invalidate the cached size */ + spec_size_invalidate(dev, VBLK); + spec_size_invalidate(dev, VCHR); +} + +int +zvol_check_volsize(uint64_t volsize, uint64_t blocksize) +{ + if (volsize == 0) + return (EINVAL); + + if (volsize % blocksize != 0) + return (EINVAL); + +#ifdef _ILP32 + if (volsize - 1 > SPEC_MAXOFFSET_T) + return (EOVERFLOW); +#endif + return (0); +} + +int +zvol_check_volblocksize(uint64_t volblocksize) +{ + if (volblocksize < SPA_MINBLOCKSIZE || + volblocksize > SPA_MAXBLOCKSIZE || + !ISP2(volblocksize)) + return (EDOM); + + return (0); +} + +static void +zvol_readonly_changed_cb(void *arg, uint64_t newval) +{ + zvol_state_t *zv = arg; + + if (newval) + zv->zv_flags |= ZVOL_RDONLY; + else + zv->zv_flags &= ~ZVOL_RDONLY; +} + +int +zvol_get_stats(objset_t *os, nvlist_t *nv) +{ + int error; + dmu_object_info_t doi; + uint64_t val; + + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); + if (error) + return (error); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); + + error = dmu_object_info(os, ZVOL_OBJ, &doi); + + if (error == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, + doi.doi_data_block_size); + } + + return (error); +} + +/* + * Find a free minor number. + */ +static minor_t +zvol_minor_alloc(void) +{ + minor_t minor; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) + if (ddi_get_soft_state(zvol_state, minor) == NULL) + return (minor); + + return (0); +} + +static zvol_state_t * +zvol_minor_lookup(const char *name) +{ + minor_t minor; + zvol_state_t *zv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + continue; + if (strcmp(zv->zv_name, name) == 0) + break; + } + + return (zv); +} + +/* extent mapping arg */ +struct maparg { + zvol_state_t *ma_zv; + uint64_t ma_blks; +}; + +/*ARGSUSED*/ +static int +zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) +{ + struct maparg *ma = arg; + zvol_extent_t *ze; + int bs = ma->ma_zv->zv_volblocksize; + + if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) + return (0); + + VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); + ma->ma_blks++; + + /* Abort immediately if we have encountered gang blocks */ + if (BP_IS_GANG(bp)) + return (EFRAGS); + + /* + * See if the block is at the end of the previous extent. + */ + ze = list_tail(&ma->ma_zv->zv_extents); + if (ze && + DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && + DVA_GET_OFFSET(BP_IDENTITY(bp)) == + DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { + ze->ze_nblks++; + return (0); + } + + dprintf_bp(bp, "%s", "next blkptr:"); + + /* start a new extent */ + ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); + ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ + ze->ze_nblks = 1; + list_insert_tail(&ma->ma_zv->zv_extents, ze); + return (0); +} + +static void +zvol_free_extents(zvol_state_t *zv) +{ + zvol_extent_t *ze; + + while (ze = list_head(&zv->zv_extents)) { + list_remove(&zv->zv_extents, ze); + kmem_free(ze, sizeof (zvol_extent_t)); + } +} + +static int +zvol_get_lbas(zvol_state_t *zv) +{ + struct maparg ma; + int err; + + ma.ma_zv = zv; + ma.ma_blks = 0; + zvol_free_extents(zv); + + err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); + if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { + zvol_free_extents(zv); + return (err ? err : EIO); + } + + return (0); +} + +/* ARGSUSED */ +void +zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + nvlist_t *nvprops = zct->zct_props; + int error; + uint64_t volblocksize, volsize; + + VERIFY(nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) + volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + + /* + * These properties must be removed from the list so the generic + * property setting step won't apply to them. + */ + VERIFY(nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + (void) nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); + ASSERT(error == 0); +} + +/* + * Replay a TX_WRITE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) +{ + objset_t *os = zv->zv_objset; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t off = lr->lr_offset; + uint64_t len = lr->lr_length; + dmu_tx_t *tx; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); + error = dmu_tx_assign(tx, zv->zv_txg_assign); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, len, data, tx); + dmu_tx_commit(tx); + } + + return (error); +} + +/* ARGSUSED */ +static int +zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +/* + * Callback vectors for replaying records. + * Only TX_WRITE is needed for zvol. + */ +zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { + zvol_replay_err, /* 0 no such transaction type */ + zvol_replay_err, /* TX_CREATE */ + zvol_replay_err, /* TX_MKDIR */ + zvol_replay_err, /* TX_MKXATTR */ + zvol_replay_err, /* TX_SYMLINK */ + zvol_replay_err, /* TX_REMOVE */ + zvol_replay_err, /* TX_RMDIR */ + zvol_replay_err, /* TX_LINK */ + zvol_replay_err, /* TX_RENAME */ + zvol_replay_write, /* TX_WRITE */ + zvol_replay_err, /* TX_TRUNCATE */ + zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL */ +}; + +/* + * Create a minor node (plus a whole lot more) for the specified volume. + */ +int +zvol_create_minor(const char *name, major_t maj) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t doi; + uint64_t volsize; + minor_t minor = 0; + struct pathname linkpath; + int ds_mode = DS_MODE_OWNER; + vnode_t *vp = NULL; + char *devpath; + size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1; + char chrbuf[30], blkbuf[30]; + int error; + + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(name)) != NULL) { + mutex_exit(&zvol_state_lock); + return (EEXIST); + } + + if (strchr(name, '@') != 0) + ds_mode |= DS_MODE_READONLY; + + error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); + + if (error) { + mutex_exit(&zvol_state_lock); + return (error); + } + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + + if (error) { + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (error); + } + + /* + * If there's an existing /dev/zvol symlink, try to use the + * same minor number we used last time. + */ + devpath = kmem_alloc(devpathlen, KM_SLEEP); + + (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name); + + error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); + + kmem_free(devpath, devpathlen); + + if (error == 0 && vp->v_type != VLNK) + error = EINVAL; + + if (error == 0) { + pn_alloc(&linkpath); + error = pn_getsymlink(vp, &linkpath, kcred); + if (error == 0) { + char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); + if (ms != NULL) { + ms += strlen(ZVOL_PSEUDO_DEV); + minor = stoi(&ms); + } + } + pn_free(&linkpath); + } + + if (vp != NULL) + VN_RELE(vp); + + /* + * If we found a minor but it's already in use, we must pick a new one. + */ + if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) + minor = 0; + + if (minor == 0) + minor = zvol_minor_alloc(); + + if (minor == 0) { + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (EAGAIN); + } + + (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, + (char *)name); + + (void) sprintf(chrbuf, "%uc,raw", minor); + + if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_soft_state_free(zvol_state, minor); + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (EAGAIN); + } + + (void) sprintf(blkbuf, "%uc", minor); + + if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_remove_minor_node(zfs_dip, chrbuf); + ddi_soft_state_free(zvol_state, minor); + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (EAGAIN); + } + + zv = ddi_get_soft_state(zvol_state, minor); + + (void) strcpy(zv->zv_name, name); + zv->zv_min_bs = DEV_BSHIFT; + zv->zv_minor = minor; + zv->zv_volsize = volsize; + zv->zv_objset = os; + zv->zv_mode = ds_mode; + zv->zv_zilog = zil_open(os, zvol_get_data); + mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + list_create(&zv->zv_extents, sizeof (zvol_extent_t), + offsetof(zvol_extent_t, ze_node)); + /* get and cache the blocksize */ + error = dmu_object_info(os, ZVOL_OBJ, &doi); + ASSERT(error == 0); + zv->zv_volblocksize = doi.doi_data_block_size; + + zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); + zvol_size_changed(zv, maj); + + /* XXX this should handle the possible i/o error */ + VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv) == 0); + + zvol_minors++; + + mutex_exit(&zvol_state_lock); + + return (0); +} + +/* + * Remove minor node for the specified volume. + */ +int +zvol_remove_minor(const char *name) +{ + zvol_state_t *zv; + char namebuf[30]; + + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(name)) == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (zv->zv_total_opens != 0) { + mutex_exit(&zvol_state_lock); + return (EBUSY); + } + + (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, namebuf); + + (void) sprintf(namebuf, "%uc", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, namebuf); + + VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv) == 0); + + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; + dmu_objset_close(zv->zv_objset); + zv->zv_objset = NULL; + avl_destroy(&zv->zv_znode.z_range_avl); + mutex_destroy(&zv->zv_znode.z_range_lock); + + ddi_soft_state_free(zvol_state, zv->zv_minor); + + zvol_minors--; + + mutex_exit(&zvol_state_lock); + + return (0); +} + +int +zvol_prealloc(zvol_state_t *zv) +{ + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + uint64_t refd, avail, usedobjs, availobjs; + uint64_t resid = zv->zv_volsize; + uint64_t off = 0; + + /* Check the space usage before attempting to allocate the space */ + dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); + if (avail < zv->zv_volsize) + return (ENOSPC); + + /* Free old extents if they exist */ + zvol_free_extents(zv); + + while (resid != 0) { + int error; + uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); + return (error); + } + dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); + dmu_tx_commit(tx); + off += bytes; + resid -= bytes; + } + txg_wait_synced(dmu_objset_pool(os), 0); + + return (0); +} + +int +zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) +{ + dmu_tx_t *tx; + int error; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, + &volsize, tx); + dmu_tx_commit(tx); + + if (error == 0) + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, volsize, DMU_OBJECT_END); + + /* + * If we are using a faked-up state (zv_minor == 0) then don't + * try to update the in-core zvol state. + */ + if (error == 0 && zv->zv_minor) { + zv->zv_volsize = volsize; + zvol_size_changed(zv, maj); + } + return (error); +} + +int +zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) +{ + zvol_state_t *zv; + int error; + dmu_object_info_t doi; + uint64_t old_volsize = 0ULL; + zvol_state_t state = { 0 }; + + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(name)) == NULL) { + /* + * If we are doing a "zfs clone -o volsize=", then the + * minor node won't exist yet. + */ + error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER, + &state.zv_objset); + if (error != 0) + goto out; + zv = &state; + } + old_volsize = zv->zv_volsize; + + if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || + (error = zvol_check_volsize(volsize, + doi.doi_data_block_size)) != 0) + goto out; + + if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { + error = EROFS; + goto out; + } + + error = zvol_update_volsize(zv, maj, volsize); + + /* + * Reinitialize the dump area to the new size. If we + * failed to resize the dump area then restore the it back to + * it's original size. + */ + if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) { + if ((error = zvol_dumpify(zv)) != 0 || + (error = dumpvp_resize()) != 0) { + (void) zvol_update_volsize(zv, maj, old_volsize); + error = zvol_dumpify(zv); + } + } + +out: + if (state.zv_objset) + dmu_objset_close(state.zv_objset); + + mutex_exit(&zvol_state_lock); + + return (error); +} + +int +zvol_set_volblocksize(const char *name, uint64_t volblocksize) +{ + zvol_state_t *zv; + dmu_tx_t *tx; + int error; + boolean_t needlock; + + /* + * The lock may already be held if we are being called from + * zvol_dump_init(). + */ + needlock = !MUTEX_HELD(&zvol_state_lock); + if (needlock) + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(name)) == NULL) { + if (needlock) + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { + if (needlock) + mutex_exit(&zvol_state_lock); + return (EROFS); + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, + volblocksize, 0, tx); + if (error == ENOTSUP) + error = EBUSY; + dmu_tx_commit(tx); + if (error == 0) + zv->zv_volblocksize = volblocksize; + } + + if (needlock) + mutex_exit(&zvol_state_lock); + + return (error); +} + +/*ARGSUSED*/ +int +zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) +{ + minor_t minor = getminor(*devp); + zvol_state_t *zv; + + if (minor == 0) /* This is the control device */ + return (0); + + mutex_enter(&zvol_state_lock); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + ASSERT(zv->zv_objset != NULL); + + if ((flag & FWRITE) && + (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) { + mutex_exit(&zvol_state_lock); + return (EROFS); + } + if (zv->zv_flags & ZVOL_EXCL) { + mutex_exit(&zvol_state_lock); + return (EBUSY); + } + if (flag & FEXCL) { + if (zv->zv_total_opens != 0) { + mutex_exit(&zvol_state_lock); + return (EBUSY); + } + zv->zv_flags |= ZVOL_EXCL; + } + + if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { + zv->zv_open_count[otyp]++; + zv->zv_total_opens++; + } + + mutex_exit(&zvol_state_lock); + + return (0); +} + +/*ARGSUSED*/ +int +zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + + if (minor == 0) /* This is the control device */ + return (0); + + mutex_enter(&zvol_state_lock); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_total_opens == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count[otyp] != 0); + ASSERT(zv->zv_total_opens != 0); + + /* + * You may get multiple opens, but only one close. + */ + zv->zv_open_count[otyp]--; + zv->zv_total_opens--; + + mutex_exit(&zvol_state_lock); + + return (0); +} + +static void +zvol_get_done(dmu_buf_t *db, void *vzgd) +{ + zgd_t *zgd = (zgd_t *)vzgd; + rl_t *rl = zgd->zgd_rl; + + dmu_buf_rele(db, vzgd); + zfs_range_unlock(rl); + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + kmem_free(zgd, sizeof (zgd_t)); +} + +/* + * Get data to generate a TX_WRITE intent log record. + */ +static int +zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + zvol_state_t *zv = arg; + objset_t *os = zv->zv_objset; + dmu_buf_t *db; + rl_t *rl; + zgd_t *zgd; + uint64_t boff; /* block starting offset */ + int dlen = lr->lr_length; /* length of user data */ + int error; + + ASSERT(zio); + ASSERT(dlen != 0); + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) /* immediate write */ + return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + + zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zv->zv_zilog; + zgd->zgd_bp = &lr->lr_blkptr; + + /* + * Lock the range of the block to ensure that when the data is + * written out and its checksum is being calculated that no other + * thread can change the block. + */ + boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); + rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, + RL_READER); + zgd->zgd_rl = rl; + + VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); + error = dmu_sync(zio, db, &lr->lr_blkptr, + lr->lr_common.lrc_txg, zvol_get_done, zgd); + if (error == 0) + zil_add_block(zv->zv_zilog, &lr->lr_blkptr); + /* + * If we get EINPROGRESS, then we need to wait for a + * write IO initiated by dmu_sync() to complete before + * we can release this dbuf. We will finish everything + * up in the zvol_get_done() callback. + */ + if (error == EINPROGRESS) + return (0); + dmu_buf_rele(db, zgd); + zfs_range_unlock(rl); + kmem_free(zgd, sizeof (zgd_t)); + return (error); +} + +/* + * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. + * + * We store data in the log buffers if it's small enough. + * Otherwise we will later flush the data out via dmu_sync(). + */ +ssize_t zvol_immediate_write_sz = 32768; + +static void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) +{ + uint32_t blocksize = zv->zv_volblocksize; + lr_write_t *lr; + + while (len) { + ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); + itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + + itx->itx_wr_state = + len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; + itx->itx_private = zv; + lr = (lr_write_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = nbytes; + lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); + BP_ZERO(&lr->lr_blkptr); + + (void) zil_itx_assign(zv->zv_zilog, itx, tx); + len -= nbytes; + off += nbytes; + } +} + +static int +zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, + boolean_t doread, boolean_t isdump) +{ + vdev_disk_t *dvd; + int c; + int numerrors = 0; + + for (c = 0; c < vd->vdev_children; c++) { + ASSERT(vd->vdev_ops == &vdev_mirror_ops); + int err = zvol_dumpio_vdev(vd->vdev_child[c], + addr, offset, size, doread, isdump); + if (err != 0) { + numerrors++; + } else if (doread) { + break; + } + } + + if (!vd->vdev_ops->vdev_op_leaf) + return (numerrors < vd->vdev_children ? 0 : EIO); + + if (doread && !vdev_readable(vd)) + return (EIO); + else if (!doread && !vdev_writeable(vd)) + return (EIO); + + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + offset += VDEV_LABEL_START_SIZE; + + if (ddi_in_panic() || isdump) { + ASSERT(!doread); + if (doread) + return (EIO); + return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), + lbtodb(size))); + } else { + return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, + doread ? B_READ : B_WRITE)); + } +} + +static int +zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, + boolean_t doread, boolean_t isdump) +{ + vdev_t *vd; + int error; + zvol_extent_t *ze; + spa_t *spa = dmu_objset_spa(zv->zv_objset); + + /* Must be sector aligned, and not stradle a block boundary. */ + if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) || + P2BOUNDARY(offset, size, zv->zv_volblocksize)) { + return (EINVAL); + } + ASSERT(size <= zv->zv_volblocksize); + + /* Locate the extent this belongs to */ + ze = list_head(&zv->zv_extents); + while (offset >= ze->ze_nblks * zv->zv_volblocksize) { + offset -= ze->ze_nblks * zv->zv_volblocksize; + ze = list_next(&zv->zv_extents, ze); + } + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); + offset += DVA_GET_OFFSET(&ze->ze_dva); + error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump); + spa_config_exit(spa, SCL_STATE, FTAG); + return (error); +} + +int +zvol_strategy(buf_t *bp) +{ + zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); + uint64_t off, volsize; + size_t resid; + char *addr; + objset_t *os; + rl_t *rl; + int error = 0; + boolean_t doread = bp->b_flags & B_READ; + boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED; + + if (zv == NULL) { + bioerror(bp, ENXIO); + biodone(bp); + return (0); + } + + if (getminor(bp->b_edev) == 0) { + bioerror(bp, EINVAL); + biodone(bp); + return (0); + } + + if (!(bp->b_flags & B_READ) && + (zv->zv_flags & ZVOL_RDONLY || + zv->zv_mode & DS_MODE_READONLY)) { + bioerror(bp, EROFS); + biodone(bp); + return (0); + } + + off = ldbtob(bp->b_blkno); + volsize = zv->zv_volsize; + + os = zv->zv_objset; + ASSERT(os != NULL); + + bp_mapin(bp); + addr = bp->b_un.b_addr; + resid = bp->b_bcount; + + if (resid > 0 && (off < 0 || off >= volsize)) { + bioerror(bp, EIO); + biodone(bp); + return (0); + } + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + */ + rl = zfs_range_lock(&zv->zv_znode, off, resid, + doread ? RL_READER : RL_WRITER); + + while (resid != 0 && off < volsize) { + size_t size = MIN(resid, zvol_maxphys); + if (is_dump) { + size = MIN(size, P2END(off, zv->zv_volblocksize) - off); + error = zvol_dumpio(zv, addr, off, size, + doread, B_FALSE); + } else if (doread) { + error = dmu_read(os, ZVOL_OBJ, off, size, addr); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size); + dmu_tx_commit(tx); + } + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; + break; + } + off += size; + addr += size; + resid -= size; + } + zfs_range_unlock(rl); + + if ((bp->b_resid = resid) == bp->b_bcount) + bioerror(bp, off > volsize ? EINVAL : error); + + if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump) + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + biodone(bp); + + return (0); +} + +/* + * Set the buffer count to the zvol maximum transfer. + * Using our own routine instead of the default minphys() + * means that for larger writes we write bigger buffers on X86 + * (128K instead of 56K) and flush the disk write cache less often + * (every zvol_maxphys - currently 1MB) instead of minphys (currently + * 56K on X86 and 128K on sparc). + */ +void +zvol_minphys(struct buf *bp) +{ + if (bp->b_bcount > zvol_maxphys) + bp->b_bcount = zvol_maxphys; +} + +int +zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + int error = 0; + uint64_t size; + uint64_t boff; + uint64_t resid; + + if (minor == 0) /* This is the control device */ + return (ENXIO); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + return (ENXIO); + + boff = ldbtob(blkno); + resid = ldbtob(nblocks); + + VERIFY3U(boff + resid, <=, zv->zv_volsize); + + while (resid) { + size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); + error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE); + if (error) + break; + boff += size; + addr += size; + resid -= size; + } + + return (error); +} + +/*ARGSUSED*/ +int +zvol_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + uint64_t volsize; + rl_t *rl; + int error = 0; + + if (minor == 0) /* This is the control device */ + return (ENXIO); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + return (ENXIO); + + volsize = zv->zv_volsize; + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) + return (EIO); + + if (zv->zv_flags & ZVOL_DUMPIFIED) { + error = physio(zvol_strategy, NULL, dev, B_READ, + zvol_minphys, uio); + return (error); + } + + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, + RL_READER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio->uio_loffset) + bytes = volsize - uio->uio_loffset; + + error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; + break; + } + } + zfs_range_unlock(rl); + return (error); +} + +/*ARGSUSED*/ +int +zvol_write(dev_t dev, uio_t *uio, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + uint64_t volsize; + rl_t *rl; + int error = 0; + + if (minor == 0) /* This is the control device */ + return (ENXIO); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + return (ENXIO); + + volsize = zv->zv_volsize; + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) + return (EIO); + + if (zv->zv_flags & ZVOL_DUMPIFIED) { + error = physio(zvol_strategy, NULL, dev, B_WRITE, + zvol_minphys, uio); + return (error); + } + + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, + RL_WRITER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio->uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes); + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_range_unlock(rl); + return (error); +} + +int +zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) +{ + struct uuid uuid = EFI_RESERVED; + efi_gpe_t gpe = { 0 }; + uint32_t crc; + dk_efi_t efi; + int length; + char *ptr; + + if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag)) + return (EFAULT); + ptr = (char *)(uintptr_t)efi.dki_data_64; + length = efi.dki_length; + /* + * Some clients may attempt to request a PMBR for the + * zvol. Currently this interface will return EINVAL to + * such requests. These requests could be supported by + * adding a check for lba == 0 and consing up an appropriate + * PMBR. + */ + if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0) + return (EINVAL); + + gpe.efi_gpe_StartingLBA = LE_64(34ULL); + gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1); + UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); + + if (efi.dki_lba == 1) { + efi_gpt_t gpt = { 0 }; + + gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); + gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); + gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); + gpt.efi_gpt_MyLBA = LE_64(1ULL); + gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); + gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1); + gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); + gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); + gpt.efi_gpt_SizeOfPartitionEntry = + LE_32(sizeof (efi_gpe_t)); + CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); + gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); + CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); + gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); + if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length), + flag)) + return (EFAULT); + ptr += sizeof (gpt); + length -= sizeof (gpt); + } + if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe), + length), flag)) + return (EFAULT); + return (0); +} + +/* + * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). + */ +/*ARGSUSED*/ +int +zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +{ + zvol_state_t *zv; + struct dk_cinfo dki; + struct dk_minfo dkm; + struct dk_callback *dkc; + int error = 0; + rl_t *rl; + + mutex_enter(&zvol_state_lock); + + zv = ddi_get_soft_state(zvol_state, getminor(dev)); + + if (zv == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + switch (cmd) { + + case DKIOCINFO: + bzero(&dki, sizeof (dki)); + (void) strcpy(dki.dki_cname, "zvol"); + (void) strcpy(dki.dki_dname, "zvol"); + dki.dki_ctype = DKC_UNKNOWN; + dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs); + mutex_exit(&zvol_state_lock); + if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) + error = EFAULT; + return (error); + + case DKIOCGMEDIAINFO: + bzero(&dkm, sizeof (dkm)); + dkm.dki_lbsize = 1U << zv->zv_min_bs; + dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; + dkm.dki_media_type = DK_UNKNOWN; + mutex_exit(&zvol_state_lock); + if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) + error = EFAULT; + return (error); + + case DKIOCGETEFI: + { + uint64_t vs = zv->zv_volsize; + uint8_t bs = zv->zv_min_bs; + + mutex_exit(&zvol_state_lock); + error = zvol_getefi((void *)arg, flag, vs, bs); + return (error); + } + + case DKIOCFLUSHWRITECACHE: + dkc = (struct dk_callback *)arg; + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { + (*dkc->dkc_callback)(dkc->dkc_cookie, error); + error = 0; + } + break; + + case DKIOCGGEOM: + case DKIOCGVTOC: + /* + * commands using these (like prtvtoc) expect ENOTSUP + * since we're emulating an EFI label + */ + error = ENOTSUP; + break; + + case DKIOCDUMPINIT: + rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + RL_WRITER); + error = zvol_dumpify(zv); + zfs_range_unlock(rl); + break; + + case DKIOCDUMPFINI: + rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + RL_WRITER); + error = zvol_dump_fini(zv); + zfs_range_unlock(rl); + break; + + default: + error = ENOTTY; + break; + + } + mutex_exit(&zvol_state_lock); + return (error); +} + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +void +zvol_init(void) +{ + VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0); + mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zvol_fini(void) +{ + mutex_destroy(&zvol_state_lock); + ddi_soft_state_fini(&zvol_state); +} + +static boolean_t +zvol_is_swap(zvol_state_t *zv) +{ + vnode_t *vp; + boolean_t ret = B_FALSE; + char *devpath; + size_t devpathlen; + int error; + + devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; + devpath = kmem_alloc(devpathlen, KM_SLEEP); + (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); + error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + kmem_free(devpath, devpathlen); + + ret = !error && IS_SWAPVP(common_specvp(vp)); + + if (vp != NULL) + VN_RELE(vp); + + return (ret); +} + +static int +zvol_dump_init(zvol_state_t *zv, boolean_t resize) +{ + dmu_tx_t *tx; + int error = 0; + objset_t *os = zv->zv_objset; + nvlist_t *nv = NULL; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + /* + * If we are resizing the dump device then we only need to + * update the refreservation to match the newly updated + * zvolsize. Otherwise, we save off the original state of the + * zvol so that we can restore them if the zvol is ever undumpified. + */ + if (resize) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &zv->zv_volsize, tx); + } else { + uint64_t checksum, compress, refresrv, vbs; + + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL); + + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, + &compress, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &refresrv, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, + &vbs, tx); + } + dmu_tx_commit(tx); + + /* Truncate the file */ + if (!error) + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, 0, DMU_OBJECT_END); + + if (error) + return (error); + + /* + * We only need update the zvol's property if we are initializing + * the dump area for the first time. + */ + if (!resize) { + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + ZIO_COMPRESS_OFF) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), + ZIO_CHECKSUM_OFF) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + SPA_MAXBLOCKSIZE) == 0); + + error = zfs_set_prop_nvlist(zv->zv_name, nv); + nvlist_free(nv); + + if (error) + return (error); + } + + /* Allocate the space for the dump */ + error = zvol_prealloc(zv); + return (error); +} + +static int +zvol_dumpify(zvol_state_t *zv) +{ + int error = 0; + uint64_t dumpsize = 0; + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + + if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) + return (EROFS); + + /* + * We do not support swap devices acting as dump devices. + */ + if (zvol_is_swap(zv)) + return (ENOTSUP); + + if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, + 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { + boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; + + if ((error = zvol_dump_init(zv, resize)) != 0) { + (void) zvol_dump_fini(zv); + return (error); + } + } + + /* + * Build up our lba mapping. + */ + error = zvol_get_lbas(zv); + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + (void) zvol_dump_fini(zv); + return (error); + } + + zv->zv_flags |= ZVOL_DUMPIFIED; + error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, + &zv->zv_volsize, tx); + dmu_tx_commit(tx); + + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +static int +zvol_dump_fini(zvol_state_t *zv) +{ + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + nvlist_t *nv; + int error = 0; + uint64_t checksum, compress, refresrv, vbs; + + /* + * Attempt to restore the zvol back to its pre-dumpified state. + * This is a best-effort attempt as it's possible that not all + * of these properties were initialized during the dumpify process + * (i.e. error during zvol_dump_init). + */ + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); + dmu_tx_commit(tx); + + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs); + (void) zfs_set_prop_nvlist(zv->zv_name, nv); + nvlist_free(nv); + + zvol_free_extents(zv); + zv->zv_flags &= ~ZVOL_DUMPIFIED; + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); + + return (0); +} |