diff options
Diffstat (limited to 'module/zfs')
29 files changed, 1020 insertions, 707 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 464fe9fdd..3a9598a92 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -152,6 +152,12 @@ typedef enum arc_reclaim_strategy { /* number of seconds before growing cache again */ static int arc_grow_retry = 60; +/* shift of arc_c for calculating both min and max arc_p */ +static int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +static int arc_shrink_shift = 5; + /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) @@ -172,6 +178,9 @@ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; int zfs_mdcomp_disable = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; /* * Note that buffers can be in one of 6 states: @@ -250,10 +259,14 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_data_size; + kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; @@ -299,10 +312,14 @@ static arc_stats_t arc_stats = { { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, { 
"l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, @@ -425,7 +442,7 @@ struct arc_buf_hdr { /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; - spa_t *b_spa; + uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -447,7 +464,7 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -476,6 +493,7 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) @@ -529,8 +547,9 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 4 /* num of writes */ -#define L2ARC_FEED_SECS 1 /* caching interval */ +#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -542,7 +561,10 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write 
during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals @@ -557,6 +579,7 @@ typedef struct l2arc_dev { uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; @@ -607,9 +630,8 @@ static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static uint64_t -buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { - uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; @@ -619,7 +641,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; - crc ^= (spav>>8) ^ birth; + crc ^= (spa>>8) ^ birth; return (crc); } @@ -635,7 +657,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -755,8 +777,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof 
(arc_buf_hdr_t), ARC_SPACE_HDRS); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); return (0); } @@ -768,6 +790,8 @@ buf_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_t)); rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + return (0); } @@ -784,8 +808,7 @@ hdr_dest(void *vbuf, void *unused) refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); - - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); + arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } /* ARGSUSED */ @@ -795,6 +818,7 @@ buf_dest(void *vbuf, void *unused) arc_buf_t *buf = vbuf; rw_destroy(&buf->b_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* @@ -1081,15 +1105,49 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } void -arc_space_consume(uint64_t space) +arc_space_consume(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, space); + break; + } + atomic_add_64(&arc_meta_used, space); atomic_add_64(&arc_size, space); } void -arc_space_return(uint64_t space) +arc_space_return(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, -space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, -space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, -space); + break; + } + ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; @@ -1126,7 +1184,7 
@@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa; + hdr->b_spa = spa_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1189,6 +1247,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); @@ -1236,11 +1295,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf->b_hdr, zio_buf_free, buf->b_data, size); - arc_space_return(size); + arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf->b_hdr, zio_data_buf_free, buf->b_data, size); + ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } } @@ -1440,7 +1500,7 @@ arc_buf_size(arc_buf_t *buf) * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; @@ -1566,7 +1626,7 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. 
*/ static void -arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; @@ -1635,61 +1695,63 @@ top: static void arc_adjust(void) { - int64_t top_sz, mru_over, arc_over, todelete; + int64_t adjustment, delta; + + /* + * Adjust MRU size + */ - top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + adjustment = MIN(arc_size - arc_c, + arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_METADATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } - mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + /* + * Adjust MFU size + */ - if (mru_over > 0) { - if (arc_mru_ghost->arcs_size > 0) { - todelete = MIN(arc_mru_ghost->arcs_size, mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); - } + adjustment = arc_size - arc_c; + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + 
adjustment -= delta; } - if ((arc_over = arc_size - arc_c) > 0) { - int64_t tbl_over; + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t delta = MIN(adjustment, + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } - if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_DATA); - arc_over = arc_size - arc_c; - } + /* + * Adjust ghost lists + */ - if (arc_over > 0 && - arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], - arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_METADATA); - } + adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { + delta = MIN(arc_mru_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mru_ghost, NULL, delta); + } - tbl_over = arc_size + arc_mru_ghost->arcs_size + - arc_mfu_ghost->arcs_size - arc_c * 2; + adjustment = + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; - if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { - todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); - } + if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { + delta = MIN(arc_mfu_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mfu_ghost, NULL, delta); } } @@ -1723,29 +1785,34 @@ arc_do_user_evicts(void) void arc_flush(spa_t *spa) { + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mru, 
guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - arc_evict_ghost(arc_mru_ghost, spa, -1); - arc_evict_ghost(arc_mfu_ghost, spa, -1); + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); @@ -1753,8 +1820,6 @@ arc_flush(spa_t *spa) ASSERT(spa || arc_eviction_list == NULL); } -int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ - void arc_shrink(void) { @@ -1953,6 +2018,7 @@ static void arc_adapt(int bytes, arc_state_t *state) { int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); if (state == arc_l2c_only) return; @@ -1970,12 +2036,15 @@ arc_adapt(int bytes, arc_state_t *state) mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); - arc_p = MIN(arc_c, arc_p + bytes * mult); + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { + uint64_t delta; + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); - arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); @@ -2073,10 +2142,11 @@ arc_get_data_buf(arc_buf_t *buf) if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } goto out; @@ -2093,21 +2163,22 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] > 0 && + state = (arc_mfu->arcs_lsize[type] >= size && arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] > 0 && + state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } ARCSTAT_BUMP(arcstat_recycle_miss); @@ -2309,7 +2380,7 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. 
*/ - found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || @@ -2456,9 +2527,10 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *buf; kmutex_t *hash_lock; zio_t *rzio; + uint64_t guid = spa_guid(spa); top: - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -2481,7 +2553,7 @@ top: acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, zio_flags); + spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2533,6 +2605,7 @@ top: arc_callback_t *acb; vdev_t *vd = NULL; daddr_t addr; + boolean_t devw = B_FALSE; if (hdr == NULL) { /* this block is not in the cache */ @@ -2611,6 +2684,7 @@ top: if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; /* * Lock out device removal. @@ -2630,7 +2704,7 @@ top: demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); - if (vd != NULL) { + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -2638,9 +2712,11 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. 
*/ if (hdr->b_l2hdr != NULL && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); @@ -2666,6 +2742,7 @@ top: ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); @@ -2685,6 +2762,14 @@ top: ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } rzio = zio_read(pio, spa, bp, buf->b_data, size, @@ -2710,9 +2795,10 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data) { arc_buf_hdr_t *hdr; kmutex_t *hash_mtx; + uint64_t guid = spa_guid(spa); int rc = 0; - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { arc_buf_t *buf = hdr->b_buf; @@ -2872,7 +2958,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; - spa_t *spa = hdr->b_spa; + uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; @@ -3156,12 +3242,13 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_hdr_t *ab; kmutex_t *hash_lock; zio_t *zio; + uint64_t guid = spa_guid(spa); /* * If this buffer is in the cache, release it, so it * can be re-used. 
*/ - ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); if (ab != NULL) { /* * The checksum of blocks to free is not always @@ -3385,6 +3472,15 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3623,8 +3719,70 @@ arc_fini(void) * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. + * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. */ +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. has no attached buffer. + * 3. is already cached on the L2ARC. + * 4. has an I/O in progress (it may be an incomplete read). + * 5. is flagged not eligible (zfs property). 
+ */ + if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + return (B_FALSE); + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(l2arc_dev_t *dev) +{ + uint64_t size; + + size = dev->l2ad_write; + + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + next = MAX(lbolt, MIN(lbolt + interval, began + interval)); + + return (next); +} + static void l2arc_hdr_stat_add(void) { @@ -3857,11 +4015,15 @@ l2arc_read_done(zio_t *zio) * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ - if (zio->io_waiter == NULL) - zio_nowait(zio_read(zio->io_parent, - cb->l2rcb_spa, &cb->l2rcb_bp, + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } } kmem_free(cb, sizeof (l2arc_read_callback_t)); @@ -4045,7 +4207,7 @@ top: * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. 
*/ -static void +static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *ab, *ab_prev, *head; @@ -4057,6 +4219,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); ASSERT(dev->l2ad_vdev != NULL); @@ -4110,20 +4273,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_spa != spa) { - mutex_exit(hash_lock); - continue; - } - - if (ab->b_l2hdr != NULL) { - /* - * Already in L2ARC. - */ - mutex_exit(hash_lock); - continue; - } - - if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } @@ -4134,12 +4284,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_buf == NULL) { - DTRACE_PROBE1(l2arc__buf__null, void *, ab); - mutex_exit(hash_lock); - continue; - } - if (pio == NULL) { /* * Insert a dummy header on the buflist so @@ -4206,11 +4350,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (pio == NULL) { ASSERT3U(write_sz, ==, 0); kmem_cache_free(hdr_cache, head); - return; + return (0); } ASSERT3U(write_sz, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz); spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); @@ -4226,7 +4371,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) dev->l2ad_first = B_FALSE; } + dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_sz); } /* @@ -4239,20 +4388,19 @@ l2arc_feed_thread(void) callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; - uint64_t size; + uint64_t size, wrote; + clock_t begin, next = lbolt; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while 
(l2arc_thread_exit == 0) { - /* - * Pause for l2arc_feed_secs seconds between writes. - */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - lbolt + (hz * l2arc_feed_secs)); + next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = lbolt + hz; /* * Quick check for L2ARC devices. @@ -4263,6 +4411,7 @@ l2arc_feed_thread(void) continue; } mutex_exit(&l2arc_dev_mtx); + begin = lbolt; /* * This selects the next l2arc device to write to, and in @@ -4291,9 +4440,7 @@ l2arc_feed_thread(void) ARCSTAT_BUMP(arcstat_l2_feeds); - size = dev->l2ad_write; - if (arc_warm == B_FALSE) - size += dev->l2ad_boost; + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -4303,7 +4450,12 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - l2arc_write_buffers(spa, dev, size); + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } @@ -4353,6 +4505,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; ASSERT3U(adddev->l2ad_write, >, 0); /* diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d04610317..113fa1f03 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -469,7 +469,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, @@ -665,7 +665,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -1341,7 +1341,7 @@ dbuf_clear(dmu_buf_impl_t *db) ASSERT(db->db.db_data != NULL); if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; @@ -1463,7 +1463,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t)); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else { int blocksize = @@ -1490,7 +1490,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t)); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1559,7 +1559,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof 
(dmu_buf_impl_t)); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void @@ -1980,7 +1980,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 7981e0682..c9e00d511 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1213,6 +1213,39 @@ dmu_objset_find_spa(spa_t *spa, const char *name, return (err); } +/* ARGSUSED */ +int +dmu_objset_prefetch(char *name, void *arg) +{ + dsl_dataset_t *ds; + + if (dsl_dataset_hold(name, FTAG, &ds)) + return (0); + + if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { + mutex_enter(&ds->ds_opening_lock); + if (!dsl_dataset_get_user_ptr(ds)) { + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + zb.zb_objset = ds->ds_object; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + + (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), + &ds->ds_phys->ds_bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + mutex_exit(&ds->ds_opening_lock); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + void dmu_objset_set_user(objset_t *os, void *user_ptr) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 857b9a343..6043df0aa 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -816,10 +816,11 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) /* currently allocated, want to be allocated */ dmu_tx_hold_bonus(tx, drro->drr_object); /* - * We may change blocksize, so need to - * hold_write + * We may change blocksize and delete old content, + * so need to hold_write and hold_free. */ dmu_tx_hold_write(tx, drro->drr_object, 0, 1); + dmu_tx_hold_free(tx, drro->drr_object, 0, DMU_OBJECT_END); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 8686ab983..538a141b0 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -302,7 +302,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); - arc_space_consume(sizeof (dnode_t)); + arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } @@ -337,7 +337,7 @@ dnode_destroy(dnode_t *dn) dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); - arc_space_return(sizeof (dnode_t)); + arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } void @@ -417,7 +417,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int i, old_nblkptr; + int i, nblkptr; dmu_buf_impl_t *db = NULL; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); @@ -447,6 +447,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dnode_free_range(dn, 0, -1ULL, tx); } + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + /* change blocksize */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (blocksize != dn->dn_datablksz && @@ -459,6 +461,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dnode_setdirty(dn, tx); 
dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + if (dn->dn_nblkptr != nblkptr) + dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; rw_exit(&dn->dn_struct_rwlock); if (db) dbuf_rele(db, FTAG); @@ -468,19 +472,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, /* change bonus size and type */ mutex_enter(&dn->dn_mtx); - old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* XXX - for now, we can't make nblkptr smaller */ - ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); - - /* fix up the bonus db_size if dn_nblkptr has changed */ - if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + /* fix up the bonus db_size */ + if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 779cfc96f..23dcb4c7b 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dbuf.h> #include <sys/dnode.h> @@ -532,18 +530,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) /* XXX shouldn't the phys already be zeroed? 
*/ bzero(dnp, DNODE_CORE_SIZE); dnp->dn_nlevels = 1; + dnp->dn_nblkptr = dn->dn_nblkptr; } - if (dn->dn_nblkptr > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_nblkptr - dnp->dn_nblkptr)); - } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; - dnp->dn_nblkptr = dn->dn_nblkptr; } ASSERT(dnp->dn_nlevels > 1 || @@ -603,6 +595,30 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) return; } + if (dn->dn_next_nblkptr[txgoff]) { + /* this should only happen on a realloc */ + ASSERT(dn->dn_allocated_txg == tx->tx_txg); + if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); +#ifdef ZFS_DEBUG + } else { + int i; + ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); + /* the blkptrs we are losing better be unallocated */ + for (i = dn->dn_next_nblkptr[txgoff]; + i < dnp->dn_nblkptr; i++) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); +#endif + } + mutex_enter(&dn->dn_mtx); + dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; + dn->dn_next_nblkptr[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index e488b2bdd..a68b12d33 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -2204,6 +2204,12 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dsl_dir_open(oldname, FTAG, &dd, &tail); if (err) return (err); + /* + * If there are more than 2 references there may be holds + * hanging around that haven't been cleared out yet. + */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + txg_wait_synced(dd->dd_pool, 0); if (tail == NULL) { int delta = strlen(newname) - strlen(oldname); @@ -3024,12 +3030,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; uint64_t *reservationp = arg2; uint64_t new_reservation = *reservationp; - int64_t delta; uint64_t unique; - if (new_reservation > INT64_MAX) - return (EOVERFLOW); - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) return (ENOTSUP); @@ -3046,15 +3048,18 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&ds->ds_lock); unique = dsl_dataset_unique(ds); - delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); mutex_exit(&ds->ds_lock); - if (delta > 0 && - delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (delta > 0 && ds->ds_quota > 0 && - new_reservation > ds->ds_quota) - return (ENOSPC); + if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, new_reservation) - + MAX(unique, ds->ds_reserved); + + if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + } return (0); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 48d87f97f..e5e18f428 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -1077,10 +1077,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) uint64_t *reservationp = arg2; uint64_t new_reservation = *reservationp; uint64_t used, avail; - int64_t delta; - - if (new_reservation > INT64_MAX) - return (EOVERFLOW); /* * If we are doing the preliminary check in open context, the @@ -1091,8 +1087,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - - MAX(used, dd->dd_phys->dd_reserved); mutex_exit(&dd->dd_lock); if (dd->dd_parent) { @@ -1102,11 +1096,17 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (delta > 0 && delta > avail) - return (ENOSPC); - if (delta > 0 && dd->dd_phys->dd_quota > 0 && - new_reservation > dd->dd_phys->dd_quota) - return (ENOSPC); + if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + + if (delta > avail) + return (ENOSPC); + if (dd->dd_phys->dd_quota > 0 && + new_reservation > dd->dd_phys->dd_quota) + return (ENOSPC); + } + return (0); } diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c index dbdfe8c75..783604101 100644 --- a/module/zfs/dsl_scrub.c +++ b/module/zfs/dsl_scrub.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -95,6 +95,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ESC_ZFS_RESILVER_START); dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, tx->tx_txg); + } else { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_SCRUB_START); } /* zero out the scrub stats in all vdev_stat_t's */ @@ -212,8 +215,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) */ vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - if (dp->dp_scrub_min_txg && *completep) - spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + if (*completep) + spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); spa_errlog_rotate(dp->dp_spa); /* diff --git a/module/zfs/include/sys/arc.h b/module/zfs/include/sys/arc.h index 749bf53e5..c402d3d58 100644 --- a/module/zfs/include/sys/arc.h +++ b/module/zfs/include/sys/arc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -68,8 +68,19 @@ typedef enum arc_buf_contents { #define ARC_CACHED (1 << 4) /* I/O was already in cache */ #define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -void arc_space_consume(uint64_t space); -void arc_space_return(uint64_t space); +/* + * The following breakdows of arc_size exist for kstat only. 
+ */ +typedef enum arc_space_type { + ARC_SPACE_DATA, + ARC_SPACE_HDRS, + ARC_SPACE_L2HDRS, + ARC_SPACE_OTHER, + ARC_SPACE_NUMTYPES +} arc_space_type_t; + +void arc_space_consume(uint64_t space, arc_space_type_t type); +void arc_space_return(uint64_t space, arc_space_type_t type); void *arc_data_buf_alloc(uint64_t space); void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h index 15df29a17..1d6572780 100644 --- a/module/zfs/include/sys/dmu_objset.h +++ b/module/zfs/include/sys/dmu_objset.h @@ -26,8 +26,6 @@ #ifndef _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/arc.h> #include <sys/txg.h> @@ -118,6 +116,7 @@ int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags); int dmu_objset_find_spa(spa_t *spa, const char *name, int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); +int dmu_objset_prefetch(char *name, void *arg); void dmu_objset_byteswap(void *buf, size_t size); int dmu_objset_evict_dbufs(objset_t *os); diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h index c79ff48a6..be9e56908 100644 --- a/module/zfs/include/sys/dnode.h +++ b/module/zfs/include/sys/dnode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -160,6 +160,7 @@ typedef struct dnode { uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; + uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; uint16_t dn_next_bonuslen[TXG_SIZE]; diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 519b1d0c0..029123dfe 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -532,6 +532,7 @@ extern void spa_boot_init(); extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index a5416525c..9192abcd7 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -182,7 +182,6 @@ typedef struct znode { vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_map_lock; /* page map lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index 21b0fbc6b..67adc3b4c 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -265,6 +265,13 @@ typedef int zio_pipe_stage_t(zio_t *zio); #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_link { + zio_t *zl_parent; + zio_t *zl_child; + list_node_t zl_parent_node; + list_node_t zl_child_node; +} zio_link_t; + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -275,14 +282,14 @@ struct zio { uint8_t io_priority; uint8_t io_reexecute; uint8_t io_async_root; + uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t io_bp_copy; - zio_t *io_parent; - zio_t *io_child; - zio_t *io_sibling_prev; - zio_t *io_sibling_next; + list_t io_parent_list; + list_t io_child_list; + zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; @@ -305,8 +312,6 @@ struct zio { avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; - zio_t *io_delegate_list; - zio_t *io_delegate_next; /* Internal pipeline state */ int io_flags; @@ -329,7 +334,7 @@ struct zio { uint64_t io_ena; }; -extern zio_t *zio_null(zio_t *pio, spa_t *spa, +extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, int flags); extern zio_t *zio_root(spa_t *spa, @@ -379,6 +384,11 
@@ extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern zio_t *zio_walk_parents(zio_t *cio); +extern zio_t *zio_walk_children(zio_t *pio); +extern zio_t *zio_unique_parent(zio_t *cio); +extern void zio_add_child(zio_t *pio, zio_t *cio); + extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index ef04b7c94..df1b7e125 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,10 @@ #include <sys/sunddi.h> #include <sys/spa_boot.h> +#ifdef _KERNEL +#include <sys/zone.h> +#endif /* _KERNEL */ + #include "zfs_prop.h" #include "zfs_comutil.h" @@ -110,38 +114,38 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { - uint64_t size = spa_get_space(spa); - uint64_t used = spa_get_alloc(spa); + uint64_t size; + uint64_t used; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - /* - * readonly properties - */ - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); - - cap = (size == 0) ? 
0 : (used * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + if (spa->spa_root_vdev != NULL) { + size = spa_get_space(spa); + used = spa_get_alloc(spa); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, + size - used, src); + + cap = (size == 0) ? 0 : (used * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - spa->spa_root_vdev->vdev_state, src); - - /* - * settable properties that are not stored in the pool property object. 
- */ - version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, @@ -412,16 +416,60 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) return (error); } +void +spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) +{ + char *cachefile; + spa_config_dirent_t *dp; + + if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &cachefile) != 0) + return; + + dp = kmem_alloc(sizeof (spa_config_dirent_t), + KM_SLEEP); + + if (cachefile[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(cachefile, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(cachefile); + + list_insert_head(&spa->spa_config_list, dp); + if (need_sync) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; + nvpair_t *elem; + boolean_t need_sync = B_FALSE; + zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { + if ((prop = zpool_name_to_prop( + nvpair_name(elem))) == ZPROP_INVAL) + return (EINVAL); + + if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + continue; + + need_sync = B_TRUE; + break; + } + + if (need_sync) + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); + else + return (0); } /* @@ -1178,9 +1226,17 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) VERIFY(nvlist_lookup_string(newconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); +#ifdef _KERNEL + myhostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in 
userland, so + * we can't use zone_get_hostid(). + */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); +#endif /* _KERNEL */ if (hostid != 0 && myhostid != 0 && - (unsigned long)hostid != myhostid) { + hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " @@ -2071,8 +2127,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - if (props) + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, CRED(), tx); + } dmu_tx_commit(tx); @@ -2090,10 +2148,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); - mutex_exit(&spa_namespace_lock); - spa->spa_minref = refcount_count(&spa->spa_refcount); + mutex_exit(&spa_namespace_lock); + return (0); } @@ -2176,6 +2234,9 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { @@ -2494,6 +2555,7 @@ spa_tryimport(nvlist_t *tryconfig) char *poolname; spa_t *spa; uint64_t state; + int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); @@ -2513,7 +2575,7 @@ spa_tryimport(nvlist_t *tryconfig) * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. 
*/ - (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -2532,7 +2594,7 @@ spa_tryimport(nvlist_t *tryconfig) * copy it out so that external consumers can tell which * pools are bootable. */ - if (spa->spa_bootfs) { + if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* @@ -3794,7 +3856,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) zpool_prop_t prop; const char *propname; zprop_type_t proptype; - spa_config_dirent_t *dp; mutex_enter(&spa->spa_props_lock); @@ -3827,23 +3888,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) case ZPOOL_PROP_CACHEFILE: /* - * 'cachefile' is a non-persistent property, but note - * an async request that the config cache needs to be - * udpated. + * 'cachefile' is also a non-persisitent property. */ - VERIFY(nvpair_value_string(elem, &strval) == 0); - - dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); - - if (strval[0] == '\0') - dp->scd_path = spa_strdup(spa_config_path); - else if (strcmp(strval, "none") == 0) - dp->scd_path = NULL; - else - dp->scd_path = spa_strdup(strval); - - list_insert_head(&spa->spa_config_list, dp); - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); break; default: /* diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 252869d69..1c47efd0c 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -36,6 +36,7 @@ #include <sys/sunddi.h> #ifdef _KERNEL #include <sys/kobj.h> +#include <sys/zone.h> #endif /* @@ -352,7 +353,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); +#ifdef _KERNEL + hostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so we can't use + * zone_get_hostid(). + */ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); +#endif /* _KERNEL */ if (hostid != 0) { VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid) == 0); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d9689e803..2554f96a9 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -816,23 +816,22 @@ typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; - zio_t *vps_root; - vdev_t *vps_vd; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; - vdev_t *vd = vps->vps_vd; + + ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { - zio_nowait(zio_write_phys(vps->vps_root, vd, + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); @@ -840,13 +839,11 @@ vdev_probe_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if 
(zio->io_type == ZIO_TYPE_NULL) { - ASSERT(zio->io_vd == NULL); - ASSERT(zio == vps->vps_root); + zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; @@ -860,6 +857,16 @@ vdev_probe_done(zio_t *zio) spa, vd, NULL, 0, 0); zio->io_error = ENXIO; } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((pio = zio_walk_parents(zio)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = ENXIO; + kmem_free(vps, sizeof (*vps)); } } @@ -870,45 +877,78 @@ vdev_probe_done(zio_t *zio) * but the first (which we leave alone in case it contains a VTOC). */ zio_t * -vdev_probe(vdev_t *vd, zio_t *pio) +vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps; - zio_t *zio; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + ASSERT(vd->vdev_ops->vdev_op_leaf); - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only transition - * from TRUE to FALSE when we have the SCL_ZIO lock as writer; - * otherwise they can only transition from FALSE to TRUE. - * This ensures that any zio looking at these values can - * assume that failures persist for the life of the I/O. - * That's important because when a device has intermittent - * connectivity problems, we want to ensure that they're - * ascribed to the device (ENXIO) and not the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both values - * so the probe can reevaluate from first principles. 
- */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_DONT_RETRY; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. 
+ */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } } - ASSERT(vd->vdev_ops->vdev_op_leaf); + if (zio != NULL) + zio_add_child(zio, pio); - zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + mutex_exit(&vd->vdev_probe_lock); - vps->vps_root = zio; - vps->vps_vd = vd; + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(zio, vd, + zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad)), VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), @@ -916,7 +956,11 @@ vdev_probe(vdev_t *vd, zio_t *pio) ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } - return (zio); + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); } /* diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 5a7b59f6e..9b3a9f5a2 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -203,23 +203,23 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) * Fill a previously allocated cache entry with data. */ static void -vdev_cache_fill(zio_t *zio) +vdev_cache_fill(zio_t *fio) { - vdev_t *vd = zio->io_vd; + vdev_t *vd = fio->io_vd; vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = zio->io_private; - zio_t *dio; + vdev_cache_entry_t *ve = fio->io_private; + zio_t *pio; - ASSERT(zio->io_size == VCBS); + ASSERT(fio->io_size == VCBS); /* * Add data to the cache. 
*/ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == zio); - ASSERT(ve->ve_offset == zio->io_offset); - ASSERT(ve->ve_data == zio->io_data); + ASSERT(ve->ve_fill_io == fio); + ASSERT(ve->ve_offset == fio->io_offset); + ASSERT(ve->ve_data == fio->io_data); ve->ve_fill_io = NULL; @@ -228,20 +228,13 @@ vdev_cache_fill(zio_t *zio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. */ - for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) - vdev_cache_hit(vc, ve, dio); + while ((pio = zio_walk_parents(fio)) != NULL) + vdev_cache_hit(vc, ve, pio); - if (zio->io_error || ve->ve_missed_update) + if (fio->io_error || ve->ve_missed_update) vdev_cache_evict(vc, ve); mutex_exit(&vc->vc_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = zio->io_error; - zio_execute(dio); - } } /* @@ -284,9 +277,8 @@ vdev_cache_read(zio_t *zio) } if ((fio = ve->ve_fill_io) != NULL) { - zio->io_delegate_next = fio->io_delegate_list; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_delegations); return (0); @@ -296,7 +288,6 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); - zio_execute(zio); VDCSTAT_BUMP(vdc_stat_hits); return (0); } @@ -313,8 +304,8 @@ vdev_cache_read(zio_t *zio) ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); zio_nowait(fio); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index f8f90196b..f61de3c4e 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -961,7 +961,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - zio_t *vio = zio_null(zio, spa, + zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 184da82ab..fff7e0842 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -180,11 +180,16 @@ vdev_mirror_scrub_done(zio_t *zio) mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { - zio_t *pio = zio->io_parent; - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); - mutex_exit(&pio->io_lock); + zio_t *pio; + + mutex_enter(&zio->io_lock); + while ((pio = zio_walk_parents(zio)) != NULL) { + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + mutex_exit(&zio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 137afdd42..4e3c20acd 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -149,20 +149,12 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { - zio_t *dio; - uint64_t offset = 0; + zio_t *pio; - while ((dio = aio->io_delegate_list) != NULL) { + while ((pio = zio_walk_parents(aio)) != NULL) if (aio->io_type == ZIO_TYPE_READ) - bcopy((char *)aio->io_data + offset, dio->io_data, - dio->io_size); - offset += dio->io_size; - aio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = aio->io_error; - zio_execute(dio); - } - ASSERT3U(offset, ==, aio->io_size); + bcopy((char *)aio->io_data + (pio->io_offset - + aio->io_offset), pio->io_data, pio->io_size); zio_buf_free(aio->io_data, aio->io_size); } @@ -173,8 +165,8 @@ vdev_queue_agg_io_done(zio_t *aio) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio; - avl_tree_t *tree; + zio_t *fio, *lio, *aio, *dio, *nio; + avl_tree_t *t; uint64_t size; int flags; @@ -186,7 +178,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) fio = lio = avl_first(&vq->vq_deadline_tree); - tree = fio->io_vdev_tree; + t = fio->io_vdev_tree; size = fio->io_size; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; @@ -198,55 +190,54 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) * of the I/O, such as whether it's a normal I/O or a * scrub/resilver, can be preserved in the aggregate. 
*/ - while ((dio = AVL_PREV(tree, fio)) != NULL && + while ((dio = AVL_PREV(t, fio)) != NULL && IS_ADJACENT(dio, fio) && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && size + dio->io_size <= zfs_vdev_aggregation_limit) { - dio->io_delegate_next = fio; fio = dio; size += dio->io_size; } - while ((dio = AVL_NEXT(tree, lio)) != NULL && + while ((dio = AVL_NEXT(t, lio)) != NULL && IS_ADJACENT(lio, dio) && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && size + dio->io_size <= zfs_vdev_aggregation_limit) { - lio->io_delegate_next = dio; lio = dio; size += dio->io_size; } } if (fio != lio) { - char *buf = zio_buf_alloc(size); - uint64_t offset = 0; - ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - buf, size, fio->io_type, ZIO_PRIORITY_NOW, + zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); - aio->io_delegate_list = fio; - - for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + /* We want to process lio, then stop */ + lio = AVL_NEXT(t, lio); + for (dio = fio; dio != lio; dio = nio) { ASSERT(dio->io_type == aio->io_type); - ASSERT(dio->io_vdev_tree == tree); + ASSERT(dio->io_vdev_tree == t); + if (dio->io_type == ZIO_TYPE_WRITE) - bcopy(dio->io_data, buf + offset, dio->io_size); - offset += dio->io_size; + bcopy(dio->io_data, (char *)aio->io_data + + (dio->io_offset - aio->io_offset), + dio->io_size); + nio = AVL_NEXT(t, dio); + + zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); + zio_execute(dio); } - ASSERT(offset == size); - avl_add(&vq->vq_pending_tree, aio); return (aio); } - ASSERT(fio->io_vdev_tree == tree); + ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); avl_add(&vq->vq_pending_tree, fio); diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 208fc3629..0da026100 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -19,12 
+19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS control directory (a.k.a. ".zfs") * @@ -275,8 +273,13 @@ static int zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) { - if (mode & VWRITE) - return (EACCES); + if (flags & V_ACE_MASK) { + if (mode & ACE_ALL_WRITE_PERMS) + return (EACCES); + } else { + if (mode & VWRITE) + return (EACCES); + } return (0); } @@ -411,6 +414,22 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, return (err); } +static int +zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + /* + * We only care about ACL_ENABLED so that libsec can + * display ACL correctly and not default to POSIX draft. + */ + if (cmd == _PC_ACL_ENABLED) { + *valp = _ACL_ACE_ENABLED; + return (0); + } + + return (fs_pathconf(vp, cmd, valp, cr, ct)); +} + static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, @@ -421,6 +440,7 @@ static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 49ee55265..d032648b5 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -90,7 +90,7 @@ typedef struct zfs_ioc_vec { boolean_t zvec_his_log; } zfs_ioc_vec_t; -static void clear_props(char *dataset, nvlist_t *props); +static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, nvlist_t *); @@ -1322,6 +1322,14 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); + if (zc->zc_cookie == 0) { + uint64_t cookie = 0; + int len = sizeof (zc->zc_name) - (p - zc->zc_name); + + while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) + dmu_objset_prefetch(p, NULL); + } + do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, @@ -1365,6 +1373,9 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) if (error) return (error == ENOENT ? ESRCH : error); + if (zc->zc_cookie == 0) + dmu_objset_find(zc->zc_name, dmu_objset_prefetch, + NULL, DS_FIND_SNAPSHOTS); /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. @@ -1606,7 +1617,7 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { - clear_props(zc->zc_name, origprops); + clear_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } dmu_objset_close(os); @@ -1640,11 +1651,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) nvlist_t *props; spa_t *spa; int error; + nvpair_t *elem; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) return (error); + /* + * If the only property is the configfile, then just do a spa_lookup() + * to handle the faulted case. 
+ */ + elem = nvlist_next_nvpair(props, NULL); + if (elem != NULL && strcmp(nvpair_name(elem), + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && + nvlist_next_nvpair(props, elem) == NULL) { + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_config_sync(spa, B_FALSE, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + if (spa != NULL) + return (0); + } + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); @@ -1665,20 +1695,27 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) int error; nvlist_t *nvp = NULL; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_prop_get(spa, &nvp); + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + /* + * If the pool is faulted, there may be properties we can still + * get (such as altroot and cachefile), so attempt to get them + * anyway. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) + error = spa_prop_get(spa, &nvp); + mutex_exit(&spa_namespace_lock); + } else { + error = spa_prop_get(spa, &nvp); + spa_close(spa, FTAG); + } if (error == 0 && zc->zc_nvlist_dst != NULL) error = put_nvlist(zc, nvp); else error = EFAULT; - spa_close(spa, FTAG); - - if (nvp) - nvlist_free(nvp); + nvlist_free(nvp); return (error); } @@ -2385,7 +2422,7 @@ zfs_ioc_rename(zfs_cmd_t *zc) } static void -clear_props(char *dataset, nvlist_t *props) +clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops) { zfs_cmd_t *zc; nvpair_t *prop; @@ -2396,6 +2433,9 @@ clear_props(char *dataset, nvlist_t *props) (void) strcpy(zc->zc_name, dataset); for (prop = nvlist_next_nvpair(props, NULL); prop; prop = nvlist_next_nvpair(props, prop)) { + if (newprops != NULL && + nvlist_exists(newprops, nvpair_name(prop))) + continue; (void) strcpy(zc->zc_value, nvpair_name(prop)); if (zfs_secpolicy_inherit(zc, CRED()) == 0) (void) zfs_ioc_inherit_prop(zc); @@ -2503,7 +2543,7 @@ 
zfs_ioc_recv(zfs_cmd_t *zc) * so that the properties are applied to the new data. */ if (props) { - clear_props(tofs, origprops); + clear_props(tofs, origprops, props); /* * XXX - Note, this is all-or-nothing; should be best-effort. */ @@ -2542,7 +2582,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) * On error, restore the original props. */ if (error && props) { - clear_props(tofs, props); + clear_props(tofs, props, NULL); (void) zfs_set_prop_nvlist(tofs, origprops); } out: diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c index f0a75b5fa..4de8d8a2d 100644 --- a/module/zfs/zfs_rlock.c +++ b/module/zfs/zfs_rlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains the code to implement file range locking in * ZFS, although there isn't much specific to ZFS (all that comes to mind @@ -431,6 +429,8 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new->r_zp = zp; new->r_off = off; + if (len + off < off) /* overflow */ + len = UINT64_MAX - off; new->r_len = len; new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_type = type; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index f62d3bfa0..e38abb28c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -348,56 +348,28 @@ zfs_unmap_page(page_t *pp, caddr_t addr) * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. 
*/ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int64_t start, off; - int len = nbytes; - int error = 0; + int64_t off; - start = uio->uio_loffset; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t woff = uio->uio_loffset; + uint64_t nbytes = MIN(PAGESIZE - off, len); - /* - * We don't want a new page to "appear" in the middle of - * the file update (because it may not get the write - * update data), so we grab a lock to block - * zfs_getpage(). - */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; - rw_exit(&zp->z_map_lock); va = zfs_map_page(pp, S_WRITE); - error = uiomove(va+off, bytes, UIO_WRITE, uio); - if (error == 0) { - dmu_write(zfsvfs->z_os, zp->z_id, - woff, bytes, va+off, tx); - } + (void) dmu_read(os, oid, start+off, nbytes, va+off); zfs_unmap_page(pp, va); page_unlock(pp); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, bytes, tx); - rw_exit(&zp->z_map_lock); } - len -= bytes; + len -= nbytes; off = 0; - if (error) - break; } - return (error); } /* @@ -733,18 +705,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
*/ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); - } + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx); tx_bytes -= uio->uio_resid; + if (tx_bytes && vn_has_cached_data(vp)) + update_pages(vp, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); /* * If we made no progress, we're done. If we made even @@ -3610,9 +3577,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; - rl_t *rl; u_offset_t off, koff; size_t len, klen; uint64_t filesz; @@ -3627,26 +3592,18 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, * a read-modify-write). */ if (off < filesz && zp->z_blksz > PAGESIZE) { - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - klen = zp->z_blksz; - koff = P2ALIGN(off, (u_offset_t)klen); - } + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; ASSERT(koff <= filesz); if (koff + klen > filesz) klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); } ASSERT3U(btop(len), ==, btopr(len)); -top: - rl = zfs_range_lock(zp, off, len, RL_WRITER); + /* * Can't push pages past end-of-file. 
*/ - filesz = zp->z_phys->zp_size; if (off >= filesz) { /* ignore all pages */ err = 0; @@ -3661,17 +3618,15 @@ top: pvn_write_done(trunc, flags); len = filesz - off; } - +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_bonus(tx, zp->z_id); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) { - zfs_range_unlock(rl); dmu_tx_wait(tx); dmu_tx_abort(tx); - err = 0; goto top; } dmu_tx_abort(tx); @@ -3689,12 +3644,11 @@ top: if (err == 0) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); dmu_tx_commit(tx); } out: - zfs_range_unlock(rl); pvn_write_done(pp, (err ? B_ERROR : 0) | flags); if (offp) *offp = off; @@ -3731,31 +3685,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, page_t *pp; size_t io_len; u_offset_t io_off; - uint64_t filesz; + uint_t blksz; + rl_t *rl; int error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if (len == 0) { + /* + * Align this request to the file block size in case we kluster. + * XXX - this can result in pretty aggresive locking, which can + * impact simultanious read/write access. One option might be + * to break up long requests (len == 0) into block-by-block + * operations to get narrower locking. + */ + blksz = zp->z_blksz; + if (ISP2(blksz)) + io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); + else + io_off = 0; + if (len > 0 && ISP2(blksz)) + io_len = P2ROUNDUP_TYPED(len + (io_off - off), blksz, size_t); + else + io_len = 0; + + if (io_len == 0) { /* - * Search the entire vp list for pages >= off. + * Search the entire vp list for pages >= io_off. 
*/ - error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, - flags, cr); + rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); + error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } + rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off > filesz) { + if (off > zp->z_phys->zp_size) { /* past end of file */ + zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (0); } - len = MIN(len, filesz - off); + len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off); - for (io_off = off; io_off < off + len; io_off += io_len) { + for (off = io_off; io_off < off + len; io_off += io_len) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); @@ -3778,6 +3751,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: + zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0) zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); ZFS_EXIT(zfsvfs); @@ -3894,7 +3868,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering). + * multiple pages at once (klustering) to fill up the supplied page + * list. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, @@ -3903,57 +3878,27 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; - caddr_t va; u_offset_t io_off, total; - uint64_t oid = zp->z_id; size_t io_len; - uint64_t filesz; int err; - /* - * If we are only asking for a single page don't bother klustering. 
- */ - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off >= filesz) - return (EFAULT); if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + /* + * We only have a single page, don't bother klustering + */ io_off = off; io_len = PAGESIZE; pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); } else { /* - * Try to fill a kluster of pages (a blocks worth). + * Try to find enough pages to fill the page list */ - size_t klen; - u_offset_t koff; - - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - /* - * It would be ideal to align our offset to the - * blocksize but doing so has resulted in some - * strange application crashes. For now, we - * leave the offset as is and only adjust the - * length if we are off the end of the file. - */ - koff = off; - klen = plsz; - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; - ASSERT3U(off, >=, koff); - ASSERT3U(off, <, koff + klen); pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, koff, klen, 0); + &io_len, off, plsz, 0); } if (pp == NULL) { /* - * Some other thread entered the page before us. - * Return to zfs_getpage to retry the lookup. + * The page already exists, nothing to do here. */ *pl = NULL; return (0); @@ -3964,9 +3909,11 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -3978,15 +3925,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, } cur_pp = cur_pp->p_next; } -out: + /* - * Fill in the page list array from the kluster. 
If - * there are too many pages in the kluster, return - * as many pages as possible starting from the desired - * offset `off'. + * Fill in the page list array from the kluster starting + * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); + ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } @@ -3994,10 +3940,10 @@ out: /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may - * also return page pointers from before or after the specified - * region (i.e. some region [off', off' + plsz]). These additional - * pages are only returned if they are already in the cache, or were - * created as part of a klustered read. + * also return page pointers from after the specified region + * (i.e. the region [off, off + plsz]). These additional pages are + * only returned if they are already in the cache, or were created as + * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. 
@@ -4026,9 +3972,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp, **pl0 = pl; - int need_unlock = 0, err = 0; - offset_t orig_off; + page_t **pl0 = pl; + int err = 0; + + /* we do our own caching, faultahead is unnecessary */ + if (pl == NULL) + return (0); + else if (len > plsz) + len = plsz; + else + len = P2ROUNDUP(len, PAGESIZE); + ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -4036,104 +3990,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, if (protp) *protp = PROT_ALL; - /* no faultahead (for now) */ - if (pl == NULL) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* can't fault past EOF */ - if (off >= zp->z_phys->zp_size) { - ZFS_EXIT(zfsvfs); - return (EFAULT); - } - orig_off = off; - - /* - * If we already own the lock, then we must be page faulting - * in the middle of a write to this file (i.e., we are writing - * to this file using data from a mapped region of the file). - */ - if (rw_owner(&zp->z_map_lock) != curthread) { - rw_enter(&zp->z_map_lock, RW_WRITER); - need_unlock = TRUE; - } - /* * Loop through the requested range [off, off + len] looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. */ while (len > 0) { - if (plsz < PAGESIZE) - break; - if (pp = page_lookup(vp, off, SE_SHARED)) { - *pl++ = pp; + if (*pl = page_lookup(vp, off, SE_SHARED)) + *(pl+1) = NULL; + else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) + goto out; + while (*pl) { + ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; - len -= PAGESIZE; - plsz -= PAGESIZE; - } else { - err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); - if (err) - goto out; - /* - * klustering may have changed our region - * to be block aligned. 
- */ - if (((pp = *pl) != 0) && (off != pp->p_offset)) { - int delta = off - pp->p_offset; - len += delta; - off -= delta; - addr -= delta; - } - while (*pl) { - pl++; - off += PAGESIZE; - addr += PAGESIZE; - plsz -= PAGESIZE; - if (len > PAGESIZE) - len -= PAGESIZE; - else - len = 0; + if (len > 0) { + ASSERT3U(len, >=, PAGESIZE); + len -= PAGESIZE; } + ASSERT3U(plsz, >=, PAGESIZE); + plsz -= PAGESIZE; + pl++; } } /* * Fill out the page array with any pages already in the cache. */ - while (plsz > 0) { - pp = page_lookup_nowait(vp, off, SE_SHARED); - if (pp == NULL) - break; - *pl++ = pp; - off += PAGESIZE; - plsz -= PAGESIZE; + while (plsz > 0 && + (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { + off += PAGESIZE; + plsz -= PAGESIZE; } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); out: - /* - * We can't grab the range lock for the page as reader which would - * stop truncation as this leads to deadlock. So we need to recheck - * the file size. - */ - if (orig_off >= zp->z_phys->zp_size) - err = EFAULT; if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); + } else { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; - if (need_unlock) - rw_exit(&zp->z_map_lock); - ZFS_EXIT(zfsvfs); return (err); } diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 9a7860380..74983cdc5 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -117,7 +117,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -142,7 +141,6 @@ zfs_znode_cache_destructor(void *buf, void *arg) vn_free(ZTOV(zp)); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); - rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); @@ -1375,15 +1373,12 @@ top: dmu_tx_commit(tx); - zfs_range_unlock(rl); - /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (vn_has_cached_data(vp)) { page_t *pp; uint64_t start = end & PAGEMASK; @@ -1401,7 +1396,8 @@ top: B_INVAL | B_TRUNC, NULL); ASSERT(error == 0); } - rw_exit(&zp->z_map_lock); + + zfs_range_unlock(rl); return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 62af799f5..a669ad64a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -69,6 +69,7 @@ char *zio_type_name[ZIO_TYPES] = { * ========================================================================== */ kmem_cache_t *zio_cache; +kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; @@ -92,8 +93,10 @@ zio_init(void) #ifdef _KERNEL data_alloc_arena = zio_alloc_arena; #endif - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); + zio_cache = kmem_cache_create("zio_cache", + sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_link_cache = kmem_cache_create("zio_link_cache", + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of @@ -164,6 +167,7 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); @@ -298,41 +302,102 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ +/* + * NOTE - Callers to zio_walk_parents() and zio_walk_children must + * continue calling these functions until they return NULL. + * Otherwise, the next caller will pick up the list walk in + * some indeterminate state. (Otherwise every caller would + * have to pass in a cookie to keep the state represented by + * io_walk_link, which gets annoying.) + */ +zio_t * +zio_walk_parents(zio_t *cio) +{ + zio_link_t *zl = cio->io_walk_link; + list_t *pl = &cio->io_parent_list; -static void -zio_add_child(zio_t *pio, zio_t *zio) + zl = (zl == NULL) ? 
list_head(pl) : list_next(pl, zl); + cio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_child == cio); + return (zl->zl_parent); +} + +zio_t * +zio_walk_children(zio_t *pio) +{ + zio_link_t *zl = pio->io_walk_link; + list_t *cl = &pio->io_child_list; + + zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); + pio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_parent == pio); + return (zl->zl_child); +} + +zio_t * +zio_unique_parent(zio_t *cio) +{ + zio_t *pio = zio_walk_parents(cio); + + VERIFY(zio_walk_parents(cio) == NULL); + return (pio); +} + +void +zio_add_child(zio_t *pio, zio_t *cio) { + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT(cio->io_child_type <= pio->io_child_type); + + zl->zl_parent = pio; + zl->zl_child = cio; + + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - if (zio->io_stage < ZIO_STAGE_READY) - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - if (zio->io_stage < ZIO_STAGE_DONE) - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; - zio->io_sibling_prev = NULL; - zio->io_sibling_next = pio->io_child; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_parent = pio; + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + list_insert_head(&cio->io_parent_list, zl); + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); } static void -zio_remove_child(zio_t *pio, zio_t *zio) +zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { - zio_t *next, *prev; - - ASSERT(zio->io_parent == pio); + ASSERT(zl->zl_parent == pio); + 
ASSERT(zl->zl_child == cio); + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; + + list_remove(&pio->io_child_list, zl); + list_remove(&cio->io_parent_list, zl); + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); + + kmem_cache_free(zio_link_cache, zl); } static boolean_t @@ -407,6 +472,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) @@ -441,17 +511,13 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); + if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { - /* - * Logical I/Os can have logical, gang, or vdev children. - * Gang I/Os can have gang or vdev children. - * Vdev I/Os can only have vdev children. - * The following ASSERT captures all of these constraints. 
- */ - ASSERT(zio->io_child_type <= pio->io_child_type); if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; zio_add_child(pio, zio); @@ -466,6 +532,8 @@ zio_destroy(zio_t *zio) spa_t *spa = zio->io_spa; uint8_t async_root = zio->io_async_root; + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); @@ -479,13 +547,13 @@ zio_destroy(zio_t *zio) } zio_t * -zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, - int flags) +zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, + void *private, int flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); @@ -494,7 +562,7 @@ zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) { - return (zio_null(NULL, spa, done, private, flags)); + return (zio_null(NULL, spa, NULL, done, private, flags)); } zio_t * @@ -573,12 +641,12 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ASSERT(!BP_IS_HOLE(bp)); if (bp->blk_fill == BLK_FILL_ALREADY_FREED) - return (zio_null(pio, spa, NULL, NULL, flags)); + return (zio_null(pio, spa, NULL, NULL, NULL, flags)); if (txg == spa->spa_syncing_txg && spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, flags)); + return (zio_null(pio, spa, NULL, NULL, NULL, flags)); } zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), @@ -629,7 +697,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio->io_cmd = cmd; } else { - zio = zio_null(pio, spa, NULL, NULL, flags); + zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) 
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, @@ -1020,11 +1088,12 @@ zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); - if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (zio->io_child_type == ZIO_CHILD_LOGICAL && + zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. - * Attach it to the pool's global async root zio so that - * spa_unload() has a way of waiting for async I/O to finish. + * Track how many outstanding I/Os of this type exist so + * that spa_unload() knows when they are all done. */ spa_t *spa = zio->io_spa; zio->io_async_root = B_TRUE; @@ -1045,13 +1114,18 @@ zio_nowait(zio_t *zio) static void zio_reexecute(zio_t *pio) { - zio_t *zio, *zio_next; + zio_t *cio, *cio_next; + + ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_error = 0; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; @@ -1071,18 +1145,18 @@ zio_reexecute(zio_t *pio) /* * As we reexecute pio's children, new children could be created. - * New children go to the head of the io_child list, however, + * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that - * the remainder of the io_child list, from 'zio_next' onward, - * cannot be affected by any side effects of reexecuting 'zio'. + * the remainder of pio's io_child_list, from 'cio_next' onward, + * cannot be affected by any side effects of reexecuting 'cio'. 
*/ - for (zio = pio->io_child; zio != NULL; zio = zio_next) { - zio_next = zio->io_sibling_next; + for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); - zio_reexecute(zio); + zio_reexecute(cio); } /* @@ -1111,7 +1185,7 @@ zio_suspend(spa_t *spa, zio_t *zio) if (zio != NULL) { ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio->io_parent == NULL); + ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } @@ -1122,7 +1196,7 @@ zio_suspend(spa_t *spa, zio_t *zio) void zio_resume(spa_t *spa) { - zio_t *pio, *zio; + zio_t *pio, *cio, *cio_next; /* * Reexecute all previously suspended i/o. 
@@ -1137,10 +1211,11 @@ zio_resume(spa_t *spa) if (pio == NULL) return; - while ((zio = pio->io_child) != NULL) { - zio_remove_child(pio, zio); - zio->io_parent = NULL; - zio_reexecute(zio); + for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { + zio_link_t *zl = pio->io_walk_link; + cio_next = zio_walk_children(pio); + zio_remove_child(pio, cio, zl); + zio_reexecute(cio); } ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); @@ -1352,9 +1427,10 @@ zio_gang_tree_assemble_done(zio_t *zio) zio_t *lio = zio->io_logical; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; + zio_t *pio = zio_unique_parent(zio); - ASSERT(zio->io_parent == lio); - ASSERT(zio->io_child == NULL); + ASSERT(pio == lio); + ASSERT(zio_walk_children(zio) == NULL); if (zio->io_error) return; @@ -1445,7 +1521,7 @@ zio_gang_issue(zio_t *zio) static void zio_write_gang_member_ready(zio_t *zio) { - zio_t *pio = zio->io_parent; + zio_t *pio = zio_unique_parent(zio); zio_t *lio = zio->io_logical; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; @@ -1690,72 +1766,6 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) * Read and write to physical devices * ========================================================================== */ - -static void -zio_vdev_io_probe_done(zio_t *zio) -{ - zio_t *dio; - vdev_t *vd = zio->io_private; - - mutex_enter(&vd->vdev_probe_lock); - ASSERT(vd->vdev_probe_zio == zio); - vd->vdev_probe_zio = NULL; - mutex_exit(&vd->vdev_probe_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - if (!vdev_accessible(vd, dio)) - dio->io_error = ENXIO; - zio_execute(dio); - } -} - -/* - * Probe the device to determine whether I/O failure is specific to this - * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). 
- */ -static int -zio_vdev_io_probe(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - zio_t *pio = NULL; - boolean_t created_pio = B_FALSE; - - /* - * Don't probe the probe. - */ - if (zio->io_flags & ZIO_FLAG_PROBE) - return (ZIO_PIPELINE_CONTINUE); - - /* - * To prevent 'probe storms' when a device fails, we create - * just one probe i/o at a time. All zios that want to probe - * this vdev will join the probe zio's io_delegate_list. - */ - mutex_enter(&vd->vdev_probe_lock); - - if ((pio = vd->vdev_probe_zio) == NULL) { - vd->vdev_probe_zio = pio = zio_root(zio->io_spa, - zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); - created_pio = B_TRUE; - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); - } - - zio->io_delegate_next = pio->io_delegate_list; - pio->io_delegate_list = zio; - - mutex_exit(&vd->vdev_probe_lock); - - if (created_pio) { - zio_nowait(vdev_probe(vd, pio)); - zio_nowait(pio); - } - - return (ZIO_PIPELINE_STOP); -} - static int zio_vdev_io_start(zio_t *zio) { @@ -1811,7 +1821,6 @@ zio_vdev_io_start(zio_t *zio) zio->io_txg != 0 && /* not a delegated i/o */ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_delegate_list == NULL); zio_vdev_io_bypass(zio); return (ZIO_PIPELINE_CONTINUE); } @@ -1820,7 +1829,7 @@ zio_vdev_io_start(zio_t *zio) (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return (ZIO_PIPELINE_STOP); + return (ZIO_PIPELINE_CONTINUE); if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); @@ -1872,7 +1881,7 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); if (unexpected_error) - return (zio_vdev_io_probe(zio)); + VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } @@ -2068,7 +2077,7 @@ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; - zio_t *pio = zio->io_parent; + zio_t *pio, *pio_next; if (zio->io_ready) { if 
(BP_IS_GANG(bp) && @@ -2088,8 +2097,22 @@ zio_ready(zio_t *zio) if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (pio != NULL) + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_READY] = 1; + pio = zio_walk_parents(zio); + mutex_exit(&zio->io_lock); + + /* + * As we notify zio's parents, new parents could be added. + * New parents go to the head of zio's io_parent_list, however, + * so we will (correctly) not notify them. The remainder of zio's + * io_parent_list, from 'pio_next' onward, cannot change because + * all parents must wait for us to be done before they can be done. + */ + for (; pio != NULL; pio = pio_next) { + pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); + } return (ZIO_PIPELINE_CONTINUE); } @@ -2098,11 +2121,11 @@ static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_t *pio = zio->io_parent; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; + zio_t *pio, *pio_next; /* * If our children haven't all completed, @@ -2122,7 +2145,7 @@ zio_done(zio_t *zio) ASSERT(bp->blk_pad[1] == 0); ASSERT(bp->blk_pad[2] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || - (pio != NULL && bp == pio->io_bp)); + (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); @@ -2217,7 +2240,11 @@ zio_done(zio_t *zio) zio_gang_tree_free(&zio->io_gang_tree); - if (pio != NULL) { + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. 
Don't propagate errors @@ -2243,20 +2270,28 @@ zio_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } - ASSERT(zio->io_child == NULL); + ASSERT(zio_walk_children(zio) == NULL); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + /* + * It is the responsibility of the done callback to ensure that this + * particular zio is no longer discoverable for adoption, and as + * such, cannot acquire any new parents. + */ if (zio->io_done) zio->io_done(zio); zio_gang_tree_free(&zio->io_gang_tree); - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); - if (pio != NULL) { - zio_remove_child(pio, zio); + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 0206dad9e..d3be8fb50 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1324,6 +1324,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) break; } zfs_range_unlock(rl); + if (!zil_disable) + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); return (error); } |