diff options
author | Brian Behlendorf <[email protected]> | 2009-08-18 12:10:47 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2009-08-18 12:10:47 -0700 |
commit | 8aab887b7c10fa4b5d57f9f9d7bef3517df50d9a (patch) | |
tree | 2e50785db393c816cd342644db9fa29220179558 /module/zfs | |
parent | 7069d048a0b7439575e5d00f85ce28838a3fc731 (diff) | |
parent | 5a81224e6f24d7db8f7fb8a61e7fdd6f79c9d1d5 (diff) |
Merge commit 'refs/top-bases/feature-pthreads' into feature-pthreads
Diffstat (limited to 'module/zfs')
46 files changed, 2553 insertions, 711 deletions
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 785c7c621..d86468202 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -87,6 +87,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "scrub work queue" }, { zap_byteswap, TRUE, "ZFS user/group used" }, { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, }; int @@ -195,7 +196,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, ASSERT(length <= DMU_MAX_ACCESS); - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; @@ -212,6 +213,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); return (EIO); } nblks = 1; @@ -234,9 +236,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } /* initiate async i/o */ if (read) { - rw_exit(&dn->dn_struct_rwlock); (void) dbuf_read(db, zio, dbuf_flags); - rw_enter(&dn->dn_struct_rwlock, RW_READER); } dbp[i] = &db->db; } @@ -376,56 +376,51 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +/* + * Get the next "chunk" of file data to free. We traverse the file from + * the end so that the file gets shorter over time (if we crashes in the + * middle, this will leave us in a better state). We find allocated file + * data by simply searching the allocated level 1 indirects. 
+ */ static int -get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) { - uint64_t len = *offset - limit; - uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; - uint64_t subchunk = + uint64_t len = *start - limit; + uint64_t blkcnt = 0; + uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - ASSERT(limit <= *offset); + ASSERT(limit <= *start); - if (len <= chunk_len) { - *offset = limit; + if (len <= iblkrange * maxblks) { + *start = limit; return (0); } + ASSERT(ISP2(iblkrange)); - ASSERT(ISP2(subchunk)); - - while (*offset > limit) { - uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); - uint64_t delta; + while (*start > limit && blkcnt < maxblks) { int err; - /* skip over allocated data */ + /* find next allocated L1 indirect */ err = dnode_next_offset(dn, - DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) - return (err); + DNODE_FIND_BACKWARDS, start, 2, 1, 0); - ASSERT3U(*offset, <=, initial_offset); - *offset = P2ALIGN(*offset, subchunk); - delta = initial_offset - *offset; - if (delta >= chunk_len) { - *offset += delta - chunk_len; + /* if there are no more, then we are done */ + if (err == ESRCH) { + *start = limit; return (0); - } - chunk_len -= delta; - - /* skip over unallocated data */ - err = dnode_next_offset(dn, - DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) + } else if (err) { return (err); + } + blkcnt += 1; - if (*offset < limit) - *offset = limit; - ASSERT3U(*offset, <, initial_offset); + /* reset offset to end of "next" block back */ + *start = P2ALIGN(*start, iblkrange); + if (*start <= limit) + *start = limit; + else + *start -= 1; } return (0); } @@ -548,7 +543,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, { 
dnode_t *dn; dmu_buf_t **dbp; - int numbufs, i, err; + int numbufs, err; err = dnode_hold(os->os, object, FTAG, &dn); if (err) @@ -559,7 +554,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ - if (dn->dn_datablkshift == 0) { + if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); @@ -568,6 +563,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; /* * NB: we could do this block-at-a-time, but it's nice @@ -803,9 +799,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); - if (err) - break; - offset += tocpy; size -= tocpy; } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index e962c4b88..5a9d25b77 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -679,7 +679,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, } int -dmu_objset_destroy(const char *name) +dmu_objset_destroy(const char *name, boolean_t defer) { objset_t *os; int error; @@ -696,7 +696,7 @@ dmu_objset_destroy(const char *name) dsl_dataset_t *ds = os->os->os_dsl_dataset; zil_destroy(dmu_objset_zil(os), B_FALSE); - error = dsl_dataset_destroy(ds, os); + error = dsl_dataset_destroy(ds, os, defer); /* * dsl_dataset_destroy() closes the ds. 
*/ @@ -1130,7 +1130,7 @@ dmu_objset_userspace_upgrade(objset_t *os) */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_t *tx; dmu_buf_t *db; int objerr; @@ -1140,6 +1140,7 @@ dmu_objset_userspace_upgrade(objset_t *os) objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr) continue; + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 9ca3999dd..ce59aac50 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -393,6 +393,7 @@ recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; struct recvbeginsyncarg *rbsa = arg2; int err; + struct dsl_ds_destroyarg dsda = {0}; /* must be a head ds */ if (ds->ds_phys->ds_next_snap_obj != 0) @@ -402,7 +403,8 @@ recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dir_is_clone(ds->ds_dir)) return (EINVAL); - err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + dsda.ds = ds; + err = dsl_dataset_destroy_check(&dsda, rbsa->tag, tx); if (err) return (err); @@ -427,13 +429,16 @@ recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dir_t *dd = ds->ds_dir; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; + struct dsl_ds_destroyarg dsda = {0}; /* * NB: caller must provide an extra hold on the dsl_dir_t, so it * won't go away when dsl_dataset_destroy_sync() closes the * dataset. 
*/ - dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + dsda.ds = ds; + dsl_dataset_destroy_sync(&dsda, rbsa->tag, cr, tx); + ASSERT3P(dsda.rm_origin, ==, NULL); dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); @@ -483,7 +488,7 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -513,27 +518,13 @@ recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_spa, tx, cr, "dataset = %lld", dsobj); } -/* ARGSUSED */ -static void -recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_object); -} - /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) + boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) { int err = 0; boolean_t byteswap; @@ -582,36 +573,8 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, /* * Process the begin in syncing context. 
*/ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { - /* offline incremental receive */ - err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); - if (err) - return (err); - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - rbsa.fromguid) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); - } - rbsa.force = B_FALSE; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_offline_incremental_sync, ds, &rbsa, 1); - if (err) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (err); - } - drc->drc_logical_ds = drc->drc_real_ds = ds; - } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { - /* online incremental receive */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* incremental receive */ /* tmp clone name is: tofs/%tosnap" */ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), @@ -622,11 +585,18 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, if (err) return (err); + /* must not have an incremental recv already in progress */ + if (!mutex_tryenter(&ds->ds_recvlock)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EBUSY); + } + rbsa.force = force; err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_incremental_check, - recv_online_incremental_sync, ds, &rbsa, 5); + recv_incremental_sync, ds, &rbsa, 5); if (err) { + mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); return (err); } @@ -931,26 +901,6 @@ restore_free(struct restorearg *ra, objset_t *os, return (err); } -void -dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) -{ - if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { - /* - * online incremental or new fs: destroy the fs (which - * may be a clone) that we created - */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - if (drc->drc_real_ds != 
drc->drc_logical_ds) - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } else { - /* - * offline incremental: rollback to most recent snapshot. - */ - (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); - dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); - } -} - /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -1078,11 +1028,17 @@ out: if (ra.err != 0) { /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. + * destroy what we created, so we don't leave it in the + * inconsistent restoring state. */ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - dmu_recv_abort_cleanup(drc); + + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + if (drc->drc_real_ds != drc->drc_logical_ds) { + mutex_exit(&drc->drc_logical_ds->ds_recvlock); + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } } kmem_free(ra.buf, ra.bufsize); @@ -1149,7 +1105,9 @@ dmu_recv_end(dmu_recv_cookie_t *drc) dsl_dataset_rele(ds, dmu_recv_tag); } /* dsl_dataset_destroy() will disown the ds */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + mutex_exit(&drc->drc_logical_ds->ds_recvlock); if (err) return (err); } @@ -1163,7 +1121,8 @@ dmu_recv_end(dmu_recv_cookie_t *drc) if (err) { if (drc->drc_newfs) { ASSERT(ds == drc->drc_real_ds); - (void) dsl_dataset_destroy(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(ds, dmu_recv_tag, + B_FALSE); return (err); } else { (void) dsl_dataset_rollback(ds, DMU_OST_NONE); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 9c148bfd5..629e97f49 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -699,8 +699,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) } err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add, - &txh->txh_space_towrite, &txh->txh_space_tooverwrite, - txh->txh_dnode->dn_datablkshift); + 
&txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index cf49b97f1..d82e72a14 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1260,6 +1260,22 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) dmu_tx_willuse_space(tx, space); } +/* + * This function scans a block at the indicated "level" looking for + * a hole or data (depending on 'flags'). If level > 0, then we are + * scanning an indirect block looking at its pointers. If level == 0, + * then we are looking at a block of dnodes. If we don't find what we + * are looking for in the block, we return ESRCH. Otherwise, return + * with *offset pointing to the beginning (if searching forwards) or + * end (if searching backwards) of the range covered by the block + * pointer we matched on (or dnode). + * + * The basic search algorithm used below by dnode_next_offset() is to + * use this function to search up the block tree (widen the search) until + * we find something (i.e., we don't return ESRCH) and then search back + * down the tree (narrow the search) until we reach our original search + * level. 
+ */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) @@ -1330,6 +1346,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = ESRCH; } else { blkptr_t *bp = data; + uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); @@ -1339,18 +1356,25 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, else minfill++; - for (i = (*offset >> span) & ((1ULL << epbs) - 1); + *offset = *offset >> span; + for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && (hole || bp[i].blk_birth > txg)) break; - if (inc < 0 && *offset < (1ULL << span)) - *offset = 0; - else - *offset += (1ULL << span) * inc; + if (inc > 0 || *offset > 0) + *offset += inc; + } + *offset = *offset << span; + if (inc < 0) { + /* traversing backwards; position offset at the end */ + ASSERT3U(*offset, <=, start); + *offset = MIN(*offset + (1ULL << span) - 1, start); + } else if (*offset < start) { + *offset = start; } - if (i < 0 || i == epb) + if (i < 0 || i >= epb) error = ESRCH; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 0fe7eb583..edc36e72b 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -39,6 +39,7 @@ #include <sys/spa.h> #include <sys/zfs_znode.h> #include <sys/sunddi.h> +#include <sys/zvol.h> static char *dsl_reaper = "the grim reaper"; @@ -262,6 +263,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); @@ -359,6 +361,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + 
mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -377,6 +380,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, * just opened it. */ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); @@ -406,8 +410,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_rele(origin, FTAG); } } - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj, + &ds->ds_userrefs); + } } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { @@ -448,6 +459,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); @@ -845,6 +857,7 @@ struct destroyarg { dsl_sync_task_group_t *dstg; char *snapname; char *failed; + boolean_t defer; }; static int @@ -852,23 +865,30 @@ dsl_snapshot_destroy_one(char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; - char *cp; int err; - - (void) strcat(name, "@"); - (void) strcat(name, da->snapname); - err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, + char *dsname; + size_t buflen; + + /* alloc a buffer to hold name@snapname, plus the terminating NULL */ + buflen = strlen(name) + strlen(da->snapname) + 2; + dsname = kmem_alloc(buflen, KM_SLEEP); + (void) snprintf(dsname, buflen, "%s@%s", name, da->snapname); + err = 
dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT, da->dstg, &ds); - cp = strchr(name, '@'); - *cp = '\0'; + kmem_free(dsname, buflen); if (err == 0) { + struct dsl_ds_destroyarg *dsda; + dsl_dataset_make_exclusive(ds, da->dstg); if (ds->ds_user_ptr) { ds->ds_user_evict_func(ds, ds->ds_user_ptr); ds->ds_user_ptr = NULL; } + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); + dsda->ds = ds; + dsda->defer = da->defer; dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); + dsl_dataset_destroy_sync, dsda, da->dstg, 0); } else if (err == ENOENT) { err = 0; } else { @@ -882,7 +902,7 @@ dsl_snapshot_destroy_one(char *name, void *arg) */ #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname) +dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) { int err; struct destroyarg da; @@ -895,6 +915,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname) da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); da.snapname = snapname; da.failed = fsname; + da.defer = defer; err = dmu_objset_find(fsname, dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); @@ -904,7 +925,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + struct dsl_ds_destroyarg *dsda = dst->dst_arg1; + dsl_dataset_t *ds = dsda->ds; + /* * Return the file system name that triggered the error */ @@ -912,7 +935,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) dsl_dataset_name(ds, fsname); *strchr(fsname, '@') = '\0'; } + ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, da.dstg); + kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(da.dstg); @@ -920,18 +945,100 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +static boolean_t 
+dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) +{ + boolean_t might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds)) + might_destroy = B_TRUE; + mutex_exit(&ds->ds_lock); + + return (might_destroy); +} + +#ifdef _KERNEL +static int +dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name) +{ + int error; + objset_t *os; + + error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + if (error) + return (error); + + if (dmu_objset_type(os) == DMU_OST_ZVOL) + error = zvol_remove_minor(name); + dmu_objset_close(os); + + return (error); +} +#endif + +/* + * If we're removing a clone, and these three conditions are true: + * 1) the clone's origin has no other children + * 2) the clone's origin has no user references + * 3) the clone's origin has been marked for deferred destruction + * Then, prepare to remove the origin as part of this sync task group. + */ +static int +dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *origin = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(origin)) { + char *name; + int namelen; + int error; + + namelen = dsl_dataset_namelen(origin) + 1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(origin, name); +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + kmem_free(name, namelen); + return (error); + } + error = dsl_dataset_zvol_cleanup(origin, name); + if (error) { + kmem_free(name, namelen); + return (error); + } +#endif + error = dsl_dataset_own(name, + DS_MODE_READONLY | DS_MODE_INCONSISTENT, + tag, &origin); + kmem_free(name, namelen); + if (error) + return (error); + dsda->rm_origin = origin; + dsl_dataset_make_exclusive(origin, tag); + } + + return (0); +} + /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. 
*/ int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; + struct dsl_ds_destroyarg dsda = {0}; + + dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ @@ -941,9 +1048,12 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) ds->ds_user_evict_func(ds, ds->ds_user_ptr); ds->ds_user_ptr = NULL; } + /* NOTE: defer is always B_FALSE for non-snapshots */ + dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, tag, 0); + &dsda, tag, 0); + ASSERT3P(dsda.rm_origin, ==, NULL); goto out; } @@ -1024,13 +1134,45 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) ds->ds_user_evict_func(ds, ds->ds_user_ptr); ds->ds_user_ptr = NULL; } - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); + + /* + * If we're removing a clone, we might also need to remove its + * origin. + */ + do { + dsda.need_prep = B_FALSE; + if (dsl_dir_is_clone(dd)) { + err = dsl_dataset_origin_rm_prep(&dsda, tag); + if (err) { + dsl_dir_close(dd, FTAG); + goto out; + } + } + + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, &dsda, tag, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, dd, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + + /* + * We could be racing against 'zfs release' or 'zfs destroy -d' + * on the origin snap, in which case we can get EBUSY if we + * needed to destroy the origin snap but were not ready to + * do so. 
+ */ + if (dsda.need_prep) { + ASSERT(err == EBUSY); + ASSERT(dsl_dir_is_clone(dd)); + ASSERT(dsda.rm_origin == NULL); + } + } while (dsda.need_prep); + + if (dsda.rm_origin != NULL) + dsl_dataset_disown(dsda.rm_origin, tag); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); @@ -1211,7 +1353,8 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * We can only roll back to emptyness if it is a ZPL objset. */ - if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) + if (*ost != DMU_OST_ZFS && + ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) return (EINVAL); /* @@ -1316,6 +1459,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } else { objset_impl_t *osi; + ASSERT(*ost != DMU_OST_ZVOL); ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); @@ -1385,18 +1529,63 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) cr, "dataset = %llu", ds->ds_object); } +static int +dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *ds_prev = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(ds_prev)) { + struct dsl_ds_destroyarg ndsda = {0}; + + /* + * If we're not prepared to remove the origin, don't remove + * the clone either. + */ + if (dsda->rm_origin == NULL) { + dsda->need_prep = B_TRUE; + return (EBUSY); + } + + ndsda.ds = ds_prev; + ndsda.is_origin_rm = B_TRUE; + return (dsl_dataset_destroy_check(&ndsda, tag, tx)); + } + + /* + * If we're not going to remove the origin after all, + * undo the open context setup. 
+ */ + if (dsda->rm_origin != NULL) { + dsl_dataset_disown(dsda->rm_origin, tag); + dsda->rm_origin = NULL; + } + + return (0); +} + /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so noone else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - /* Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) - return (EEXIST); + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (dsda->defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + ASSERT(dsl_dataset_is_snapshot(ds)); + return (0); + } /* * Can't delete a head dataset if there are snapshots of it. @@ -1414,6 +1603,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); + if (dsl_dataset_is_snapshot(ds)) { + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0 && !dsda->releasing) + return (EBUSY); + + mutex_enter(&ds->ds_lock); + /* + * Can't delete a branch point. However, if we're destroying + * a clone and removing its origin due to it having a user + * hold count of 0 and having been marked for deferred destroy, + * it's OK for the origin to have a single clone. + */ + if (ds->ds_phys->ds_num_children > + (dsda->is_origin_rm ? 2 : 1)) { + mutex_exit(&ds->ds_lock); + return (EEXIST); + } + mutex_exit(&ds->ds_lock); + } else if (dsl_dir_is_clone(ds->ds_dir)) { + return (dsl_dataset_origin_check(dsda, arg2, tx)); + } + /* XXX we should do some i/o error checking... 
*/ return (0); } @@ -1461,7 +1675,8 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) void dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1471,11 +1686,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) uint64_t obj; ASSERT(ds->ds_owner); - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + if (dsda->defer) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; + } + } + /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; @@ -1521,6 +1745,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; + + /* + * If the clone's origin has no other clones, no + * user holds, and has been marked for deferred + * deletion, then we should have done the necessary + * destroy setup for it. 
+ */ + if (ds_prev->ds_phys->ds_num_children == 1 && + ds_prev->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds_prev)) { + ASSERT3P(dsda->rm_origin, !=, NULL); + } else { + ASSERT3P(dsda->rm_origin, ==, NULL); + } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; @@ -1733,10 +1971,32 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); + + if (dsda->rm_origin) { + /* + * Remove the origin of the clone we just destroyed. + */ + dsl_dataset_t *origin = ds->ds_prev; + struct dsl_ds_destroyarg ndsda = {0}; + + ASSERT3P(origin, ==, dsda->rm_origin); + if (origin->ds_user_ptr) { + origin->ds_user_evict_func(origin, origin->ds_user_ptr); + origin->ds_user_ptr = NULL; + } + + dsl_dataset_rele(origin, tag); + ds->ds_prev = NULL; + + ndsda.ds = origin; + dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); + } } static int @@ -1951,6 +2211,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + DS_IS_DEFER_DESTROY(ds) ? 
1 : 0); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -3019,7 +3282,7 @@ dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_quota = new_quota; - dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", @@ -3114,7 +3377,7 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refreservation", new_reservation, cr, tx); spa_history_internal_log(LOG_DS_REFRESERV, @@ -3138,3 +3401,421 @@ dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) dsl_dataset_rele(ds, FTAG); return (err); } + +static int +dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *htag = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error = 0; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + if (strlen(htag) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + /* tags must be unique */ + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj) { + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, + 8, 1, tx); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +static void +dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *htag = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + time_t now = gethrestime_sec(); + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj == 0) { + /* + * 
This is the first user hold for this dataset. Create + * the userrefs zap object. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + spa_history_internal_log(LOG_DS_USER_HOLD, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "<%s> dataset = %llu", + htag, ds->ds_object); +} + +struct dsl_ds_holdarg { + dsl_sync_task_group_t *dstg; + char *htag; + char *snapname; + boolean_t recursive; + char failed[MAXPATHLEN]; +}; + +static int +dsl_dataset_user_hold_one(char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + dsl_dataset_t *ds; + int error; + char *name; + size_t buflen; + + /* alloc a buffer to hold dsname@snapname plus terminating NULL */ + buflen = strlen(dsname) + strlen(ha->snapname) + 2; + name = kmem_alloc(buflen, KM_SLEEP); + (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, ha->dstg, &ds); + kmem_free(name, buflen); + if (error == 0) { + dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, ds, ha->htag, 0); + } else if (error == ENOENT && ha->recursive) { + error = 0; + } else { + (void) strcpy(ha->failed, dsname); + } + return (error); +} + +int +dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + 
error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_hold_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + + if (dst->dst_err) { + dsl_dataset_name(ds, ha->failed); + *strchr(ha->failed, '@') = '\0'; + } + dsl_dataset_rele(ds, ha->dstg); + } + + if (error) + (void) strcpy(dsname, ha->failed); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + return (error); +} + +struct dsl_ds_releasearg { + dsl_dataset_t *ds; + const char *htag; + boolean_t own; /* do we own or just hold ds? */ +}; + +static int +dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, + boolean_t *might_destroy) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t tmp; + int error; + + *might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) { + /* The tag can't possibly exist */ + mutex_exit(&ds->ds_lock); + return (ESRCH); + } + + /* Make sure the tag exists */ + error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); + if (error) { + mutex_exit(&ds->ds_lock); + if (error == ENOENT) + error = ESRCH; + return (error); + } + + if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) + *might_destroy = B_TRUE; + + mutex_exit(&ds->ds_lock); + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + boolean_t might_destroy; + int error; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); + if (error) + return (error); + + if 
(might_destroy) { + struct dsl_ds_destroyarg dsda = {0}; + + if (dmu_tx_is_syncing(tx)) { + /* + * If we're not prepared to remove the snapshot, + * we can't allow the release to happen right now. + */ + if (!ra->own) + return (EBUSY); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + } + dsda.ds = ds; + dsda.releasing = B_TRUE; + return (dsl_dataset_destroy_check(&dsda, tag, tx)); + } + + return (0); +} + +static void +dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t dsobj = ds->ds_object; + uint64_t refs; + + mutex_enter(&ds->ds_lock); + ds->ds_userrefs--; + refs = ds->ds_userrefs; + mutex_exit(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); + if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) { + struct dsl_ds_destroyarg dsda = {0}; + + ASSERT(ra->own); + dsda.ds = ds; + dsda.releasing = B_TRUE; + /* We already did the destroy_check */ + dsl_dataset_destroy_sync(&dsda, tag, cr, tx); + } + + spa_history_internal_log(LOG_DS_USER_RELEASE, + spa, tx, cr, "<%s> %lld dataset = %llu", + ra->htag, (longlong_t)refs, dsobj); +} + +static int +dsl_dataset_user_release_one(char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + struct dsl_ds_releasearg *ra; + dsl_dataset_t *ds; + int error; + void *dtag = ha->dstg; + char *name; + size_t buflen; + boolean_t own = B_FALSE; + boolean_t might_destroy; + + if (strlen(ha->htag) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ + buflen = strlen(dsname) + strlen(ha->snapname) + 2; + name = kmem_alloc(buflen, KM_SLEEP); + (void) snprintf(name, buflen, "%s@%s", dsname, 
ha->snapname); + error = dsl_dataset_hold(name, dtag, &ds); + kmem_free(name, buflen); + if (error == ENOENT && ha->recursive) + return (0); + (void) strcpy(ha->failed, dsname); + if (error) + return (error); + + ASSERT(dsl_dataset_is_snapshot(ds)); + + error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + + if (might_destroy) { +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + error = dsl_dataset_zvol_cleanup(ds, name); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } +#endif + if (!dsl_dataset_tryown(ds, + DS_MODE_READONLY | DS_MODE_INCONSISTENT, dtag)) { + dsl_dataset_rele(ds, dtag); + return (EBUSY); + } else { + own = B_TRUE; + dsl_dataset_make_exclusive(ds, dtag); + } + } + + ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); + ra->ds = ds; + ra->htag = ha->htag; + ra->own = own; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, ra, dtag, 0); + + return (0); +} + +int +dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_release_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_release_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = 
list_next(&ha->dstg->dstg_tasks, dst)) { + struct dsl_ds_releasearg *ra = dst->dst_arg1; + dsl_dataset_t *ds = ra->ds; + + if (dst->dst_err) + dsl_dataset_name(ds, ha->failed); + + if (ra->own) + dsl_dataset_disown(ds, ha->dstg); + else + dsl_dataset_rele(ds, ha->dstg); + + kmem_free(ra, sizeof (struct dsl_ds_releasearg)); + } + + if (error) + (void) strcpy(dsname, ha->failed); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + return (error); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, + za->za_first_integer)); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index da5d15787..5d76ff5f9 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -66,8 +66,6 @@ * The ZAP OBJ is referred to as the jump object. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dmu_tx.h> @@ -540,7 +538,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) dsl_pool_t *dp; void *cookie; int error; - char checkflag = ZFS_DELEG_LOCAL; + char checkflag; objset_t *mos; avl_tree_t permsets; perm_set_t *setnode; @@ -563,6 +561,16 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) return (EPERM); } + if (dsl_dataset_is_snapshot(ds)) { + /* + * Snapshots are treated as descendents only, + * local permissions do not apply. + */ + checkflag = ZFS_DELEG_DESCENDENT; + } else { + checkflag = ZFS_DELEG_LOCAL; + } + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index f19653d92..2f312ae34 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -96,7 +96,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, #endif if (dd == NULL) { dsl_dir_t *winner; - int err; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 664ccff45..bfc0fa87e 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -442,7 +442,7 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } void -dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, +dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c index 8a802b53a..03ebb90bb 100644 --- a/module/zfs/dsl_scrub.c +++ b/module/zfs/dsl_scrub.c @@ -1024,6 +1024,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, int dsl_pool_scrub_clean(dsl_pool_t *dp) { + spa_t *spa = dp->dp_spa; + /* * Purge all vdev caches. 
We do this here rather than in sync * context because this requires a writer lock on the spa_config @@ -1031,11 +1033,11 @@ dsl_pool_scrub_clean(dsl_pool_t *dp) * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ - spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); - dp->dp_spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(dp->dp_spa->spa_root_vdev); - dp->dp_spa->spa_scrub_reopen = B_FALSE; - spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + spa_vdev_state_enter(spa); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); } diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index 989b6ff24..b15da8391 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -117,6 +117,7 @@ typedef enum dmu_object_type { DMU_OT_SCRUB_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -174,8 +175,8 @@ int dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, objset_t *clone_parent, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_snapshots_destroy(char *fsname, char *snapname); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, boolean_t recursive); @@ -666,10 +667,9 @@ typedef struct dmu_recv_cookie { } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); + boolean_t force, objset_t *origin, dmu_recv_cookie_t 
*); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); int dmu_recv_end(dmu_recv_cookie_t *drc); -void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/module/zfs/include/sys/dmu_impl.h b/module/zfs/include/sys/dmu_impl.h index 96ce688e1..3868a5816 100644 --- a/module/zfs/include/sys/dmu_impl.h +++ b/module/zfs/include/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -211,10 +211,11 @@ extern "C" { * ds_lock * protects: * ds_user_ptr - * ds_user_evice_func + * ds_user_evict_func * ds_open_refcount * ds_snapname * ds_phys accounting + * ds_phys userrefs zapobj * ds_reserved * held from: * dsl_dataset_* diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h index 82cb6ad7d..052cb8dd9 100644 --- a/module/zfs/include/sys/dmu_objset.h +++ b/module/zfs/include/sys/dmu_objset.h @@ -117,7 +117,7 @@ void dmu_objset_close(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, objset_t *clone_parent, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); +int dmu_objset_destroy(const char *name, boolean_t defer); int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, boolean_t recursive); diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h index a1c2896e3..b51036d38 100644 --- a/module/zfs/include/sys/dsl_dataset.h +++ b/module/zfs/include/sys/dsl_dataset.h @@ -63,6 +63,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); #define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) /* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has 
been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) +#define DS_IS_DEFER_DESTROY(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) + +/* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. */ @@ -93,7 +101,8 @@ typedef struct dsl_dataset_phys { blkptr_t ds_bp; uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ + uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -111,6 +120,9 @@ typedef struct dsl_dataset { /* has internal locking: */ bplist_t ds_deadlist; + /* to protect against multiple concurrent incremental recv */ + kmutex_t ds_recvlock; + /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -122,6 +134,7 @@ typedef struct dsl_dataset { kmutex_t ds_lock; void *ds_user_ptr; dsl_dataset_evict_func_t *ds_user_evict_func; + uint64_t ds_userrefs; /* * ds_owner is protected by the ds_rwlock and the ds_lock @@ -143,6 +156,15 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; +struct dsl_ds_destroyarg { + dsl_dataset_t *ds; /* ds to destroy */ + dsl_dataset_t *rm_origin; /* also remove our origin? */ + boolean_t is_origin_rm; /* set if removing origin snap */ + boolean_t defer; /* destroy -d requested? */ + boolean_t releasing; /* destroying due to release? */ + boolean_t need_prep; /* do we need to retry due to EBUSY? 
*/ +}; + #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) @@ -167,8 +189,8 @@ uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); -int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); -int dsl_snapshots_destroy(char *fsname, char *snapname); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); +int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; @@ -178,6 +200,11 @@ int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); int dsl_dataset_promote(const char *name); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, void *p, dsl_dataset_evict_func_t func); diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h index b064c9228..a26a3f705 100644 --- a/module/zfs/include/sys/dsl_deleg.h +++ b/module/zfs/include/sys/dsl_deleg.h @@ -53,6 +53,8 @@ extern "C" { #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" /* * Note: the names of properties that are marked delegatable are also diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h index 26018a46d..5afaa1f0d 100644 --- a/module/zfs/include/sys/dsl_prop.h +++ 
b/module/zfs/include/sys/dsl_prop.h @@ -69,7 +69,7 @@ dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, int intsz, int numints, const void *buf); int dsl_props_set(const char *dsname, nvlist_t *nvl); -void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, +void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index c7ae4022e..0a4d55097 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -500,8 +500,9 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -void spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_internal_log(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ struct zbookmark; diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h index 12999ee9e..84da68488 100644 --- a/module/zfs/include/sys/spa_impl.h +++ b/module/zfs/include/sys/spa_impl.h @@ -105,6 +105,7 @@ struct spa { int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_load_verbatim; /* load the given config? 
*/ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h index 71b9b12d6..7e53f62d2 100644 --- a/module/zfs/include/sys/vdev.h +++ b/module/zfs/include/sys/vdev.h @@ -47,6 +47,7 @@ typedef enum vdev_dtl_type { extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *vd); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h index 8240b66ac..23780430d 100644 --- a/module/zfs/include/sys/vdev_impl.h +++ b/module/zfs/include/sys/vdev_impl.h @@ -127,6 +127,8 @@ struct vdev { space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ + int vdev_open_error; /* error on last open */ + kthread_t *vdev_open_thread; /* thread opening children */ /* * Top-level vdev state. diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h index de2053812..967174be4 100644 --- a/module/zfs/include/sys/zap.h +++ b/module/zfs/include/sys/zap.h @@ -182,8 +182,7 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, boolean_t *normalization_conflictp); int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, - int add, uint64_t *towrite, uint64_t *tooverwrite, - uint64_t dn_datablkshift); + int add, uint64_t *towrite, uint64_t *tooverwrite); /* * Create an attribute with the given name and value. 
diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h index f5e5aa7f4..3488962e2 100644 --- a/module/zfs/include/sys/zfs_acl.h +++ b/module/zfs/include/sys/zfs_acl.h @@ -203,6 +203,7 @@ void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h index 1e9f35155..3a3e6e711 100644 --- a/module/zfs/include/sys/zfs_ioctl.h +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -165,6 +165,7 @@ typedef struct zfs_cmd { dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; } zfs_cmd_t; typedef struct zfs_useracct { diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h index b8ed7b27f..28555232b 100644 --- a/module/zfs/include/sys/zfs_vfsops.h +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -73,7 +73,6 @@ struct zfsvfs { boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ - kmutex_t z_online_recv_lock; /* held while recv in progress */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index 69f4b50f5..5db5b8d51 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -77,6 +77,7 @@ extern "C" { #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be 
inherited */ #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ /* * Is ID ephemeral? @@ -200,6 +201,7 @@ typedef struct znode { uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ + zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ /* * These are dmu managed fields. diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index 5c51717c1..e47d8f468 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -143,6 +143,8 @@ enum zio_compress { #define ZIO_FLAG_GODFATHER 0x080000 #define ZIO_FLAG_TRYHARD 0x100000 +#define ZIO_FLAG_NODATA 0x200000 +#define ZIO_FLAG_OPTIONAL 0x400000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -161,7 +163,9 @@ enum zio_compress { ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_IO_RETRY | \ ZIO_FLAG_PROBE | \ - ZIO_FLAG_TRYHARD) + ZIO_FLAG_TRYHARD | \ + ZIO_FLAG_NODATA | \ + ZIO_FLAG_OPTIONAL) #define ZIO_FLAG_AGG_INHERIT \ (ZIO_FLAG_DONT_AGGREGATE | \ diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c index 710685dbc..4cef53f95 100644 --- a/module/zfs/rrwlock.c +++ b/module/zfs/rrwlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/refcount.h> #include <sys/rrwlock.h> @@ -118,7 +116,7 @@ rrn_find_and_remove(rrwlock_t *rrl) rrw_node_t *prev = NULL; if (refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); + return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { if (rn->rn_rrl == rrl) { @@ -159,6 +157,14 @@ static void rrw_enter_read(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + rrl->rr_anon_rcount.rc_count++; + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +#endif ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); @@ -208,19 +214,28 @@ void rrw_exit(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { + rrl->rr_anon_rcount.rc_count--; + if (rrl->rr_anon_rcount.rc_count == 0) + cv_broadcast(&rrl->rr_cv); + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__exitmiss); +#endif ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || !refcount_is_zero(&rrl->rr_linked_rcount) || rrl->rr_writer != NULL); if (rrl->rr_writer == NULL) { - if (rrn_find_and_remove(rrl)) { - if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - - } else { - if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - } + int64_t count; + if (rrn_find_and_remove(rrl)) + count = refcount_remove(&rrl->rr_linked_rcount, tag); + else + count = refcount_remove(&rrl->rr_anon_rcount, tag); + if (count == 0) + cv_broadcast(&rrl->rr_cv); } else { ASSERT(rrl->rr_writer == curthread); ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6a95b399b..d7ed23e6f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1574,9 +1574,12 @@ 
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. + * + * If spa_load_verbatim is true, trust the current + * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT) + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -2271,6 +2274,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); + spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); @@ -2404,6 +2408,7 @@ spa_import_rootpool(char *devpath, char *devid) spa = spa_add(pname, NULL); spa->spa_is_root = B_TRUE; + spa->spa_load_verbatim = B_TRUE; /* * Build up a vdev tree based on the boot device's label config. @@ -2459,6 +2464,7 @@ spa_import_rootpool(char *devpath, char *devid) VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); error = 0; + spa_history_log_version(spa, LOG_POOL_IMPORT); out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); @@ -2491,6 +2497,8 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); + spa->spa_load_verbatim = B_TRUE; + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); if (props != NULL) @@ -2499,6 +2507,7 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } @@ -2624,7 +2633,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) /* * Update the config cache to include the newly-imported pool. 
*/ - spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* @@ -2634,6 +2643,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } @@ -2991,7 +3001,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; - dmu_tx_t *tx; char *oldvdpath, *newvdpath; int newvd_isspare; int error; @@ -3147,17 +3156,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, - CRED(), "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? "replace" : "attach", newvdpath, - replacing ? "for" : "to", oldvdpath); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); @@ -3747,19 +3750,11 @@ spa_async_thread(spa_t *spa) * then log an internal history event. 
*/ if (space_update) { - dmu_tx_t *tx; - - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ONLINE, - spa, tx, CRED(), - "pool '%s' size: %llu(+%llu)", - spa_name(spa), spa_get_space(spa), - space_update); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_internal_log(LOG_POOL_VDEV_ONLINE, + spa, NULL, CRED(), + "pool '%s' size: %llu(+%llu)", + spa_name(spa), spa_get_space(spa), + space_update); } } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 7103e179b..b2063bba1 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -209,7 +209,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (rootdir == NULL) + if (rootdir == NULL || !(spa_mode_global & FWRITE)) return; /* @@ -394,23 +394,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) } /* - * For a pool that's not currently a booting rootpool, update all disk labels, - * generate a fresh config based on the current in-core state, and sync the - * global config cache. - */ -void -spa_config_update(spa_t *spa, int what) -{ - spa_config_update_common(spa, what, FALSE); -} - -/* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void -spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; @@ -447,9 +436,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) /* * Update the global config cache to reflect the new mosconfig. 
*/ - if (!isroot) + if (!spa->spa_is_root) spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 97d97d847..b77ac4208 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -390,13 +390,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (err); } -void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +static void +log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) { history_arg_t *hap; char *str; - va_list adx; /* * If this is part of creating a pool, not everything is @@ -408,9 +407,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - va_start(adx, fmt); (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); - va_end(adx); hap->ha_log_type = LOG_INTERNAL; hap->ha_history_str = str; @@ -425,3 +422,48 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, } /* spa_history_log_sync() will free hap and str */ } + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) 
+{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(event, spa, htx, cr, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_version(spa_t *spa, history_internal_events_t event) +{ +#ifdef _KERNEL + uint64_t current_vers = spa_version(spa); + + if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { + spa_history_internal_log(event, spa, NULL, CRED(), + "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", + (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); + } + cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", + event == LOG_POOL_IMPORT ? "imported" : + event == LOG_POOL_CREATE ? 
"created" : "accessed", + (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); +#endif +} diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index aea3f5625..8150ac937 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -310,8 +310,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { + int wlocks_held = 0; + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); @@ -331,6 +335,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } + ASSERT(wlocks_held <= locks); } void diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3fa677e05..bb5024f98 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -405,22 +405,26 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* - * Currently, we can only support 2 parity devices. + * Currently, we can only support 3 parity devices. */ - if (nparity == 0 || nparity > 2) + if (nparity == 0 || nparity > 3) return (EINVAL); /* - * Older versions can only support 1 parity device. + * Previous versions could only support 1 or 2 parity + * device. */ - if (nparity == 2 && - spa_version(spa) < SPA_VERSION_RAID6) + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (ENOTSUP); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) return (ENOTSUP); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= SPA_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (EINVAL); /* * Otherwise, we default to 1 parity device for RAID-Z. 
@@ -993,6 +997,32 @@ vdev_probe(vdev_t *vd, zio_t *zio) return (NULL); } +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != NULL); + + taskq_destroy(tq); +} + /* * Prepare a virtual device for access. */ @@ -1005,8 +1035,8 @@ vdev_open(vdev_t *vd) uint64_t asize, psize; uint64_t ashift = 0; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); @@ -1217,7 +1247,12 @@ vdev_validate(vdev_t *vd) nvlist_free(label); - if (spa->spa_load_state == SPA_LOAD_OPEN && + /* + * If spa->spa_load_verbatim is true, no need to check the + * state of the pool. + */ + if (!spa->spa_load_verbatim && + spa->spa_load_state == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 48d5fc232..06cb72012 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -246,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * into a crufty old storage pool. 
*/ ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity == 2 && - spa_version(spa) >= SPA_VERSION_RAID6)); + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add the nparity tag even on storage pools @@ -642,8 +644,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; @@ -672,11 +674,9 @@ retry: offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); - } + vdev_label_write(zio, vd, l, ub, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); @@ -688,7 +688,7 @@ retry: nvlist_free(label); zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); + zio_buf_free(ub, VDEV_UBERBLOCK_RING); zio_buf_free(vp, sizeof (vdev_phys_t)); /* diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index fff7e0842..836386d42 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -124,21 +124,21 @@ vdev_mirror_map_alloc(zio_t *zio) static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - uint64_t c; int numerrors = 0; - int ret, lasterror = 0; + int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_open_children(vd); - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if 
(cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -158,9 +158,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5e57a1513..9867d0970 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ #include <sys/zfs_context.h> -#include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/avl.h> @@ -48,11 +48,14 @@ int zfs_vdev_time_shift = 6; int zfs_vdev_ramp_rate = 2; /* - * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. - * For read i/os, we also aggregate across small adjacency gaps. + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; /* * Virtual device vector for disk I/O scheduling. 
@@ -172,12 +175,14 @@ vdev_queue_agg_io_done(zio_t *aio) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio, *nio; + zio_t *fio, *lio, *aio, *dio, *nio, *mio; avl_tree_t *t; int flags; uint64_t maxspan = zfs_vdev_aggregation_limit; uint64_t maxgap; + int stretch; +again: ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || @@ -192,21 +197,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* - * We can aggregate I/Os that are adjacent and of the - * same flavor, as expressed by the AGG_INHERIT flags. - * The latter is necessary so that certain attributes - * of the I/O, such as whether it's a normal I/O or a - * scrub/resilver, can be preserved in the aggregate. + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. + */ + mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-optional I/O. */ while ((dio = AVL_PREV(t, fio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) + IO_SPAN(dio, lio) <= maxspan && + IO_GAP(dio, fio) <= maxgap) { fio = dio; + if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = fio; + } + /* + * Skip any initial optional I/Os.
+ */ + while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { + fio = AVL_NEXT(t, fio); + ASSERT(fio != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + */ while ((dio = AVL_NEXT(t, lio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) + IO_SPAN(fio, dio) <= maxspan && + IO_GAP(lio, dio) <= maxgap) { lio = dio; + if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = lio; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (t != &vq->vq_read_tree && mio != NULL) { + nio = lio; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } + + if (stretch) { + /* This may be a no-op. 
*/ + VERIFY((dio = AVL_NEXT(t, lio)) != NULL); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (lio != mio && lio != fio) { + ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); + lio = AVL_PREV(t, lio); + ASSERT(lio != NULL); + } + } } if (fio != lio) { @@ -225,10 +297,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_vdev_tree == t); - if (dio->io_type == ZIO_TYPE_WRITE) + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT(dio->io_type == ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); + } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -244,6 +321,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (fio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(fio); + zio_execute(fio); + mutex_enter(&vq->vq_lock); + goto again; + } + avl_add(&vq->vq_pending_tree, fio); return (fio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 92753d871..b3074173e 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,12 +35,27 @@ /* * Virtual device vector for RAID-Z. * - * This vdev supports both single and double parity. For single parity, we - * use a simple XOR of all the data columns. For double parity, we use both - * the simple XOR as well as a technique described in "The mathematics of - * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), - * over the integers expressable in a single byte. 
Briefly, the operations on - * the field are defined as follows: + * This vdev supports single, double, and triple parity. For single parity, + * we use a simple XOR of all the data columns. For double or triple parity, + * we use a special case of Reed-Solomon coding. This extends the + * technique described in "The mathematics of RAID-6" by H. Peter Anvin by + * drawing on the system described in "A Tutorial on Reed-Solomon Coding for + * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the + * former is also based. The latter is designed to provide higher performance + * for writes. + * + * Note that the Plank paper claimed to support arbitrary N+M, but was then + * amended six years later identifying a critical flaw that invalidates its + * claims. Nevertheless, the technique can be adapted to work for up to + * triple parity. For additional parity, the amendment "Note: Correction to + * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding + * is viable, but the additional complexity means that write performance will + * suffer. + * + * All of the methods above operate on a Galois field, defined over the + * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements + * can be expressed with a single byte. Briefly, the operations on the + * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B @@ -55,22 +70,32 @@ * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * As an aside, this multiplication is derived from the error correcting + * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field.
We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is A^254. + * than field addition). The inverse of a field element A (A^-1) is therefore + * A ^ (255 - 1) = A^254. * - * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, - * can be expressed by field operations: + * The up-to-three parity columns, P, Q, R over several data columns, + * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 + * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * See the reconstruction code below for how P and Q can used individually or - * in concert to recover missing data columns. + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial + * XOR operation, and 2 and 4 can be computed quickly and generate linearly- + * independent coefficients. (There are no additional coefficients that have + * this property which is why the uncorrected Plank method breaks down.) + * + * See the reconstruction code below for how P, Q and R can be used individually + * or in concert to recover missing data columns.
*/ typedef struct raidz_col { @@ -84,21 +109,49 @@ typedef struct raidz_col { } raidz_col_t; typedef struct raidz_map { - uint64_t rm_cols; /* Column count */ + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_skipped; /* Skipped sectors for padding */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1d); \ +} -#define VDEV_RAIDZ_MAXPARITY 2 +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} -#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) +/* + * Force reconstruction to use the general purpose method. 
+ */ +int vdev_raidz_default_to_general; /* * These two tables represent powers and logs of 2 in the Galois field defined @@ -201,7 +254,7 @@ vdev_raidz_map_free(zio_t *zio) for (c = 0; c < rm->rm_firstdatacol; c++) zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } static raidz_map_t * @@ -213,24 +266,35 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, coff, devidx; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + if (q == 0) { + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } - acols = (q == 0 ? 
bc : dcols); + ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; + rm->rm_scols = scols; rm->rm_bigcols = bc; - rm->rm_asize = 0; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; - for (c = 0; c < acols; c++) { + asize = 0; + + for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { @@ -239,15 +303,26 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; rm->rm_col[c].rc_data = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; - rm->rm_asize += rm->rm_col[c].rc_size; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; } - rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_skipped = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift); + ASSERT3U(rm->rm_skipped, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); @@ -305,12 +380,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } @@ -320,10 +395,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *q, *p, 
*src, pcount, ccount, mask, i; + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); @@ -331,55 +406,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount || ccount == 0); - for (i = 0; i < ccount; i++, p++, q++, src++) { - *q = *src; + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; + *q = *src; } - for (; i < pcount; i++, p++, q++, src++) { - *q = 0; + for (; i < pcnt; i++, src++, p++, q++) { *p = 0; + *q = 0; } } else { - ASSERT(ccount <= pcount); + ASSERT(ccnt <= pcnt); /* - * Rather than multiplying each byte individually (as - * described above), we are able to handle 8 at once - * by generating a mask based on the high bit in each - * byte and using that to conditionally XOR in 0x1d. + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. */ - for (i = 0; i < ccount; i++, p++, q++, src++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; } /* * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. */ - for (; i < pcount; i++, q++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); } } } } +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. 
+ */ static void -vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, i; + int x = tgts[0]; int c; + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(xcount > 0); @@ -404,15 +562,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) *dst ^= *src; } } + + return (1 << VDEV_RAIDZ_P); } -static void -vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, mask, i; uint8_t *b; + int x = tgts[0]; int c, j, exp; + ASSERT(ntgts == 1); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); @@ -436,23 +599,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) } } else { - /* - * For an explanation of this, see the comment in - * vdev_raidz_generate_parity_pq() above. 
- */ for (i = 0; i < count; i++, dst++, src++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } for (; i < xcount; i++, dst++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); } } } @@ -467,15 +620,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) *b = vdev_raidz_exp2(*b, exp); } } + + return (1 << VDEV_RAIDZ_Q); } -static void -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; void *pdata, *qdata; uint64_t xsize, ysize, i; + int x = tgts[0]; + int y = tgts[1]; + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); @@ -553,15 +711,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) */ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coeffecients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a vandermonde + * matrix defined by the coeffecients we chose for the various parity columns + * (1, 2, 4). 
Note that these values were chosen both for simplicity, speedy + * computation as well as linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. 
+ * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 
0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. + * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. 
+ */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT3U(rows[i][j], ==, 0); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data that is left in the rows are properly part of + * an identity matrix. 
+ */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT3U(rows[i][j], ==, 0); + } + } + } } +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log, val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = rm->rm_col[c].rc_data; + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = rm->rm_col[cc].rc_data; + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t 
*rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + int code = 0; + + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. + */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. + */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. 
+ */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + return (code); +} + +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* + * See if we can use any of our optimized reconstruction routines. 
+ */ + if (!vdev_raidz_default_to_general) { + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -573,11 +1270,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } + vdev_open_children(vd); + for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -639,7 +1338,7 @@ vdev_raidz_io_start(zio_t *zio) blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; - int c; + int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -647,13 +1346,7 @@ vdev_raidz_io_start(zio_t *zio) ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * Generate RAID parity in the first virtual columns. 
- */ - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -664,6 +1357,23 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. + */ + for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + return (ZIO_PIPELINE_CONTINUE); } @@ -671,8 +1381,7 @@ vdev_raidz_io_start(zio_t *zio) /* * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity - * data. + * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; @@ -748,10 +1457,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) bcopy(rc->rc_data, orig[c], rc->rc_size); } - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; @@ -768,9 +1474,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -static uint64_t raidz_corrected_p; -static uint64_t raidz_corrected_q; -static uint64_t raidz_corrected_pq; +/* + * Keep statistics on all the ways that we used parity to correct data. 
+ */ +static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int vdev_raidz_worst_error(raidz_map_t *rm) @@ -783,19 +1490,176 @@ vdev_raidz_worst_error(raidz_map_t *rm) return (error); } +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. 
+ */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + bcopy(rc->rc_data, orig[i], rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (zio_checksum_error(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + bcopy(orig[i], rc->rc_data, rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position.. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. 
+ */ + for (c = tgts[current - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[current] = c; + current++; + + } while (current != n); + } + } + n--; +done: + for (i = 0; i < n; i++) { + zio_buf_free(orig[i], rm->rm_col[0].rc_size); + } + + return (ret); +} + static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc, *rc1; + raidz_col_t *rc; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c, c1; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ @@ -859,8 +1723,7 @@ vdev_raidz_io_done(zio_t *zio) * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { - switch (data_errors) { - case 0: + if (data_errors == 0) { if (zio_checksum_error(zio) == 0) { /* * If we read parity information (unnecessarily @@ -880,9 +1743,7 @@ vdev_raidz_io_done(zio_t *zio) } goto done; } - break; - - case 1: + } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we @@ -894,45 +1755,38 @@ vdev_raidz_io_done(zio_t *zio) ASSERT(parity_errors < rm->rm_firstdatacol); /* - * Find the column that reported the error. + * Identify the data columns that reported an error. 
*/ + n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - vdev_raidz_reconstruct_p(rm, c); - } else { - ASSERT(rm->rm_firstdatacol > 1); - vdev_raidz_reconstruct_q(rm, c); - } + ASSERT(rm->rm_firstdatacol >= n); + + code = vdev_raidz_reconstruct(rm, tgts, n); if (zio_checksum_error(zio) == 0) { - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) - atomic_inc_64(&raidz_corrected_p); - else - atomic_inc_64(&raidz_corrected_q); + atomic_inc_64(&raidz_corrected[code]); /* - * If there's more than one parity disk that - * was successfully read, confirm that the - * other parity disk produced the correct data. - * This routine is suboptimal in that it - * regenerates both the parity we wish to test - * as well as the parity we just used to - * perform the reconstruction, but this should - * be a relatively uncommon case, and can be - * optimized if it becomes a problem. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. 
*/ - if (parity_errors < rm->rm_firstdatacol - 1 || + if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; @@ -942,46 +1796,6 @@ vdev_raidz_io_done(zio_t *zio) goto done; } - break; - - case 2: - /* - * Two data column errors require double parity. - */ - ASSERT(rm->rm_firstdatacol == 2); - - /* - * Find the two columns that reported errors. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - for (c1 = c++; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - vdev_raidz_reconstruct_pq(rm, c1, c); - - if (zio_checksum_error(zio) == 0) { - atomic_inc_64(&raidz_corrected_pq); - goto done; - } - break; - - default: - ASSERT(rm->rm_firstdatacol <= 2); - ASSERT(0); } } @@ -1020,8 +1834,10 @@ vdev_raidz_io_done(zio_t *zio) * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. Before we attempt combinatorial reconstruction make - * sure we have a chance of coming up with the right answer. + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. 
*/ if (total_errors >= rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); @@ -1032,133 +1848,30 @@ vdev_raidz_io_done(zio_t *zio) */ if (total_errors == rm->rm_firstdatacol) zio->io_error = zio_worst_error(zio->io_error, ECKSUM); - goto done; - } - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { + } else if ((code = vdev_raidz_combrec(zio, total_errors, + data_errors)) != 0) { /* - * Attempt to reconstruct the data from parity P. + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_p(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_p); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { /* - * Attempt to reconstruct the data from parity Q. + * All combinations failed to checksum. Generate checksum + * ereports for all children. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_q(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_q); - - /* - * If this child didn't know that it returned - * bad data, inform it. 
- */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } + zio->io_error = ECKSUM; - if (rm->rm_firstdatacol > 1 && - rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && - rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { - /* - * Attempt to reconstruct the data from both P and Q. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { - void *orig, *orig1; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - - for (c1 = c + 1; c1 < rm->rm_cols; c1++) { - rc1 = &rm->rm_col[c1]; - - orig1 = zio_buf_alloc(rc1->rc_size); - bcopy(rc1->rc_data, orig1, rc1->rc_size); - - vdev_raidz_reconstruct_pq(rm, c, c1); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio_buf_free(orig1, rc1->rc_size); - atomic_inc_64(&raidz_corrected_pq); - - /* - * If these children didn't know they - * returned bad data, inform them. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - if (rc1->rc_tried && rc1->rc_error == 0) - raidz_checksum_error(zio, rc1); - - rc->rc_error = ECKSUM; - rc1->rc_error = ECKSUM; - - goto done; - } - - bcopy(orig1, rc1->rc_data, rc1->rc_size); - zio_buf_free(orig1, rc1->rc_size); + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size); } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - /* - * All combinations failed to checksum. Generate checksum ereports for - * all children. 
- */ - zio->io_error = ECKSUM; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, - rc->rc_offset, rc->rc_size); } } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 88383f002..524c8e606 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - int c; int lasterror = 0; int numerrors = 0; @@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - int error; - if ((error = vdev_open(cvd)) != 0 && - !cvd->vdev_islog) { - lasterror = error; + if (cvd->vdev_open_error && !cvd->vdev_islog) { + lasterror = cvd->vdev_open_error; numerrors++; - continue; } } @@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_root_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index fbc93b423..528d31d5e 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1068,7 +1068,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, - uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift) + uint64_t *towrite, uint64_t *tooverwrite) { zap_t *zap; int err = 0; @@ 
-1113,28 +1113,28 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; } } else { - if (!add) { - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += SPA_MAXBLOCKSIZE; - else - *towrite += SPA_MAXBLOCKSIZE; - } else { - /* - * We are here if we are adding and (name != NULL). - * It is hard to find out if this add will promote this - * microzap to fatzap. Hence, we assume the worst case - * and account for the blocks assuming this microzap - * would be promoted to a fatzap. - * - * 1 block overwritten : header block - * 4 new blocks written : 2 new split leaf, 2 grown - * ptrtbl blocks - */ - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += 1 << dn_datablkshift; - else - *towrite += 1 << dn_datablkshift; - *towrite += 4 << dn_datablkshift; + /* + * We are here if (name != NULL) and this is a micro-zap. + * We account for the header block depending on whether it + * is freeable. + * + * Incase of an add-operation it is hard to find out + * if this add will promote this microzap to fatzap. + * Hence, we consider the worst case and account for the + * blocks assuming this microzap would be promoted to a + * fatzap. 
+ * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + + if (add) { + *towrite += 4 * SPA_MAXBLOCKSIZE; } } diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index 734bd8395..12ffe9f30 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -93,6 +93,8 @@ #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + static uint16_t zfs_ace_v0_get_type(void *acep) { @@ -781,6 +783,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) uint64_t who; uint16_t iflags, type; uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); @@ -905,8 +908,32 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) } } } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; } } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + else + zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + return (mode); } @@ -946,7 +973,8 @@ zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) } /* - * Read an external acl object. + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
*/ static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) @@ -960,8 +988,15 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { *aclpp = zfs_acl_node_read_internal(zp, will_modify); + if (!will_modify) + zp->z_acl_cached = *aclpp; return (0); } @@ -995,6 +1030,8 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) } *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; return (0); } @@ -1019,11 +1056,16 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(zp->z_dbuf, tx); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + zphys->zp_mode = zfs_mode_compute(zp, aclp); /* - * Decide which opbject type to use. If we are forced to - * use old ACL format than transform ACL into zfs_oldace_t + * Decide which object type to use. If we are forced to + * use old ACL format then transform ACL into zfs_oldace_t * layout. 
*/ if (!zfsvfs->z_use_fuids) { @@ -1869,7 +1911,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, mutex_exit(&dzp->z_acl_lock); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); - zfs_acl_free(paclp); } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); @@ -1998,8 +2039,6 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); - return (0); } @@ -2095,11 +2134,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); } top: - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { - zfs_acl_free(aclp); - return (error); - } - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); @@ -2145,6 +2179,7 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); @@ -2154,7 +2189,6 @@ top: if (fuidp) zfs_fuid_info_free(fuidp); - zfs_acl_free(aclp); dmu_tx_commit(tx); done: mutex_exit(&zp->z_acl_lock); @@ -2301,7 +2335,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, checkit = B_TRUE; break; } else { - zfs_acl_free(aclp); mutex_exit(&zp->z_acl_lock); return (EIO); } @@ -2321,7 +2354,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, uint32_t, mask_matched); if (anyaccess) { mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); return (0); } } @@ -2334,7 +2366,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, } mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); /* Put the found 'denies' back on the working mode */ if (deny_mask) { @@ -2366,8 +2397,7 @@ zfs_has_access(znode_t *zp, cred_t *cr) secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || - secpolicy_vnode_chown(cr, B_TRUE) 
== 0 || - secpolicy_vnode_chown(cr, B_FALSE) == 0 || + secpolicy_vnode_chown(cr, owner) == 0 || secpolicy_vnode_setdac(cr, owner) == 0 || secpolicy_vnode_remove(cr) == 0); } @@ -2421,6 +2451,78 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, check_privs, B_FALSE, cr)); } +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t fowner; + uid_t gowner; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + + is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + goto slow; + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || + FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + fowner = (uid_t)zdp->z_phys->zp_uid; + gowner = (uid_t)zdp->z_phys->zp_gid; + + if (uid == fowner) { + owner = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(gowner, cr)) { + groupmbr = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_phys->zp_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + /* * Determine whether Access should be granted/denied, invoking least * priv subsytem when a deny is determined. 
@@ -2515,7 +2617,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) owner, checkmode); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, B_TRUE); + error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(cr, owner); @@ -2524,7 +2626,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, B_FALSE); + error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 27c2c51a3..c6c719871 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -700,7 +700,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, if (err) avl_add(&sdp->sd_snaps, sep); else - err = dmu_objset_destroy(snapname); + err = dmu_objset_destroy(snapname, B_FALSE); } else { err = ENOENT; } diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 8e481dffb..e704b1ca9 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -353,6 +353,7 @@ retry: rw_exit(&zfsvfs->z_fuid_lock); return (retidx); } else { + rw_exit(&zfsvfs->z_fuid_lock); return (-1); } } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 07dd03c35..9cb40816d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -761,6 +761,20 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr)); } +static int +zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_HOLD, cr)); +} + +static int +zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RELEASE, cr)); +} + /* * Returns the nvlist as specified by the user in 
the zfs_cmd_t. */ @@ -2466,7 +2480,7 @@ zfs_ioc_create(zfs_cmd_t *zc) */ if (error == 0) { if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) - (void) dmu_objset_destroy(zc->zc_name); + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); } nvlist_free(nvprops); return (error); @@ -2553,8 +2567,9 @@ zfs_unmount_snap(char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2569,13 +2584,15 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); if (err) return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, + zc->zc_defer_destroy)); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2588,7 +2605,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc) return (err); } - return (dmu_objset_destroy(zc->zc_name)); + return (dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy)); } /* @@ -2708,7 +2725,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) file_t *fp; objset_t *os; dmu_recv_cookie_t drc; - zfsvfs_t *zfsvfs = NULL; boolean_t force = (boolean_t)zc->zc_guid; int error, fd; offset_t off; @@ -2740,25 +2756,12 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } - if (getzfsvfs(tofs, &zfsvfs) == 0) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - VFS_RELE(zfsvfs->z_vfs); - zfsvfs = NULL; - error = EBUSY; - goto out; - } + if (props && dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { /* * If new properties are supplied, they are to completely * replace the existing ones, so stash away the existing ones. 
*/ - if (props) - (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE); - } else if (props && dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Get the props even if there was no zfsvfs (zvol or - * unmounted zpl). - */ (void) dsl_prop_get_all(os, &origprops, TRUE); dmu_objset_close(os); @@ -2772,7 +2775,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) } error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, - force, origin, zfsvfs != NULL, &drc); + force, origin, &drc); if (origin) dmu_objset_close(origin); if (error) @@ -2793,25 +2796,33 @@ zfs_ioc_recv(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_recv_stream(&drc, fp->f_vnode, &off); - if (error == 0 && zfsvfs) { - char *osname; - int mode; + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; - /* online recv */ - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + int end_err; + char *osname; + int mode; - error = dmu_recv_end(&drc); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); - error = error ? error : resume_err; + osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + error = zfs_suspend_fs(zfsvfs, osname, &mode); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc); + if (error == 0) { + int resume_err = + zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } + error = error ? 
error : end_err; + VFS_RELE(zfsvfs->z_vfs); + kmem_free(osname, MAXNAMELEN); } else { - dmu_recv_abort_cleanup(&drc); + error = dmu_recv_end(&drc); } - kmem_free(osname, MAXNAMELEN); - } else if (error == 0) { - error = dmu_recv_end(&drc); } zc->zc_cookie = off - fp->f_offset; @@ -2826,10 +2837,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) (void) zfs_set_prop_nvlist(tofs, origprops); } out: - if (zfsvfs) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } nvlist_free(props); nvlist_free(origprops); releasef(fd); @@ -3432,6 +3439,69 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) } /* + * inputs: + * zc_name name of filesystem + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_hold(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of dataset from which we're releasing a user reference + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_release(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of snapshot holds + */ +static int +zfs_ioc_get_holds(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and 
zfs_ioc_pool_export * do the logging of those commands. @@ -3511,8 +3581,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_TRUE }, { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE, - B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE, + B_TRUE }, { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_TRUE }, { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, @@ -3534,6 +3604,11 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { DATASET_NAME, B_FALSE, B_FALSE }, { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, DATASET_NAME, B_FALSE, B_TRUE }, + { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE } }; int diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 8a859b575..d03f92ba0 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -935,7 +935,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) goto out; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); @@ -1051,7 +1050,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs) zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 88d4e52c1..8eb4665ae 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -208,6 +208,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, znode_t *zp = VTOZ(vp); 
zfsvfs_t *zfsvfs = zp->z_zfsvfs; + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -215,12 +221,6 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); - /* - * Clean up any locks held by this process on the vp. - */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && @@ -855,6 +855,10 @@ zfs_get_done(dmu_buf_t *db, void *vzgd) kmem_free(zgd, sizeof (zgd_t)); } +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + /* * Get data to generate a TX_WRITE intent log record. */ @@ -936,7 +940,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd->zgd_rl = rl; zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_bp = &lr->lr_blkptr; - VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); +#ifdef DEBUG + if (zil_fault_io) { + error = EIO; + zil_fault_io = 0; + } else { + error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db); + } +#else + error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db); +#endif + if (error != 0) { + kmem_free(zgd, sizeof (zgd_t)); + goto out; + } + ASSERT(boff == db->db_offset); lr->lr_blkoff = off - boff; error = dmu_sync(zio, db, &lr->lr_blkptr, @@ -988,6 +1006,27 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, } /* + * If vnode is for a device return a specfs vnode instead. + */ +static int +specvp_check(vnode_t **vpp, cred_t *cr) +{ + int error = 0; + + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + *vpp = svp; + } + return (error); +} + + +/* * Lookup an entry in a directory, or an extended attribute directory. 
* If it exists, return a held vnode reference for it. * @@ -1017,7 +1056,46 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; + int error = 0; + + /* fast path */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (dvp->v_type != VDIR) { + return (ENOTDIR); + } else if (zdp->z_dbuf == NULL) { + return (EIO); + } + + if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *vpp = dvp; + VN_HOLD(*vpp); + return (0); + } + return (error); + } else { + vnode_t *tvp = dnlc_lookup(dvp, nm); + + if (tvp) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (error) { + VN_RELE(tvp); + return (error); + } + if (tvp == DNLC_NO_VNODE) { + VN_RELE(tvp); + return (ENOENT); + } else { + *vpp = tvp; + return (specvp_check(vpp, cr)); + } + } + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); @@ -1082,21 +1160,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, } error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) { - /* - * Convert device special files - */ - if (IS_DEVVP(*vpp)) { - vnode_t *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = ENOSYS; - else - *vpp = svp; - } - } + if (error == 0) + error = specvp_check(vpp, cr); ZFS_EXIT(zfsvfs); return (error); @@ -1235,6 +1300,7 @@ top: &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); error = EDQUOT; goto out; } @@ -1332,19 +1398,7 @@ out: VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); - /* - * If vnode is for a device return a specfs vnode instead. 
- */ - if (IS_DEVVP(*vpp)) { - struct vnode *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) { - error = ENOSYS; - } - *vpp = svp; - } + error = specvp_check(vpp, cr); } ZFS_EXIT(zfsvfs); @@ -1653,6 +1707,7 @@ top: return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (EDQUOT); @@ -2456,6 +2511,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, top: attrzp = NULL; + /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); @@ -2765,6 +2821,8 @@ top: zp->z_phys->zp_mode = new_mode; err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); + zp->z_acl_cached = aclp; + aclp = NULL; mutex_exit(&zp->z_acl_lock); } @@ -2856,10 +2914,8 @@ out: if (attrzp) VN_RELE(ZTOV(attrzp)); - if (aclp) { + if (aclp) zfs_acl_free(aclp); - aclp = NULL; - } if (fuidp) { zfs_fuid_info_free(fuidp); @@ -3724,8 +3780,8 @@ top: if (err == 0) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); - dmu_tx_commit(tx); } + dmu_tx_commit(tx); out: pvn_write_done(pp, (err ? 
B_ERROR : 0) | flags); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 8ced95174..f99e72f1d 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -133,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dbuf = NULL; zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; return (0); } @@ -155,6 +156,7 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT(zp->z_dbuf == NULL); ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); } #ifdef ZNODE_STATS @@ -199,6 +201,18 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_phys = ozp->z_phys; nzp->z_dbuf = ozp->z_dbuf; + /* + * Release any cached ACL, since it *may* have + * zfs_acl_node_t's that directly references an + * embedded ACL in the zp_acl of the old znode_phys_t + * + * It will be recached the next time the ACL is needed. + */ + if (ozp->z_acl_cached) { + zfs_acl_free(ozp->z_acl_cached); + ozp->z_acl_cached = NULL; + } + /* Update back pointers. */ (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, znode_evict_error); @@ -1081,6 +1095,11 @@ zfs_znode_free(znode_t *zp) list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + kmem_cache_free(znode_cache, zp); VFS_RELE(zfsvfs->z_vfs); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 53d9d9bf7..db3822f5a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -714,14 +714,15 @@ zil_lwb_write_done(zio_t *zio) lwb->lwb_buf = NULL; if (zio->io_error) zilog->zl_log_error = B_TRUE; - mutex_exit(&zilog->zl_lock); /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * which we allocated the next block sync. We still have the + * zl_lock to ensure zil_sync doesn't kmem free the lwb. 
*/ txg_rele_to_sync(&lwb->lwb_txgh); + mutex_exit(&zilog->zl_lock); } /* @@ -925,6 +926,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } error = zilog->zl_get_data( itx->itx_private, lr, dbuf, lwb->lwb_zio); + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } if (error) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); |